data-designer-config 0.4.0rc2__tar.gz → 0.5.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/PKG-INFO +1 -1
  2. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/pyproject.toml +11 -6
  3. data_designer_config-0.5.0rc1/src/data_designer/config/__init__.py +226 -0
  4. data_designer_config-0.5.0rc1/src/data_designer/config/base.py +67 -0
  5. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/column_configs.py +149 -57
  6. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/column_types.py +5 -1
  7. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/config_builder.py +103 -3
  8. data_designer_config-0.5.0rc1/src/data_designer/config/custom_column.py +64 -0
  9. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/data_designer_config.py +5 -1
  10. data_designer_config-0.4.0rc2/src/data_designer/config/base.py → data_designer_config-0.5.0rc1/src/data_designer/config/exportable_config.py +1 -11
  11. data_designer_config-0.5.0rc1/src/data_designer/config/mcp.py +109 -0
  12. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/run_config.py +1 -1
  13. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/code_lang.py +13 -2
  14. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/constants.py +10 -1
  15. data_designer_config-0.5.0rc1/src/data_designer/config/utils/trace_type.py +24 -0
  16. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/visualization.py +6 -0
  17. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/logging.py +15 -0
  18. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_columns.py +67 -1
  19. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_config_builder.py +141 -0
  20. data_designer_config-0.5.0rc1/tests/config/test_mcp.py +53 -0
  21. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_code_lang.py +1 -1
  22. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/conftest.py +0 -1
  23. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/test_logging.py +51 -0
  24. data_designer_config-0.4.0rc2/src/data_designer/config/__init__.py +0 -149
  25. data_designer_config-0.4.0rc2/src/data_designer/config/_version.py +0 -34
  26. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/.gitignore +0 -0
  27. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/README.md +0 -0
  28. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/__init__.py +0 -0
  29. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/column_profilers.py +0 -0
  30. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/column_statistics.py +0 -0
  31. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/dataset_profiler.py +0 -0
  32. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/utils/errors.py +0 -0
  33. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/analysis/utils/reporting.py +0 -0
  34. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/dataset_builders.py +0 -0
  35. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/dataset_metadata.py +0 -0
  36. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/default_model_settings.py +0 -0
  37. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/errors.py +0 -0
  38. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/interface.py +0 -0
  39. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/models.py +0 -0
  40. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/preview_results.py +0 -0
  41. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/processors.py +0 -0
  42. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/sampler_constraints.py +0 -0
  43. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/sampler_params.py +0 -0
  44. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed.py +0 -0
  45. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed_source.py +0 -0
  46. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/seed_source_types.py +0 -0
  47. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/testing/__init__.py +0 -0
  48. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/testing/fixtures.py +0 -0
  49. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/errors.py +0 -0
  50. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/info.py +0 -0
  51. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/io_helpers.py +0 -0
  52. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/misc.py +0 -0
  53. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/numerical_helpers.py +0 -0
  54. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/utils/type_helpers.py +0 -0
  55. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/config/validator_params.py +0 -0
  56. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/errors.py +0 -0
  57. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/lazy_heavy_imports.py +0 -0
  58. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugin_manager.py +0 -0
  59. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/__init__.py +0 -0
  60. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/errors.py +0 -0
  61. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/plugin.py +0 -0
  62. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/src/data_designer/plugins/registry.py +0 -0
  63. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/conftest.py +0 -0
  64. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/test_column_statistics.py +0 -0
  65. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/test_dataset_profiler_results.py +0 -0
  66. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/analysis/utils/test_reporting.py +0 -0
  67. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_data_designer_config.py +0 -0
  68. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_default_model_settings.py +0 -0
  69. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_models.py +0 -0
  70. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_processors.py +0 -0
  71. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_sampler_constraints.py +0 -0
  72. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_sampler_params.py +0 -0
  73. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_seed.py +0 -0
  74. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_seed_source.py +0 -0
  75. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/test_validator_params.py +0 -0
  76. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/__init__.py +0 -0
  77. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_info.py +0 -0
  78. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_io_helpers.py +0 -0
  79. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_misc.py +0 -0
  80. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_type_helpers.py +0 -0
  81. {data_designer_config-0.4.0rc2 → data_designer_config-0.5.0rc1}/tests/config/utils/test_visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer-config
3
- Version: 0.4.0rc2
3
+ Version: 0.5.0rc1
4
4
  Summary: Configuration layer for DataDesigner synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  Classifier: Development Status :: 4 - Beta
@@ -31,16 +31,16 @@ dependencies = [
31
31
  ]
32
32
 
33
33
  [build-system]
34
- requires = ["hatchling", "hatch-vcs"]
34
+ requires = ["hatchling", "uv-dynamic-versioning>=0.7.0"]
35
35
  build-backend = "hatchling.build"
36
36
 
37
37
  [tool.hatch.version]
38
- source = "vcs"
39
- fallback-version = "0.1.0.dev0"
40
- raw-options = { root = "../.." }
38
+ source = "uv-dynamic-versioning"
41
39
 
42
- [tool.hatch.build.hooks.vcs]
43
- version-file = "src/data_designer/config/_version.py"
40
+ [tool.uv-dynamic-versioning]
41
+ vcs = "git"
42
+ style = "pep440"
43
+ bump = true
44
44
 
45
45
  [tool.hatch.build.targets.wheel]
46
46
  packages = ["src/data_designer"]
@@ -48,5 +48,10 @@ packages = ["src/data_designer"]
48
48
  [tool.ruff]
49
49
  extend = "../../pyproject.toml"
50
50
 
51
+ [tool.pytest.ini_options]
52
+ testpaths = ["tests"]
53
+ asyncio_default_fixture_loop_scope = "session"
54
+ env = ["DISABLE_DATA_DESIGNER_PLUGINS=true"]
55
+
51
56
  [tool.uv]
52
57
  package = true
@@ -0,0 +1,226 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import importlib
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ # These imports are for IDE autocomplete and type checking only.
11
+ # At runtime, __getattr__ lazily loads the actual objects.
12
+ from data_designer.config.analysis.column_profilers import ( # noqa: F401
13
+ JudgeScoreProfilerConfig,
14
+ )
15
+ from data_designer.config.column_configs import ( # noqa: F401
16
+ CustomColumnConfig,
17
+ EmbeddingColumnConfig,
18
+ ExpressionColumnConfig,
19
+ GenerationStrategy,
20
+ LLMCodeColumnConfig,
21
+ LLMJudgeColumnConfig,
22
+ LLMStructuredColumnConfig,
23
+ LLMTextColumnConfig,
24
+ SamplerColumnConfig,
25
+ Score,
26
+ SeedDatasetColumnConfig,
27
+ ValidationColumnConfig,
28
+ )
29
+ from data_designer.config.column_types import DataDesignerColumnType # noqa: F401
30
+ from data_designer.config.config_builder import DataDesignerConfigBuilder # noqa: F401
31
+ from data_designer.config.custom_column import custom_column_generator # noqa: F401
32
+ from data_designer.config.data_designer_config import DataDesignerConfig # noqa: F401
33
+ from data_designer.config.dataset_builders import BuildStage # noqa: F401
34
+ from data_designer.config.mcp import ( # noqa: F401
35
+ LocalStdioMCPProvider,
36
+ MCPProvider,
37
+ ToolConfig,
38
+ )
39
+ from data_designer.config.models import ( # noqa: F401
40
+ ChatCompletionInferenceParams,
41
+ EmbeddingInferenceParams,
42
+ GenerationType,
43
+ ImageContext,
44
+ ImageFormat,
45
+ ManualDistribution,
46
+ ManualDistributionParams,
47
+ Modality,
48
+ ModalityContext,
49
+ ModalityDataType,
50
+ ModelConfig,
51
+ ModelProvider,
52
+ UniformDistribution,
53
+ UniformDistributionParams,
54
+ )
55
+ from data_designer.config.processors import ( # noqa: F401
56
+ DropColumnsProcessorConfig,
57
+ ProcessorType,
58
+ SchemaTransformProcessorConfig,
59
+ )
60
+ from data_designer.config.run_config import RunConfig # noqa: F401
61
+ from data_designer.config.sampler_constraints import ( # noqa: F401
62
+ ColumnInequalityConstraint,
63
+ ScalarInequalityConstraint,
64
+ )
65
+ from data_designer.config.sampler_params import ( # noqa: F401
66
+ BernoulliMixtureSamplerParams,
67
+ BernoulliSamplerParams,
68
+ BinomialSamplerParams,
69
+ CategorySamplerParams,
70
+ DatetimeSamplerParams,
71
+ GaussianSamplerParams,
72
+ PersonFromFakerSamplerParams,
73
+ PersonSamplerParams,
74
+ PoissonSamplerParams,
75
+ SamplerType,
76
+ ScipySamplerParams,
77
+ SubcategorySamplerParams,
78
+ TimeDeltaSamplerParams,
79
+ UniformSamplerParams,
80
+ UUIDSamplerParams,
81
+ )
82
+ from data_designer.config.seed import ( # noqa: F401
83
+ IndexRange,
84
+ PartitionBlock,
85
+ SamplingStrategy,
86
+ SeedConfig,
87
+ )
88
+ from data_designer.config.seed_source import ( # noqa: F401
89
+ DataFrameSeedSource,
90
+ HuggingFaceSeedSource,
91
+ LocalFileSeedSource,
92
+ )
93
+ from data_designer.config.utils.code_lang import CodeLang # noqa: F401
94
+ from data_designer.config.utils.info import InfoType # noqa: F401
95
+ from data_designer.config.utils.trace_type import TraceType # noqa: F401
96
+ from data_designer.config.validator_params import ( # noqa: F401
97
+ CodeValidatorParams,
98
+ LocalCallableValidatorParams,
99
+ RemoteValidatorParams,
100
+ ValidatorType,
101
+ )
102
+
103
+ # Base module path and submodule paths for lazy imports
104
+ _MOD_BASE = "data_designer.config"
105
+ _MOD_COLUMN_CONFIGS = f"{_MOD_BASE}.column_configs"
106
+ _MOD_MCP = f"{_MOD_BASE}.mcp"
107
+ _MOD_MODELS = f"{_MOD_BASE}.models"
108
+ _MOD_PROCESSORS = f"{_MOD_BASE}.processors"
109
+ _MOD_SAMPLER_CONSTRAINTS = f"{_MOD_BASE}.sampler_constraints"
110
+ _MOD_SAMPLER_PARAMS = f"{_MOD_BASE}.sampler_params"
111
+ _MOD_SEED = f"{_MOD_BASE}.seed"
112
+ _MOD_SEED_SOURCE = f"{_MOD_BASE}.seed_source"
113
+ _MOD_VALIDATOR_PARAMS = f"{_MOD_BASE}.validator_params"
114
+ _MOD_UTILS = f"{_MOD_BASE}.utils"
115
+
116
+ # Mapping of export names to (module_path, attribute_name) for lazy loading
117
+ _LAZY_IMPORTS: dict[str, tuple[str, str]] = {
118
+ # analysis.column_profilers
119
+ "JudgeScoreProfilerConfig": (f"{_MOD_BASE}.analysis.column_profilers", "JudgeScoreProfilerConfig"),
120
+ # column_configs
121
+ "CustomColumnConfig": (_MOD_COLUMN_CONFIGS, "CustomColumnConfig"),
122
+ "EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"),
123
+ "ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"),
124
+ "GenerationStrategy": (_MOD_COLUMN_CONFIGS, "GenerationStrategy"),
125
+ "LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"),
126
+ "LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"),
127
+ "LLMStructuredColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMStructuredColumnConfig"),
128
+ "LLMTextColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMTextColumnConfig"),
129
+ "SamplerColumnConfig": (_MOD_COLUMN_CONFIGS, "SamplerColumnConfig"),
130
+ "Score": (_MOD_COLUMN_CONFIGS, "Score"),
131
+ "SeedDatasetColumnConfig": (_MOD_COLUMN_CONFIGS, "SeedDatasetColumnConfig"),
132
+ "ValidationColumnConfig": (_MOD_COLUMN_CONFIGS, "ValidationColumnConfig"),
133
+ # column_types
134
+ "DataDesignerColumnType": (f"{_MOD_BASE}.column_types", "DataDesignerColumnType"),
135
+ # config_builder
136
+ "DataDesignerConfigBuilder": (f"{_MOD_BASE}.config_builder", "DataDesignerConfigBuilder"),
137
+ # custom_column
138
+ "custom_column_generator": (f"{_MOD_BASE}.custom_column", "custom_column_generator"),
139
+ # data_designer_config
140
+ "DataDesignerConfig": (f"{_MOD_BASE}.data_designer_config", "DataDesignerConfig"),
141
+ # dataset_builders
142
+ "BuildStage": (f"{_MOD_BASE}.dataset_builders", "BuildStage"),
143
+ # mcp
144
+ "LocalStdioMCPProvider": (_MOD_MCP, "LocalStdioMCPProvider"),
145
+ "MCPProvider": (_MOD_MCP, "MCPProvider"),
146
+ "ToolConfig": (_MOD_MCP, "ToolConfig"),
147
+ # models
148
+ "ChatCompletionInferenceParams": (_MOD_MODELS, "ChatCompletionInferenceParams"),
149
+ "EmbeddingInferenceParams": (_MOD_MODELS, "EmbeddingInferenceParams"),
150
+ "GenerationType": (_MOD_MODELS, "GenerationType"),
151
+ "ImageContext": (_MOD_MODELS, "ImageContext"),
152
+ "ImageFormat": (_MOD_MODELS, "ImageFormat"),
153
+ "ManualDistribution": (_MOD_MODELS, "ManualDistribution"),
154
+ "ManualDistributionParams": (_MOD_MODELS, "ManualDistributionParams"),
155
+ "Modality": (_MOD_MODELS, "Modality"),
156
+ "ModalityContext": (_MOD_MODELS, "ModalityContext"),
157
+ "ModalityDataType": (_MOD_MODELS, "ModalityDataType"),
158
+ "ModelConfig": (_MOD_MODELS, "ModelConfig"),
159
+ "ModelProvider": (_MOD_MODELS, "ModelProvider"),
160
+ "UniformDistribution": (_MOD_MODELS, "UniformDistribution"),
161
+ "UniformDistributionParams": (_MOD_MODELS, "UniformDistributionParams"),
162
+ # processors
163
+ "DropColumnsProcessorConfig": (_MOD_PROCESSORS, "DropColumnsProcessorConfig"),
164
+ "ProcessorType": (_MOD_PROCESSORS, "ProcessorType"),
165
+ "SchemaTransformProcessorConfig": (_MOD_PROCESSORS, "SchemaTransformProcessorConfig"),
166
+ # run_config
167
+ "RunConfig": (f"{_MOD_BASE}.run_config", "RunConfig"),
168
+ # sampler_constraints
169
+ "ColumnInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ColumnInequalityConstraint"),
170
+ "ScalarInequalityConstraint": (_MOD_SAMPLER_CONSTRAINTS, "ScalarInequalityConstraint"),
171
+ # sampler_params
172
+ "BernoulliMixtureSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliMixtureSamplerParams"),
173
+ "BernoulliSamplerParams": (_MOD_SAMPLER_PARAMS, "BernoulliSamplerParams"),
174
+ "BinomialSamplerParams": (_MOD_SAMPLER_PARAMS, "BinomialSamplerParams"),
175
+ "CategorySamplerParams": (_MOD_SAMPLER_PARAMS, "CategorySamplerParams"),
176
+ "DatetimeSamplerParams": (_MOD_SAMPLER_PARAMS, "DatetimeSamplerParams"),
177
+ "GaussianSamplerParams": (_MOD_SAMPLER_PARAMS, "GaussianSamplerParams"),
178
+ "PersonFromFakerSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonFromFakerSamplerParams"),
179
+ "PersonSamplerParams": (_MOD_SAMPLER_PARAMS, "PersonSamplerParams"),
180
+ "PoissonSamplerParams": (_MOD_SAMPLER_PARAMS, "PoissonSamplerParams"),
181
+ "SamplerType": (_MOD_SAMPLER_PARAMS, "SamplerType"),
182
+ "ScipySamplerParams": (_MOD_SAMPLER_PARAMS, "ScipySamplerParams"),
183
+ "SubcategorySamplerParams": (_MOD_SAMPLER_PARAMS, "SubcategorySamplerParams"),
184
+ "TimeDeltaSamplerParams": (_MOD_SAMPLER_PARAMS, "TimeDeltaSamplerParams"),
185
+ "UniformSamplerParams": (_MOD_SAMPLER_PARAMS, "UniformSamplerParams"),
186
+ "UUIDSamplerParams": (_MOD_SAMPLER_PARAMS, "UUIDSamplerParams"),
187
+ # seed
188
+ "IndexRange": (_MOD_SEED, "IndexRange"),
189
+ "PartitionBlock": (_MOD_SEED, "PartitionBlock"),
190
+ "SamplingStrategy": (_MOD_SEED, "SamplingStrategy"),
191
+ "SeedConfig": (_MOD_SEED, "SeedConfig"),
192
+ # seed_source
193
+ "DataFrameSeedSource": (_MOD_SEED_SOURCE, "DataFrameSeedSource"),
194
+ "HuggingFaceSeedSource": (_MOD_SEED_SOURCE, "HuggingFaceSeedSource"),
195
+ "LocalFileSeedSource": (_MOD_SEED_SOURCE, "LocalFileSeedSource"),
196
+ # utils
197
+ "CodeLang": (f"{_MOD_UTILS}.code_lang", "CodeLang"),
198
+ "InfoType": (f"{_MOD_UTILS}.info", "InfoType"),
199
+ "TraceType": (f"{_MOD_UTILS}.trace_type", "TraceType"),
200
+ # validator_params
201
+ "CodeValidatorParams": (_MOD_VALIDATOR_PARAMS, "CodeValidatorParams"),
202
+ "LocalCallableValidatorParams": (_MOD_VALIDATOR_PARAMS, "LocalCallableValidatorParams"),
203
+ "RemoteValidatorParams": (_MOD_VALIDATOR_PARAMS, "RemoteValidatorParams"),
204
+ "ValidatorType": (_MOD_VALIDATOR_PARAMS, "ValidatorType"),
205
+ }
206
+
207
+ __all__ = list(_LAZY_IMPORTS.keys())
208
+
209
+
210
+ def __getattr__(name: str) -> object:
211
+ """Lazily import config module exports when accessed.
212
+
213
+ This allows fast imports of data_designer.config while deferring loading
214
+ of submodules until they're actually needed.
215
+ """
216
+ if name in _LAZY_IMPORTS:
217
+ module_path, attr_name = _LAZY_IMPORTS[name]
218
+ module = importlib.import_module(module_path)
219
+ return getattr(module, attr_name)
220
+
221
+ raise AttributeError(f"module 'data_designer.config' has no attribute {name!r}")
222
+
223
+
224
+ def __dir__() -> list[str]:
225
+ """Return list of available exports for tab-completion."""
226
+ return __all__
@@ -0,0 +1,67 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # IMPORTANT: This module must NOT import from any data_designer submodules (i.e., data_designer.*).
5
+ # These base abstractions are foundational and should only depend on pydantic and Python builtins.
6
+
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+
11
+ from pydantic import BaseModel, ConfigDict
12
+
13
+
14
+ class ConfigBase(BaseModel):
15
+ model_config = ConfigDict(
16
+ protected_namespaces=(),
17
+ use_enum_values=True,
18
+ arbitrary_types_allowed=True,
19
+ extra="forbid",
20
+ json_schema_mode_override="validation",
21
+ )
22
+
23
+
24
+ class SingleColumnConfig(ConfigBase, ABC):
25
+ """Abstract base class for all single-column configuration types.
26
+
27
+ This class serves as the foundation for all column configurations in DataDesigner,
28
+ defining shared fields and properties across all column types.
29
+
30
+ Attributes:
31
+ name: Unique name of the column to be generated.
32
+ drop: If True, the column will be generated but removed from the final dataset.
33
+ Useful for intermediate columns that are dependencies for other columns.
34
+ column_type: Discriminator field that identifies the specific column type.
35
+ Subclasses must override this field to specify the column type with a `Literal` value.
36
+ """
37
+
38
+ name: str
39
+ drop: bool = False
40
+ column_type: str
41
+
42
+ @staticmethod
43
+ def get_column_emoji() -> str:
44
+ return "🎨"
45
+
46
+ @property
47
+ @abstractmethod
48
+ def required_columns(self) -> list[str]:
49
+ """Returns a list of column names that must exist before this column can be generated.
50
+
51
+ Returns:
52
+ List of column names that this column depends on. Empty list indicates
53
+ no dependencies. Override in subclasses to specify dependencies.
54
+ """
55
+
56
+ @property
57
+ @abstractmethod
58
+ def side_effect_columns(self) -> list[str]:
59
+ """Returns a list of additional columns that this column will create as a side effect.
60
+
61
+ Some column types generate additional metadata or auxiliary columns alongside
62
+ the primary column (e.g., reasoning traces for LLM columns).
63
+
64
+ Returns:
65
+ List of column names that this column will create as a side effect. Empty list
66
+ indicates no side effect columns. Override in subclasses to specify side effects.
67
+ """
@@ -3,66 +3,28 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
- from abc import ABC, abstractmethod
7
- from typing import Annotated, Literal
6
+ from enum import Enum
7
+ from typing import Annotated, Any, Literal
8
8
 
9
- from pydantic import BaseModel, Discriminator, Field, model_validator
9
+ from pydantic import BaseModel, Discriminator, Field, field_serializer, field_validator, model_validator
10
10
  from typing_extensions import Self
11
11
 
12
- from data_designer.config.base import ConfigBase
12
+ from data_designer.config.base import ConfigBase, SingleColumnConfig
13
13
  from data_designer.config.errors import InvalidConfigError
14
14
  from data_designer.config.models import ImageContext
15
15
  from data_designer.config.sampler_params import SamplerParamsT, SamplerType
16
16
  from data_designer.config.utils.code_lang import CodeLang
17
- from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
17
+ from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
18
18
  from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
19
+ from data_designer.config.utils.trace_type import TraceType
19
20
  from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
20
21
 
21
22
 
22
- class SingleColumnConfig(ConfigBase, ABC):
23
- """Abstract base class for all single-column configuration types.
23
+ class GenerationStrategy(str, Enum):
24
+ """Strategy for custom column generation."""
24
25
 
25
- This class serves as the foundation for all column configurations in DataDesigner,
26
- defining shared fields and properties across all column types.
27
-
28
- Attributes:
29
- name: Unique name of the column to be generated.
30
- drop: If True, the column will be generated but removed from the final dataset.
31
- Useful for intermediate columns that are dependencies for other columns.
32
- column_type: Discriminator field that identifies the specific column type.
33
- Subclasses must override this field to specify the column type with a `Literal` value.
34
- """
35
-
36
- name: str
37
- drop: bool = False
38
- column_type: str
39
-
40
- @staticmethod
41
- def get_column_emoji() -> str:
42
- return "🎨"
43
-
44
- @property
45
- @abstractmethod
46
- def required_columns(self) -> list[str]:
47
- """Returns a list of column names that must exist before this column can be generated.
48
-
49
- Returns:
50
- List of column names that this column depends on. Empty list indicates
51
- no dependencies. Override in subclasses to specify dependencies.
52
- """
53
-
54
- @property
55
- @abstractmethod
56
- def side_effect_columns(self) -> list[str]:
57
- """Returns a list of additional columns that this column will create as a side effect.
58
-
59
- Some column types generate additional metadata or auxiliary columns alongside
60
- the primary column (e.g., reasoning traces for LLM columns).
61
-
62
- Returns:
63
- List of column names that this column will create as a side effect. Empty list
64
- indicates no side effect columns. Override in subclasses to specify side effects.
65
- """
26
+ CELL_BY_CELL = "cell_by_cell"
27
+ FULL_COLUMN = "full_column"
66
28
 
67
29
 
68
30
  class SamplerColumnConfig(SingleColumnConfig):
@@ -143,8 +105,8 @@ class LLMTextColumnConfig(SingleColumnConfig):
143
105
 
144
106
  LLM text columns generate free-form text content using language models via LiteLLM.
145
107
  Prompts support Jinja2 templating to reference values from other columns, enabling
146
- context-aware generation. The generated text can optionally include reasoning traces
147
- when models support extended thinking.
108
+ context-aware generation. The generated text can optionally include message traces
109
+ capturing the full conversation history.
148
110
 
149
111
  Attributes:
150
112
  prompt: Prompt template for text generation. Supports Jinja2 syntax to
@@ -159,6 +121,18 @@ class LLMTextColumnConfig(SingleColumnConfig):
159
121
  `LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
160
122
  multi_modal_context: Optional list of image contexts for multi-modal generation.
161
123
  Enables vision-capable models to generate text based on image inputs.
124
+ tool_alias: Optional alias of the tool configuration to use for MCP tool calls.
125
+ Must match a tool alias defined when initializing the DataDesignerConfigBuilder.
126
+ When provided, the model may call permitted tools during generation.
127
+ with_trace: Specifies what trace information to capture in a `{column_name}__trace`
128
+ column. Options are:
129
+ - `TraceType.NONE` (default): No trace is captured.
130
+ - `TraceType.LAST_MESSAGE`: Only the final assistant message is captured.
131
+ - `TraceType.ALL_MESSAGES`: Full conversation history (system/user/assistant/tool).
132
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content` column
133
+ containing only the reasoning_content from the final assistant response. This is
134
+ useful for models that expose chain-of-thought reasoning separately from the main
135
+ response. Defaults to False.
162
136
  column_type: Discriminator field, always "llm-text" for this configuration type.
163
137
  """
164
138
 
@@ -166,6 +140,9 @@ class LLMTextColumnConfig(SingleColumnConfig):
166
140
  model_alias: str
167
141
  system_prompt: str | None = None
168
142
  multi_modal_context: list[ImageContext] | None = None
143
+ tool_alias: str | None = None
144
+ with_trace: TraceType = TraceType.NONE
145
+ extract_reasoning_content: bool = False
169
146
  column_type: Literal["llm-text"] = "llm-text"
170
147
 
171
148
  @staticmethod
@@ -186,14 +163,20 @@ class LLMTextColumnConfig(SingleColumnConfig):
186
163
 
187
164
  @property
188
165
  def side_effect_columns(self) -> list[str]:
189
- """Returns the reasoning trace column, which may be generated alongside the main column.
166
+ """Returns side-effect columns that may be generated alongside the main column.
190
167
 
191
- Reasoning traces are only returned if the served model parses and returns reasoning content.
168
+ Side-effect columns include:
169
+ - `{name}__trace`: Generated when `with_trace` is not `TraceType.NONE` on the column
170
+ config.
171
+ - `{name}__reasoning_content`: Generated when `extract_reasoning_content=True`.
192
172
 
193
173
  Returns:
194
- List containing the reasoning trace column name.
174
+ List of side-effect column names.
195
175
  """
196
- return [f"{self.name}{REASONING_TRACE_COLUMN_POSTFIX}"]
176
+ return [
177
+ *([f"{self.name}{TRACE_COLUMN_POSTFIX}"] if self.with_trace != TraceType.NONE else []),
178
+ *([f"{self.name}{REASONING_CONTENT_COLUMN_POSTFIX}"] if self.extract_reasoning_content else []),
179
+ ]
197
180
 
198
181
  @model_validator(mode="after")
199
182
  def assert_prompt_valid_jinja(self) -> Self:
@@ -216,7 +199,7 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
216
199
 
217
200
  Extends LLMTextColumnConfig to generate code snippets in specific programming languages
218
201
  or SQL dialects. The generated code is automatically extracted from markdown code blocks
219
- for the specified language. Inherits all prompt templating capabilities.
202
+ for the specified language. Inherits all prompt templating capabilities from LLMTextColumnConfig.
220
203
 
221
204
  Attributes:
222
205
  code_lang: Programming language or SQL dialect for code generation. Supported
@@ -224,6 +207,16 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
224
207
  "rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
225
208
  "sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
226
209
  column_type: Discriminator field, always "llm-code" for this configuration type.
210
+
211
+ Inherited Attributes:
212
+ prompt: Prompt template for code generation (supports Jinja2).
213
+ model_alias: Alias of the model configuration to use.
214
+ system_prompt: Optional system prompt (supports Jinja2).
215
+ multi_modal_context: Optional image contexts for multi-modal generation.
216
+ tool_alias: Optional tool configuration alias for MCP tool calls.
217
+ with_trace: `TraceType` controlling what is captured in a `{column_name}__trace` column (`TraceType.NONE` by default, meaning no trace column).
218
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
219
+ column containing the reasoning content from the final assistant response.
227
220
  """
228
221
 
229
222
  code_lang: CodeLang
@@ -239,13 +232,24 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
239
232
 
240
233
  Extends LLMTextColumnConfig to generate structured data conforming to a specified schema.
241
234
  Uses JSON schema or Pydantic models to define the expected output structure, enabling
242
- type-safe and validated structured output generation. Inherits prompt templating capabilities.
235
+ type-safe and validated structured output generation. Inherits prompt templating capabilities
236
+ from LLMTextColumnConfig.
243
237
 
244
238
  Attributes:
245
239
  output_format: The schema defining the expected output structure. Can be either:
246
240
  - A Pydantic BaseModel class (recommended)
247
241
  - A JSON schema dictionary
248
242
  column_type: Discriminator field, always "llm-structured" for this configuration type.
243
+
244
+ Inherited Attributes:
245
+ prompt: Prompt template for structured generation (supports Jinja2).
246
+ model_alias: Alias of the model configuration to use.
247
+ system_prompt: Optional system prompt (supports Jinja2).
248
+ multi_modal_context: Optional image contexts for multi-modal generation.
249
+ tool_alias: Optional tool configuration alias for MCP tool calls.
250
+ with_trace: Trace mode (a TraceType value). Any value other than TraceType.NONE creates a `{column_name}__trace` column with message history.
251
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
252
+ column containing the reasoning content from the final assistant response.
249
253
  """
250
254
 
251
255
  output_format: dict | type[BaseModel]
@@ -293,13 +297,24 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
293
297
 
294
298
  Extends LLMTextColumnConfig to create judge columns that evaluate and score other
295
299
  generated content based on the defined criteria. Useful for quality assessment, preference
296
- ranking, and multi-dimensional evaluation of generated data.
300
+ ranking, and multi-dimensional evaluation of generated data. Inherits prompt templating
301
+ capabilities from LLMTextColumnConfig.
297
302
 
298
303
  Attributes:
299
304
  scores: List of Score objects defining the evaluation dimensions. Each score
300
305
  represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
301
306
  Must contain at least one score.
302
307
  column_type: Discriminator field, always "llm-judge" for this configuration type.
308
+
309
+ Inherited Attributes:
310
+ prompt: Prompt template for the judge evaluation (supports Jinja2).
311
+ model_alias: Alias of the model configuration to use.
312
+ system_prompt: Optional system prompt (supports Jinja2).
313
+ multi_modal_context: Optional image contexts for multi-modal generation.
314
+ tool_alias: Optional tool configuration alias for MCP tool calls.
315
+ with_trace: Trace mode (a TraceType value). Any value other than TraceType.NONE creates a `{column_name}__trace` column with message history.
316
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
317
+ column containing the reasoning content from the final assistant response.
303
318
  """
304
319
 
305
320
  scores: list[Score] = Field(..., min_length=1)
@@ -468,3 +483,80 @@ class EmbeddingColumnConfig(SingleColumnConfig):
468
483
  @property
469
484
  def side_effect_columns(self) -> list[str]:
470
485
  return []
486
+
487
+
488
+ class CustomColumnConfig(SingleColumnConfig):
489
+ """Configuration for custom user-defined column generators.
490
+
491
+ Custom columns allow users to provide their own generation logic via a callable function
492
+ decorated with `@custom_column_generator`. Two strategies are supported: cell_by_cell
493
+ (default, row-based) and full_column (batch-based with DataFrame access).
494
+
495
+ Attributes:
496
+ generator_function: A callable decorated with @custom_column_generator.
497
+ generation_strategy: "cell_by_cell" (row-based) or "full_column" (batch-based).
498
+ generator_params: Optional typed configuration object (Pydantic BaseModel) passed
499
+ as the second argument to the generator function.
500
+ column_type: Discriminator field, always "custom" for this configuration type.
501
+ """
502
+
503
+ generator_function: Any = Field(description="Function decorated with @custom_column_generator")
504
+ generation_strategy: GenerationStrategy = Field(
505
+ default=GenerationStrategy.CELL_BY_CELL,
506
+ description="Generation strategy: 'cell_by_cell' for row-based or 'full_column' for batch-based",
507
+ )
508
+ generator_params: BaseModel | None = Field(
509
+ default=None,
510
+ description="Optional typed configuration object passed as second argument to generator function",
511
+ )
512
+ column_type: Literal["custom"] = "custom"
513
+
514
+ @field_validator("generator_function")
515
+ @classmethod
516
+ def _validate_generator_function(cls, v: Any) -> Any:
517
+ if not callable(v):
518
+ raise ValueError("generator_function must be callable")
519
+ if not hasattr(v, "custom_column_metadata"):
520
+ raise ValueError("generator_function must be decorated with @custom_column_generator")
521
+ return v
522
+
523
+ @staticmethod
524
+ def get_column_emoji() -> str:
525
+ return "🔧"
526
+
527
+ @property
528
+ def required_columns(self) -> list[str]:
529
+ """Returns the columns required for custom generation (from decorator metadata)."""
530
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
531
+ return metadata.get("required_columns", [])
532
+
533
+ @property
534
+ def side_effect_columns(self) -> list[str]:
535
+ """Returns additional columns created by this generator (from decorator metadata)."""
536
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
537
+ return metadata.get("side_effect_columns", [])
538
+
539
+ @property
540
+ def model_aliases(self) -> list[str]:
541
+ """Returns model aliases for LLM access and health checks (from decorator metadata)."""
542
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
543
+ return metadata.get("model_aliases", [])
544
+
545
+ @field_serializer("generator_function")
546
+ def serialize_generator_function(self, v: Any) -> str:
547
+ return getattr(v, "__name__", repr(v))
548
+
549
+ @field_serializer("generator_params")
550
+ def serialize_generator_params(self, v: BaseModel | None) -> dict[str, Any] | None:
551
+ if v is None:
552
+ return None
553
+ return v.model_dump()
554
+
555
+ @model_validator(mode="after")
556
+ def validate_generator_function(self) -> Self:
557
+ if not callable(self.generator_function):
558
+ raise InvalidConfigError(
559
+ f"🛑 `generator_function` must be a callable for custom column '{self.name}'. "
560
+ f"Expected a function decorated with @custom_column_generator."
561
+ )
562
+ return self
@@ -6,6 +6,7 @@ from __future__ import annotations
6
6
  from typing_extensions import TypeAlias
7
7
 
8
8
  from data_designer.config.column_configs import (
9
+ CustomColumnConfig,
9
10
  EmbeddingColumnConfig,
10
11
  ExpressionColumnConfig,
11
12
  LLMCodeColumnConfig,
@@ -28,7 +29,8 @@ from data_designer.plugin_manager import PluginManager
28
29
  plugin_manager = PluginManager()
29
30
 
30
31
  ColumnConfigT: TypeAlias = (
31
- ExpressionColumnConfig
32
+ CustomColumnConfig
33
+ | ExpressionColumnConfig
32
34
  | LLMCodeColumnConfig
33
35
  | LLMJudgeColumnConfig
34
36
  | LLMStructuredColumnConfig
@@ -87,6 +89,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
87
89
  DataDesignerColumnType.EMBEDDING,
88
90
  DataDesignerColumnType.VALIDATION,
89
91
  DataDesignerColumnType.EXPRESSION,
92
+ DataDesignerColumnType.CUSTOM,
90
93
  ]
91
94
  display_order.extend(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
92
95
  return display_order
@@ -129,6 +132,7 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
129
132
 
130
133
 
131
134
  _COLUMN_TYPE_CONFIG_CLS_MAP = {
135
+ DataDesignerColumnType.CUSTOM: CustomColumnConfig,
132
136
  DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
133
137
  DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
134
138
  DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,