data-designer-config 0.4.0rc3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/config/__init__.py +219 -142
- data_designer/config/base.py +37 -39
- data_designer/config/column_configs.py +147 -61
- data_designer/config/column_types.py +5 -1
- data_designer/config/config_builder.py +103 -3
- data_designer/config/custom_column.py +64 -0
- data_designer/config/data_designer_config.py +5 -1
- data_designer/config/exportable_config.py +59 -0
- data_designer/config/mcp.py +109 -0
- data_designer/config/run_config.py +1 -6
- data_designer/config/utils/constants.py +9 -0
- data_designer/config/utils/trace_type.py +24 -0
- data_designer/config/utils/visualization.py +6 -0
- {data_designer_config-0.4.0rc3.dist-info → data_designer_config-0.5.0rc1.dist-info}/METADATA +1 -1
- {data_designer_config-0.4.0rc3.dist-info → data_designer_config-0.5.0rc1.dist-info}/RECORD +16 -13
- data_designer/config/_version.py +0 -34
- {data_designer_config-0.4.0rc3.dist-info → data_designer_config-0.5.0rc1.dist-info}/WHEEL +0 -0
|
@@ -3,66 +3,28 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from typing import Annotated, Literal
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Annotated, Any, Literal
|
|
8
8
|
|
|
9
|
-
from pydantic import BaseModel, Discriminator, Field, model_validator
|
|
9
|
+
from pydantic import BaseModel, Discriminator, Field, field_serializer, field_validator, model_validator
|
|
10
10
|
from typing_extensions import Self
|
|
11
11
|
|
|
12
|
-
from data_designer.config.base import ConfigBase
|
|
12
|
+
from data_designer.config.base import ConfigBase, SingleColumnConfig
|
|
13
13
|
from data_designer.config.errors import InvalidConfigError
|
|
14
14
|
from data_designer.config.models import ImageContext
|
|
15
15
|
from data_designer.config.sampler_params import SamplerParamsT, SamplerType
|
|
16
16
|
from data_designer.config.utils.code_lang import CodeLang
|
|
17
|
-
from data_designer.config.utils.constants import TRACE_COLUMN_POSTFIX
|
|
17
|
+
from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
|
|
18
18
|
from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
|
|
19
|
+
from data_designer.config.utils.trace_type import TraceType
|
|
19
20
|
from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
|
|
20
21
|
|
|
21
22
|
|
|
22
|
-
class
|
|
23
|
-
"""
|
|
23
|
+
class GenerationStrategy(str, Enum):
|
|
24
|
+
"""Strategy for custom column generation."""
|
|
24
25
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Attributes:
|
|
29
|
-
name: Unique name of the column to be generated.
|
|
30
|
-
drop: If True, the column will be generated but removed from the final dataset.
|
|
31
|
-
Useful for intermediate columns that are dependencies for other columns.
|
|
32
|
-
column_type: Discriminator field that identifies the specific column type.
|
|
33
|
-
Subclasses must override this field to specify the column type with a `Literal` value.
|
|
34
|
-
"""
|
|
35
|
-
|
|
36
|
-
name: str
|
|
37
|
-
drop: bool = False
|
|
38
|
-
column_type: str
|
|
39
|
-
|
|
40
|
-
@staticmethod
|
|
41
|
-
def get_column_emoji() -> str:
|
|
42
|
-
return "🎨"
|
|
43
|
-
|
|
44
|
-
@property
|
|
45
|
-
@abstractmethod
|
|
46
|
-
def required_columns(self) -> list[str]:
|
|
47
|
-
"""Returns a list of column names that must exist before this column can be generated.
|
|
48
|
-
|
|
49
|
-
Returns:
|
|
50
|
-
List of column names that this column depends on. Empty list indicates
|
|
51
|
-
no dependencies. Override in subclasses to specify dependencies.
|
|
52
|
-
"""
|
|
53
|
-
|
|
54
|
-
@property
|
|
55
|
-
@abstractmethod
|
|
56
|
-
def side_effect_columns(self) -> list[str]:
|
|
57
|
-
"""Returns a list of additional columns that this column will create as a side effect.
|
|
58
|
-
|
|
59
|
-
Some column types generate additional metadata or auxiliary columns alongside
|
|
60
|
-
the primary column (e.g., reasoning traces for LLM columns).
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
List of column names that this column will create as a side effect. Empty list
|
|
64
|
-
indicates no side effect columns. Override in subclasses to specify side effects.
|
|
65
|
-
"""
|
|
26
|
+
CELL_BY_CELL = "cell_by_cell"
|
|
27
|
+
FULL_COLUMN = "full_column"
|
|
66
28
|
|
|
67
29
|
|
|
68
30
|
class SamplerColumnConfig(SingleColumnConfig):
|
|
@@ -159,10 +121,18 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
159
121
|
`LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
|
|
160
122
|
multi_modal_context: Optional list of image contexts for multi-modal generation.
|
|
161
123
|
Enables vision-capable models to generate text based on image inputs.
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
124
|
+
tool_alias: Optional alias of the tool configuration to use for MCP tool calls.
|
|
125
|
+
Must match a tool alias defined when initializing the DataDesignerConfigBuilder.
|
|
126
|
+
When provided, the model may call permitted tools during generation.
|
|
127
|
+
with_trace: Specifies what trace information to capture in a `{column_name}__trace`
|
|
128
|
+
column. Options are:
|
|
129
|
+
- `TraceType.NONE` (default): No trace is captured.
|
|
130
|
+
- `TraceType.LAST_MESSAGE`: Only the final assistant message is captured.
|
|
131
|
+
- `TraceType.ALL_MESSAGES`: Full conversation history (system/user/assistant/tool).
|
|
132
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content` column
|
|
133
|
+
containing only the reasoning_content from the final assistant response. This is
|
|
134
|
+
useful for models that expose chain-of-thought reasoning separately from the main
|
|
135
|
+
response. Defaults to False.
|
|
166
136
|
column_type: Discriminator field, always "llm-text" for this configuration type.
|
|
167
137
|
"""
|
|
168
138
|
|
|
@@ -170,7 +140,9 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
170
140
|
model_alias: str
|
|
171
141
|
system_prompt: str | None = None
|
|
172
142
|
multi_modal_context: list[ImageContext] | None = None
|
|
173
|
-
|
|
143
|
+
tool_alias: str | None = None
|
|
144
|
+
with_trace: TraceType = TraceType.NONE
|
|
145
|
+
extract_reasoning_content: bool = False
|
|
174
146
|
column_type: Literal["llm-text"] = "llm-text"
|
|
175
147
|
|
|
176
148
|
@staticmethod
|
|
@@ -191,15 +163,20 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
191
163
|
|
|
192
164
|
@property
|
|
193
165
|
def side_effect_columns(self) -> list[str]:
|
|
194
|
-
"""Returns
|
|
166
|
+
"""Returns side-effect columns that may be generated alongside the main column.
|
|
195
167
|
|
|
196
|
-
|
|
197
|
-
when `
|
|
168
|
+
Side-effect columns include:
|
|
169
|
+
- `{name}__trace`: Generated when `with_trace` is not `TraceType.NONE` on the column
|
|
170
|
+
config.
|
|
171
|
+
- `{name}__reasoning_content`: Generated when `extract_reasoning_content=True`.
|
|
198
172
|
|
|
199
173
|
Returns:
|
|
200
|
-
List
|
|
174
|
+
List of side-effect column names.
|
|
201
175
|
"""
|
|
202
|
-
return [
|
|
176
|
+
return [
|
|
177
|
+
*([f"{self.name}{TRACE_COLUMN_POSTFIX}"] if self.with_trace != TraceType.NONE else []),
|
|
178
|
+
*([f"{self.name}{REASONING_CONTENT_COLUMN_POSTFIX}"] if self.extract_reasoning_content else []),
|
|
179
|
+
]
|
|
203
180
|
|
|
204
181
|
@model_validator(mode="after")
|
|
205
182
|
def assert_prompt_valid_jinja(self) -> Self:
|
|
@@ -222,7 +199,7 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
|
|
|
222
199
|
|
|
223
200
|
Extends LLMTextColumnConfig to generate code snippets in specific programming languages
|
|
224
201
|
or SQL dialects. The generated code is automatically extracted from markdown code blocks
|
|
225
|
-
for the specified language. Inherits all prompt templating capabilities.
|
|
202
|
+
for the specified language. Inherits all prompt templating capabilities from LLMTextColumnConfig.
|
|
226
203
|
|
|
227
204
|
Attributes:
|
|
228
205
|
code_lang: Programming language or SQL dialect for code generation. Supported
|
|
@@ -230,6 +207,16 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
|
|
|
230
207
|
"rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
|
|
231
208
|
"sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
|
|
232
209
|
column_type: Discriminator field, always "llm-code" for this configuration type.
|
|
210
|
+
|
|
211
|
+
Inherited Attributes:
|
|
212
|
+
prompt: Prompt template for code generation (supports Jinja2).
|
|
213
|
+
model_alias: Alias of the model configuration to use.
|
|
214
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
215
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
216
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
217
|
+
with_trace: `TraceType` selecting what is captured in a `{column_name}__trace` column (`TraceType.NONE` captures nothing).
|
|
218
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
219
|
+
column containing the reasoning content from the final assistant response.
|
|
233
220
|
"""
|
|
234
221
|
|
|
235
222
|
code_lang: CodeLang
|
|
@@ -245,13 +232,24 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
|
|
|
245
232
|
|
|
246
233
|
Extends LLMTextColumnConfig to generate structured data conforming to a specified schema.
|
|
247
234
|
Uses JSON schema or Pydantic models to define the expected output structure, enabling
|
|
248
|
-
type-safe and validated structured output generation. Inherits prompt templating capabilities
|
|
235
|
+
type-safe and validated structured output generation. Inherits prompt templating capabilities
|
|
236
|
+
from LLMTextColumnConfig.
|
|
249
237
|
|
|
250
238
|
Attributes:
|
|
251
239
|
output_format: The schema defining the expected output structure. Can be either:
|
|
252
240
|
- A Pydantic BaseModel class (recommended)
|
|
253
241
|
- A JSON schema dictionary
|
|
254
242
|
column_type: Discriminator field, always "llm-structured" for this configuration type.
|
|
243
|
+
|
|
244
|
+
Inherited Attributes:
|
|
245
|
+
prompt: Prompt template for structured generation (supports Jinja2).
|
|
246
|
+
model_alias: Alias of the model configuration to use.
|
|
247
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
248
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
249
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
250
|
+
with_trace: `TraceType` selecting what is captured in a `{column_name}__trace` column (`TraceType.NONE` captures nothing).
|
|
251
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
252
|
+
column containing the reasoning content from the final assistant response.
|
|
255
253
|
"""
|
|
256
254
|
|
|
257
255
|
output_format: dict | type[BaseModel]
|
|
@@ -299,13 +297,24 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
|
|
|
299
297
|
|
|
300
298
|
Extends LLMTextColumnConfig to create judge columns that evaluate and score other
|
|
301
299
|
generated content based on the defined criteria. Useful for quality assessment, preference
|
|
302
|
-
ranking, and multi-dimensional evaluation of generated data.
|
|
300
|
+
ranking, and multi-dimensional evaluation of generated data. Inherits prompt templating
|
|
301
|
+
capabilities from LLMTextColumnConfig.
|
|
303
302
|
|
|
304
303
|
Attributes:
|
|
305
304
|
scores: List of Score objects defining the evaluation dimensions. Each score
|
|
306
305
|
represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
|
|
307
306
|
Must contain at least one score.
|
|
308
307
|
column_type: Discriminator field, always "llm-judge" for this configuration type.
|
|
308
|
+
|
|
309
|
+
Inherited Attributes:
|
|
310
|
+
prompt: Prompt template for the judge evaluation (supports Jinja2).
|
|
311
|
+
model_alias: Alias of the model configuration to use.
|
|
312
|
+
system_prompt: Optional system prompt (supports Jinja2).
|
|
313
|
+
multi_modal_context: Optional image contexts for multi-modal generation.
|
|
314
|
+
tool_alias: Optional tool configuration alias for MCP tool calls.
|
|
315
|
+
with_trace: `TraceType` selecting what is captured in a `{column_name}__trace` column (`TraceType.NONE` captures nothing).
|
|
316
|
+
extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
|
|
317
|
+
column containing the reasoning content from the final assistant response.
|
|
309
318
|
"""
|
|
310
319
|
|
|
311
320
|
scores: list[Score] = Field(..., min_length=1)
|
|
@@ -474,3 +483,80 @@ class EmbeddingColumnConfig(SingleColumnConfig):
|
|
|
474
483
|
@property
|
|
475
484
|
def side_effect_columns(self) -> list[str]:
|
|
476
485
|
return []
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
class CustomColumnConfig(SingleColumnConfig):
|
|
489
|
+
"""Configuration for custom user-defined column generators.
|
|
490
|
+
|
|
491
|
+
Custom columns allow users to provide their own generation logic via a callable function
|
|
492
|
+
decorated with `@custom_column_generator`. Two strategies are supported: cell_by_cell
|
|
493
|
+
(default, row-based) and full_column (batch-based with DataFrame access).
|
|
494
|
+
|
|
495
|
+
Attributes:
|
|
496
|
+
generator_function: A callable decorated with @custom_column_generator.
|
|
497
|
+
generation_strategy: "cell_by_cell" (row-based) or "full_column" (batch-based).
|
|
498
|
+
generator_params: Optional typed configuration object (Pydantic BaseModel) passed
|
|
499
|
+
as the second argument to the generator function.
|
|
500
|
+
column_type: Discriminator field, always "custom" for this configuration type.
|
|
501
|
+
"""
|
|
502
|
+
|
|
503
|
+
generator_function: Any = Field(description="Function decorated with @custom_column_generator")
|
|
504
|
+
generation_strategy: GenerationStrategy = Field(
|
|
505
|
+
default=GenerationStrategy.CELL_BY_CELL,
|
|
506
|
+
description="Generation strategy: 'cell_by_cell' for row-based or 'full_column' for batch-based",
|
|
507
|
+
)
|
|
508
|
+
generator_params: BaseModel | None = Field(
|
|
509
|
+
default=None,
|
|
510
|
+
description="Optional typed configuration object passed as second argument to generator function",
|
|
511
|
+
)
|
|
512
|
+
column_type: Literal["custom"] = "custom"
|
|
513
|
+
|
|
514
|
+
@field_validator("generator_function")
|
|
515
|
+
@classmethod
|
|
516
|
+
def _validate_generator_function(cls, v: Any) -> Any:
|
|
517
|
+
if not callable(v):
|
|
518
|
+
raise ValueError("generator_function must be callable")
|
|
519
|
+
if not hasattr(v, "custom_column_metadata"):
|
|
520
|
+
raise ValueError("generator_function must be decorated with @custom_column_generator")
|
|
521
|
+
return v
|
|
522
|
+
|
|
523
|
+
@staticmethod
|
|
524
|
+
def get_column_emoji() -> str:
|
|
525
|
+
return "🔧"
|
|
526
|
+
|
|
527
|
+
@property
|
|
528
|
+
def required_columns(self) -> list[str]:
|
|
529
|
+
"""Returns the columns required for custom generation (from decorator metadata)."""
|
|
530
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
531
|
+
return metadata.get("required_columns", [])
|
|
532
|
+
|
|
533
|
+
@property
|
|
534
|
+
def side_effect_columns(self) -> list[str]:
|
|
535
|
+
"""Returns additional columns created by this generator (from decorator metadata)."""
|
|
536
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
537
|
+
return metadata.get("side_effect_columns", [])
|
|
538
|
+
|
|
539
|
+
@property
|
|
540
|
+
def model_aliases(self) -> list[str]:
|
|
541
|
+
"""Returns model aliases for LLM access and health checks (from decorator metadata)."""
|
|
542
|
+
metadata = getattr(self.generator_function, "custom_column_metadata", {})
|
|
543
|
+
return metadata.get("model_aliases", [])
|
|
544
|
+
|
|
545
|
+
@field_serializer("generator_function")
|
|
546
|
+
def serialize_generator_function(self, v: Any) -> str:
|
|
547
|
+
return getattr(v, "__name__", repr(v))
|
|
548
|
+
|
|
549
|
+
@field_serializer("generator_params")
|
|
550
|
+
def serialize_generator_params(self, v: BaseModel | None) -> dict[str, Any] | None:
|
|
551
|
+
if v is None:
|
|
552
|
+
return None
|
|
553
|
+
return v.model_dump()
|
|
554
|
+
|
|
555
|
+
@model_validator(mode="after")
|
|
556
|
+
def validate_generator_function(self) -> Self:
|
|
557
|
+
if not callable(self.generator_function):
|
|
558
|
+
raise InvalidConfigError(
|
|
559
|
+
f"🛑 `generator_function` must be a callable for custom column '{self.name}'. "
|
|
560
|
+
f"Expected a function decorated with @custom_column_generator."
|
|
561
|
+
)
|
|
562
|
+
return self
|
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
from typing_extensions import TypeAlias
|
|
7
7
|
|
|
8
8
|
from data_designer.config.column_configs import (
|
|
9
|
+
CustomColumnConfig,
|
|
9
10
|
EmbeddingColumnConfig,
|
|
10
11
|
ExpressionColumnConfig,
|
|
11
12
|
LLMCodeColumnConfig,
|
|
@@ -28,7 +29,8 @@ from data_designer.plugin_manager import PluginManager
|
|
|
28
29
|
plugin_manager = PluginManager()
|
|
29
30
|
|
|
30
31
|
ColumnConfigT: TypeAlias = (
|
|
31
|
-
|
|
32
|
+
CustomColumnConfig
|
|
33
|
+
| ExpressionColumnConfig
|
|
32
34
|
| LLMCodeColumnConfig
|
|
33
35
|
| LLMJudgeColumnConfig
|
|
34
36
|
| LLMStructuredColumnConfig
|
|
@@ -87,6 +89,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
|
87
89
|
DataDesignerColumnType.EMBEDDING,
|
|
88
90
|
DataDesignerColumnType.VALIDATION,
|
|
89
91
|
DataDesignerColumnType.EXPRESSION,
|
|
92
|
+
DataDesignerColumnType.CUSTOM,
|
|
90
93
|
]
|
|
91
94
|
display_order.extend(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
|
|
92
95
|
return display_order
|
|
@@ -129,6 +132,7 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
|
129
132
|
|
|
130
133
|
|
|
131
134
|
_COLUMN_TYPE_CONFIG_CLS_MAP = {
|
|
135
|
+
DataDesignerColumnType.CUSTOM: CustomColumnConfig,
|
|
132
136
|
DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
|
|
133
137
|
DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
|
|
134
138
|
DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
|
|
@@ -13,7 +13,6 @@ from pygments.lexers import PythonLexer
|
|
|
13
13
|
from typing_extensions import Self
|
|
14
14
|
|
|
15
15
|
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
16
|
-
from data_designer.config.base import ExportableConfigBase
|
|
17
16
|
from data_designer.config.column_configs import SeedDatasetColumnConfig
|
|
18
17
|
from data_designer.config.column_types import (
|
|
19
18
|
ColumnConfigT,
|
|
@@ -25,6 +24,8 @@ from data_designer.config.data_designer_config import DataDesignerConfig
|
|
|
25
24
|
from data_designer.config.dataset_builders import BuildStage
|
|
26
25
|
from data_designer.config.default_model_settings import get_default_model_configs
|
|
27
26
|
from data_designer.config.errors import BuilderConfigurationError, BuilderSerializationError, InvalidColumnTypeError
|
|
27
|
+
from data_designer.config.exportable_config import ExportableConfigBase
|
|
28
|
+
from data_designer.config.mcp import ToolConfig
|
|
28
29
|
from data_designer.config.models import ModelConfig, load_model_configs
|
|
29
30
|
from data_designer.config.processors import ProcessorConfigT, ProcessorType, get_processor_config_from_kwargs
|
|
30
31
|
from data_designer.config.sampler_constraints import (
|
|
@@ -93,7 +94,10 @@ class DataDesignerConfigBuilder:
|
|
|
93
94
|
json_config = json.loads(serialize_data(smart_load_yaml(config)))
|
|
94
95
|
builder_config = BuilderConfig.model_validate(json_config)
|
|
95
96
|
|
|
96
|
-
builder = cls(
|
|
97
|
+
builder = cls(
|
|
98
|
+
model_configs=builder_config.data_designer.model_configs,
|
|
99
|
+
tool_configs=builder_config.data_designer.tool_configs,
|
|
100
|
+
)
|
|
97
101
|
data_designer_config = builder_config.data_designer
|
|
98
102
|
|
|
99
103
|
for col in data_designer_config.columns:
|
|
@@ -111,7 +115,11 @@ class DataDesignerConfigBuilder:
|
|
|
111
115
|
|
|
112
116
|
return builder
|
|
113
117
|
|
|
114
|
-
def __init__(
|
|
118
|
+
def __init__(
|
|
119
|
+
self,
|
|
120
|
+
model_configs: list[ModelConfig] | str | Path | None = None,
|
|
121
|
+
tool_configs: list[ToolConfig] | None = None,
|
|
122
|
+
):
|
|
115
123
|
"""Initialize a new DataDesignerConfigBuilder instance.
|
|
116
124
|
|
|
117
125
|
Args:
|
|
@@ -119,9 +127,13 @@ class DataDesignerConfigBuilder:
|
|
|
119
127
|
- None to use default model configurations in local mode
|
|
120
128
|
- A list of ModelConfig objects
|
|
121
129
|
- A string or Path to a model configuration file
|
|
130
|
+
tool_configs: Tool configurations for MCP tool calling. Can be:
|
|
131
|
+
- None if no tool configs are needed
|
|
132
|
+
- A list of ToolConfig objects
|
|
122
133
|
"""
|
|
123
134
|
self._column_configs = {}
|
|
124
135
|
self._model_configs = _load_model_configs(model_configs)
|
|
136
|
+
self._tool_configs: list[ToolConfig] = tool_configs or []
|
|
125
137
|
self._processor_configs: list[ProcessorConfigT] = []
|
|
126
138
|
self._seed_config: SeedConfig | None = None
|
|
127
139
|
self._constraints: list[ColumnConstraintT] = []
|
|
@@ -136,6 +148,15 @@ class DataDesignerConfigBuilder:
|
|
|
136
148
|
"""
|
|
137
149
|
return self._model_configs
|
|
138
150
|
|
|
151
|
+
@property
|
|
152
|
+
def tool_configs(self) -> list[ToolConfig]:
|
|
153
|
+
"""Get the tool configurations for this builder.
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
A list of ToolConfig objects used for MCP tool calling.
|
|
157
|
+
"""
|
|
158
|
+
return self._tool_configs
|
|
159
|
+
|
|
139
160
|
@property
|
|
140
161
|
def allowed_references(self) -> list[str]:
|
|
141
162
|
"""Get all referenceable variables allowed in prompt templates and expressions.
|
|
@@ -184,6 +205,38 @@ class DataDesignerConfigBuilder:
|
|
|
184
205
|
)
|
|
185
206
|
return self
|
|
186
207
|
|
|
208
|
+
def add_tool_config(self, tool_config: ToolConfig) -> Self:
|
|
209
|
+
"""Add a tool configuration to the current Data Designer configuration.
|
|
210
|
+
|
|
211
|
+
Args:
|
|
212
|
+
tool_config: The tool configuration to add.
|
|
213
|
+
|
|
214
|
+
Returns:
|
|
215
|
+
The current Data Designer config builder instance.
|
|
216
|
+
|
|
217
|
+
Raises:
|
|
218
|
+
BuilderConfigurationError: If a tool configuration with the same alias already exists.
|
|
219
|
+
"""
|
|
220
|
+
if tool_config.tool_alias in {tc.tool_alias for tc in self._tool_configs}:
|
|
221
|
+
raise BuilderConfigurationError(
|
|
222
|
+
f"Tool configuration with alias {tool_config.tool_alias} already exists. "
|
|
223
|
+
"Please delete the existing tool configuration or choose a different alias."
|
|
224
|
+
)
|
|
225
|
+
self._tool_configs.append(tool_config)
|
|
226
|
+
return self
|
|
227
|
+
|
|
228
|
+
def delete_tool_config(self, alias: str) -> Self:
|
|
229
|
+
"""Delete a tool configuration from the current Data Designer configuration by alias.
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
alias: The alias of the tool configuration to delete.
|
|
233
|
+
|
|
234
|
+
Returns:
|
|
235
|
+
The current Data Designer config builder instance.
|
|
236
|
+
"""
|
|
237
|
+
self._tool_configs = [tc for tc in self._tool_configs if tc.tool_alias != alias]
|
|
238
|
+
return self
|
|
239
|
+
|
|
187
240
|
def add_column(
|
|
188
241
|
self,
|
|
189
242
|
column_config: ColumnConfigT | None = None,
|
|
@@ -350,9 +403,14 @@ class DataDesignerConfigBuilder:
|
|
|
350
403
|
|
|
351
404
|
Returns:
|
|
352
405
|
The current Data Designer config object.
|
|
406
|
+
|
|
407
|
+
Raises:
|
|
408
|
+
BuilderConfigurationError: If any ToolConfig has duplicate tool names in its allow_tools list.
|
|
353
409
|
"""
|
|
410
|
+
self._validate_tool_configs_no_duplicates()
|
|
354
411
|
return DataDesignerConfig(
|
|
355
412
|
model_configs=self._model_configs,
|
|
413
|
+
tool_configs=self._tool_configs,
|
|
356
414
|
seed_config=self._seed_config,
|
|
357
415
|
columns=list(self._column_configs.values()),
|
|
358
416
|
constraints=self._constraints or None,
|
|
@@ -360,6 +418,31 @@ class DataDesignerConfigBuilder:
|
|
|
360
418
|
processors=self._processor_configs or None,
|
|
361
419
|
)
|
|
362
420
|
|
|
421
|
+
def _validate_tool_configs_no_duplicates(self) -> None:
|
|
422
|
+
"""Validate that no ToolConfig has duplicate tool names in its allow_tools list.
|
|
423
|
+
|
|
424
|
+
This is a static validation that catches obvious duplicates at config build time,
|
|
425
|
+
before providers are queried. Full validation (including duplicates across providers)
|
|
426
|
+
happens at resource provider creation time.
|
|
427
|
+
|
|
428
|
+
Raises:
|
|
429
|
+
BuilderConfigurationError: If any ToolConfig has duplicate tool names in allow_tools.
|
|
430
|
+
"""
|
|
431
|
+
for tool_config in self._tool_configs:
|
|
432
|
+
if tool_config.allow_tools is None:
|
|
433
|
+
continue
|
|
434
|
+
seen: set[str] = set()
|
|
435
|
+
duplicates: list[str] = []
|
|
436
|
+
for tool_name in tool_config.allow_tools:
|
|
437
|
+
if tool_name in seen:
|
|
438
|
+
duplicates.append(tool_name)
|
|
439
|
+
seen.add(tool_name)
|
|
440
|
+
if duplicates:
|
|
441
|
+
raise BuilderConfigurationError(
|
|
442
|
+
f"🛑 ToolConfig '{tool_config.tool_alias}' has duplicate tool names in allow_tools: "
|
|
443
|
+
f"{sorted(set(duplicates))!r}. Each tool name must be unique within a ToolConfig."
|
|
444
|
+
)
|
|
445
|
+
|
|
363
446
|
def delete_constraints(self, target_column: str) -> Self:
|
|
364
447
|
"""Delete all constraints for the given target column.
|
|
365
448
|
|
|
@@ -411,6 +494,23 @@ class DataDesignerConfigBuilder:
|
|
|
411
494
|
"""
|
|
412
495
|
return list(self._column_configs.values())
|
|
413
496
|
|
|
497
|
+
def get_tool_config(self, alias: str) -> ToolConfig:
|
|
498
|
+
"""Get a tool configuration by alias.
|
|
499
|
+
|
|
500
|
+
Args:
|
|
501
|
+
alias: The alias of the tool configuration to retrieve.
|
|
502
|
+
|
|
503
|
+
Returns:
|
|
504
|
+
The tool configuration object.
|
|
505
|
+
|
|
506
|
+
Raises:
|
|
507
|
+
KeyError: If no tool configuration with the given alias exists.
|
|
508
|
+
"""
|
|
509
|
+
for tc in self._tool_configs:
|
|
510
|
+
if tc.tool_alias == alias:
|
|
511
|
+
return tc
|
|
512
|
+
raise KeyError(f"No tool configuration with alias {alias!r} found")
|
|
513
|
+
|
|
414
514
|
def get_constraints(self, target_column: str) -> list[ColumnConstraintT]:
|
|
415
515
|
"""Get all constraints for the given target column.
|
|
416
516
|
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""User-facing utilities for custom column generation."""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import functools
|
|
9
|
+
import inspect
|
|
10
|
+
from typing import Any, Callable, TypeVar
|
|
11
|
+
|
|
12
|
+
F = TypeVar("F", bound=Callable[..., Any])
|
|
13
|
+
|
|
14
|
+
# Expected parameter names by position (first param validated as row/df at runtime based on strategy)
|
|
15
|
+
EXPECTED_PARAMS = ({"row", "df"}, {"generator_params"}, {"models"})
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def validate_generator_signature(fn: Callable[..., Any]) -> list[inspect.Parameter]:
|
|
19
|
+
"""Validate generator function signature. Returns positional params if valid."""
|
|
20
|
+
params = [
|
|
21
|
+
p
|
|
22
|
+
for p in inspect.signature(fn).parameters.values()
|
|
23
|
+
if p.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
|
|
24
|
+
]
|
|
25
|
+
n = len(params)
|
|
26
|
+
if n == 0 or n > 3:
|
|
27
|
+
raise TypeError(f"Generator '{fn.__name__}' must have 1-3 parameters, got {n}.")
|
|
28
|
+
for i, param in enumerate(params):
|
|
29
|
+
if param.name not in EXPECTED_PARAMS[i]:
|
|
30
|
+
expected = " or ".join(f"'{p}'" for p in sorted(EXPECTED_PARAMS[i]))
|
|
31
|
+
raise TypeError(f"Generator '{fn.__name__}' param {i + 1} must be {expected}, got '{param.name}'.")
|
|
32
|
+
return params
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def custom_column_generator(
|
|
36
|
+
required_columns: list[str] | None = None,
|
|
37
|
+
side_effect_columns: list[str] | None = None,
|
|
38
|
+
model_aliases: list[str] | None = None,
|
|
39
|
+
) -> Callable[[F], F]:
|
|
40
|
+
"""Decorator to define metadata for a custom column generator function.
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
required_columns: Columns that must exist before this column runs (DAG ordering).
|
|
44
|
+
side_effect_columns: Additional columns the function will create.
|
|
45
|
+
model_aliases: Model aliases to include in the `models` dict (required for LLM access).
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def decorator(fn: F) -> F:
|
|
49
|
+
validate_generator_signature(fn)
|
|
50
|
+
fn.custom_column_metadata = { # type: ignore[attr-defined]
|
|
51
|
+
"required_columns": required_columns or [],
|
|
52
|
+
"side_effect_columns": side_effect_columns or [],
|
|
53
|
+
"model_aliases": model_aliases or [],
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
@functools.wraps(fn)
|
|
57
|
+
def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
58
|
+
return fn(*args, **kwargs)
|
|
59
|
+
|
|
60
|
+
# Copy metadata to wrapper
|
|
61
|
+
wrapper.custom_column_metadata = fn.custom_column_metadata # type: ignore[attr-defined]
|
|
62
|
+
return wrapper # type: ignore[return-value]
|
|
63
|
+
|
|
64
|
+
return decorator
|
|
@@ -8,8 +8,9 @@ from typing import Annotated
|
|
|
8
8
|
from pydantic import Field
|
|
9
9
|
|
|
10
10
|
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
11
|
-
from data_designer.config.base import ExportableConfigBase
|
|
12
11
|
from data_designer.config.column_types import ColumnConfigT
|
|
12
|
+
from data_designer.config.exportable_config import ExportableConfigBase
|
|
13
|
+
from data_designer.config.mcp import ToolConfig
|
|
13
14
|
from data_designer.config.models import ModelConfig
|
|
14
15
|
from data_designer.config.processors import ProcessorConfigT
|
|
15
16
|
from data_designer.config.sampler_constraints import ColumnConstraintT
|
|
@@ -27,6 +28,8 @@ class DataDesignerConfig(ExportableConfigBase):
|
|
|
27
28
|
should be generated. Must contain at least one column.
|
|
28
29
|
model_configs: Optional list of model configurations for LLM-based generation.
|
|
29
30
|
Each model config defines the model, provider, and inference parameters.
|
|
31
|
+
tool_configs: Optional list of tool configurations for MCP tool calling.
|
|
32
|
+
Each tool config defines the provider, allowed tools, and execution limits.
|
|
30
33
|
seed_config: Optional seed dataset settings to use for generation.
|
|
31
34
|
constraints: Optional list of column constraints.
|
|
32
35
|
profilers: Optional list of column profilers for analyzing generated data characteristics.
|
|
@@ -34,6 +37,7 @@ class DataDesignerConfig(ExportableConfigBase):
|
|
|
34
37
|
|
|
35
38
|
columns: list[Annotated[ColumnConfigT, Field(discriminator="column_type")]] = Field(min_length=1)
|
|
36
39
|
model_configs: list[ModelConfig] | None = None
|
|
40
|
+
tool_configs: list[ToolConfig] | None = None
|
|
37
41
|
seed_config: SeedConfig | None = None
|
|
38
42
|
constraints: list[ColumnConstraintT] | None = None
|
|
39
43
|
profilers: list[ColumnProfilerConfigT] | None = None
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
from data_designer.config.base import ConfigBase
|
|
12
|
+
from data_designer.config.utils.io_helpers import serialize_data
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ExportableConfigBase(ConfigBase):
|
|
16
|
+
def to_dict(self) -> dict[str, Any]:
|
|
17
|
+
"""Convert the configuration to a dictionary.
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
A dictionary representation of the configuration using JSON-compatible
|
|
21
|
+
serialization.
|
|
22
|
+
"""
|
|
23
|
+
return self.model_dump(mode="json")
|
|
24
|
+
|
|
25
|
+
def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
|
|
26
|
+
"""Convert the configuration to a YAML string or file.
|
|
27
|
+
|
|
28
|
+
Args:
|
|
29
|
+
path: Optional file path to write the YAML to. If None, returns the
|
|
30
|
+
YAML string instead of writing to file.
|
|
31
|
+
indent: Number of spaces for YAML indentation. Defaults to 2.
|
|
32
|
+
**kwargs: Additional keyword arguments passed to yaml.dump().
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
The YAML string if path is None, otherwise None (file is written).
|
|
36
|
+
"""
|
|
37
|
+
yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
|
|
38
|
+
if path is None:
|
|
39
|
+
return yaml_str
|
|
40
|
+
with open(path, "w") as f:
|
|
41
|
+
f.write(yaml_str)
|
|
42
|
+
|
|
43
|
+
def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
|
|
44
|
+
"""Convert the configuration to a JSON string or file.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
path: Optional file path to write the JSON to. If None, returns the
|
|
48
|
+
JSON string instead of writing to file.
|
|
49
|
+
indent: Number of spaces for JSON indentation. Defaults to 2.
|
|
50
|
+
**kwargs: Additional keyword arguments passed to json.dumps().
|
|
51
|
+
|
|
52
|
+
Returns:
|
|
53
|
+
The JSON string if path is None, otherwise None (file is written).
|
|
54
|
+
"""
|
|
55
|
+
json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
|
|
56
|
+
if path is None:
|
|
57
|
+
return json_str
|
|
58
|
+
with open(path, "w") as f:
|
|
59
|
+
f.write(json_str)
|