data-designer-config 0.4.0rc2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -3,66 +3,28 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
- from abc import ABC, abstractmethod
7
- from typing import Annotated, Literal
6
+ from enum import Enum
7
+ from typing import Annotated, Any, Literal
8
8
 
9
- from pydantic import BaseModel, Discriminator, Field, model_validator
9
+ from pydantic import BaseModel, Discriminator, Field, field_serializer, field_validator, model_validator
10
10
  from typing_extensions import Self
11
11
 
12
- from data_designer.config.base import ConfigBase
12
+ from data_designer.config.base import ConfigBase, SingleColumnConfig
13
13
  from data_designer.config.errors import InvalidConfigError
14
14
  from data_designer.config.models import ImageContext
15
15
  from data_designer.config.sampler_params import SamplerParamsT, SamplerType
16
16
  from data_designer.config.utils.code_lang import CodeLang
17
- from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
17
+ from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
18
18
  from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
19
+ from data_designer.config.utils.trace_type import TraceType
19
20
  from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
20
21
 
21
22
 
22
- class SingleColumnConfig(ConfigBase, ABC):
23
- """Abstract base class for all single-column configuration types.
23
+ class GenerationStrategy(str, Enum):
24
+ """Strategy for custom column generation."""
24
25
 
25
- This class serves as the foundation for all column configurations in DataDesigner,
26
- defining shared fields and properties across all column types.
27
-
28
- Attributes:
29
- name: Unique name of the column to be generated.
30
- drop: If True, the column will be generated but removed from the final dataset.
31
- Useful for intermediate columns that are dependencies for other columns.
32
- column_type: Discriminator field that identifies the specific column type.
33
- Subclasses must override this field to specify the column type with a `Literal` value.
34
- """
35
-
36
- name: str
37
- drop: bool = False
38
- column_type: str
39
-
40
- @staticmethod
41
- def get_column_emoji() -> str:
42
- return "🎨"
43
-
44
- @property
45
- @abstractmethod
46
- def required_columns(self) -> list[str]:
47
- """Returns a list of column names that must exist before this column can be generated.
48
-
49
- Returns:
50
- List of column names that this column depends on. Empty list indicates
51
- no dependencies. Override in subclasses to specify dependencies.
52
- """
53
-
54
- @property
55
- @abstractmethod
56
- def side_effect_columns(self) -> list[str]:
57
- """Returns a list of additional columns that this column will create as a side effect.
58
-
59
- Some column types generate additional metadata or auxiliary columns alongside
60
- the primary column (e.g., reasoning traces for LLM columns).
61
-
62
- Returns:
63
- List of column names that this column will create as a side effect. Empty list
64
- indicates no side effect columns. Override in subclasses to specify side effects.
65
- """
26
+ CELL_BY_CELL = "cell_by_cell"
27
+ FULL_COLUMN = "full_column"
66
28
 
67
29
 
68
30
  class SamplerColumnConfig(SingleColumnConfig):
@@ -143,8 +105,8 @@ class LLMTextColumnConfig(SingleColumnConfig):
143
105
 
144
106
  LLM text columns generate free-form text content using language models via LiteLLM.
145
107
  Prompts support Jinja2 templating to reference values from other columns, enabling
146
- context-aware generation. The generated text can optionally include reasoning traces
147
- when models support extended thinking.
108
+ context-aware generation. The generated text can optionally include message traces
109
+ capturing the full conversation history.
148
110
 
149
111
  Attributes:
150
112
  prompt: Prompt template for text generation. Supports Jinja2 syntax to
@@ -159,6 +121,18 @@ class LLMTextColumnConfig(SingleColumnConfig):
159
121
  `LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
160
122
  multi_modal_context: Optional list of image contexts for multi-modal generation.
161
123
  Enables vision-capable models to generate text based on image inputs.
124
+ tool_alias: Optional alias of the tool configuration to use for MCP tool calls.
125
+ Must match a tool alias defined when initializing the DataDesignerConfigBuilder.
126
+ When provided, the model may call permitted tools during generation.
127
+ with_trace: Specifies what trace information to capture in a `{column_name}__trace`
128
+ column. Options are:
129
+ - `TraceType.NONE` (default): No trace is captured.
130
+ - `TraceType.LAST_MESSAGE`: Only the final assistant message is captured.
131
+ - `TraceType.ALL_MESSAGES`: Full conversation history (system/user/assistant/tool).
132
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content` column
133
+ containing only the reasoning_content from the final assistant response. This is
134
+ useful for models that expose chain-of-thought reasoning separately from the main
135
+ response. Defaults to False.
162
136
  column_type: Discriminator field, always "llm-text" for this configuration type.
163
137
  """
164
138
 
@@ -166,6 +140,9 @@ class LLMTextColumnConfig(SingleColumnConfig):
166
140
  model_alias: str
167
141
  system_prompt: str | None = None
168
142
  multi_modal_context: list[ImageContext] | None = None
143
+ tool_alias: str | None = None
144
+ with_trace: TraceType = TraceType.NONE
145
+ extract_reasoning_content: bool = False
169
146
  column_type: Literal["llm-text"] = "llm-text"
170
147
 
171
148
  @staticmethod
@@ -186,14 +163,20 @@ class LLMTextColumnConfig(SingleColumnConfig):
186
163
 
187
164
  @property
188
165
  def side_effect_columns(self) -> list[str]:
189
- """Returns the reasoning trace column, which may be generated alongside the main column.
166
+ """Returns side-effect columns that may be generated alongside the main column.
190
167
 
191
- Reasoning traces are only returned if the served model parses and returns reasoning content.
168
+ Side-effect columns include:
169
+ - `{name}__trace`: Generated when `with_trace` is not `TraceType.NONE` on the column
170
+ config.
171
+ - `{name}__reasoning_content`: Generated when `extract_reasoning_content=True`.
192
172
 
193
173
  Returns:
194
- List containing the reasoning trace column name.
174
+ List of side-effect column names.
195
175
  """
196
- return [f"{self.name}{REASONING_TRACE_COLUMN_POSTFIX}"]
176
+ return [
177
+ *([f"{self.name}{TRACE_COLUMN_POSTFIX}"] if self.with_trace != TraceType.NONE else []),
178
+ *([f"{self.name}{REASONING_CONTENT_COLUMN_POSTFIX}"] if self.extract_reasoning_content else []),
179
+ ]
197
180
 
198
181
  @model_validator(mode="after")
199
182
  def assert_prompt_valid_jinja(self) -> Self:
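
For illustration, a minimal sketch of how the new trace-related fields combine (the column name, prompt, and model alias are placeholders; import paths follow the modules shown in this diff):

    from data_designer.config.column_configs import LLMTextColumnConfig
    from data_designer.config.utils.trace_type import TraceType

    # Hypothetical column that records the full conversation and the model's
    # reasoning content alongside the generated answer.
    answer_column = LLMTextColumnConfig(
        name="answer",
        prompt="Answer the question: {{ question }}",
        model_alias="my-model",             # assumed model alias
        with_trace=TraceType.ALL_MESSAGES,  # capture system/user/assistant/tool messages
        extract_reasoning_content=True,     # adds a reasoning-content column
    )

    # Per the docstring above, side-effect columns follow the
    # `{name}__trace` / `{name}__reasoning_content` pattern:
    print(answer_column.side_effect_columns)
    # -> ['answer__trace', 'answer__reasoning_content']
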
@@ -216,7 +199,7 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
216
199
 
217
200
  Extends LLMTextColumnConfig to generate code snippets in specific programming languages
218
201
  or SQL dialects. The generated code is automatically extracted from markdown code blocks
219
- for the specified language. Inherits all prompt templating capabilities.
202
+ for the specified language. Inherits all prompt templating capabilities from LLMTextColumnConfig.
220
203
 
221
204
  Attributes:
222
205
  code_lang: Programming language or SQL dialect for code generation. Supported
@@ -224,6 +207,16 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
224
207
  "rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
225
208
  "sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
226
209
  column_type: Discriminator field, always "llm-code" for this configuration type.
210
+
211
+ Inherited Attributes:
212
+ prompt: Prompt template for code generation (supports Jinja2).
213
+ model_alias: Alias of the model configuration to use.
214
+ system_prompt: Optional system prompt (supports Jinja2).
215
+ multi_modal_context: Optional image contexts for multi-modal generation.
216
+ tool_alias: Optional tool configuration alias for MCP tool calls.
217
+ with_trace: Trace capture mode (a TraceType value); any option other than TraceType.NONE creates a `{column_name}__trace` column with message history.
218
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
219
+ column containing the reasoning content from the final assistant response.
227
220
  """
228
221
 
229
222
  code_lang: CodeLang
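
As a brief sketch of a code column (placeholder names; passing the language as a string assumes CodeLang is a string-valued enum that pydantic can coerce):

    from data_designer.config.column_configs import LLMCodeColumnConfig

    query_column = LLMCodeColumnConfig(
        name="query",
        prompt="Write a SQL query that answers: {{ question }}",
        model_alias="my-model",    # assumed model alias
        code_lang="sql:postgres",  # one of the identifiers listed above
    )
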
@@ -239,13 +232,24 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
239
232
 
240
233
  Extends LLMTextColumnConfig to generate structured data conforming to a specified schema.
241
234
  Uses JSON schema or Pydantic models to define the expected output structure, enabling
242
- type-safe and validated structured output generation. Inherits prompt templating capabilities.
235
+ type-safe and validated structured output generation. Inherits prompt templating capabilities
236
+ from LLMTextColumnConfig.
243
237
 
244
238
  Attributes:
245
239
  output_format: The schema defining the expected output structure. Can be either:
246
240
  - A Pydantic BaseModel class (recommended)
247
241
  - A JSON schema dictionary
248
242
  column_type: Discriminator field, always "llm-structured" for this configuration type.
243
+
244
+ Inherited Attributes:
245
+ prompt: Prompt template for structured generation (supports Jinja2).
246
+ model_alias: Alias of the model configuration to use.
247
+ system_prompt: Optional system prompt (supports Jinja2).
248
+ multi_modal_context: Optional image contexts for multi-modal generation.
249
+ tool_alias: Optional tool configuration alias for MCP tool calls.
250
+ with_trace: Trace capture mode (a TraceType value); any option other than TraceType.NONE creates a `{column_name}__trace` column with message history.
251
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
252
+ column containing the reasoning content from the final assistant response.
249
253
  """
250
254
 
251
255
  output_format: dict | type[BaseModel]
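
A hedged sketch of structured output using a Pydantic model as the schema (all names are placeholders):

    from pydantic import BaseModel

    from data_designer.config.column_configs import LLMStructuredColumnConfig

    class Person(BaseModel):
        full_name: str
        age: int

    person_column = LLMStructuredColumnConfig(
        name="person",
        prompt="Generate a fictional resident of {{ city }}.",
        model_alias="my-model",  # assumed model alias
        output_format=Person,    # Pydantic model (recommended) or a JSON schema dict
    )
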
@@ -293,13 +297,24 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
293
297
 
294
298
  Extends LLMTextColumnConfig to create judge columns that evaluate and score other
295
299
  generated content based on the defined criteria. Useful for quality assessment, preference
296
- ranking, and multi-dimensional evaluation of generated data.
300
+ ranking, and multi-dimensional evaluation of generated data. Inherits prompt templating
301
+ capabilities from LLMTextColumnConfig.
297
302
 
298
303
  Attributes:
299
304
  scores: List of Score objects defining the evaluation dimensions. Each score
300
305
  represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
301
306
  Must contain at least one score.
302
307
  column_type: Discriminator field, always "llm-judge" for this configuration type.
308
+
309
+ Inherited Attributes:
310
+ prompt: Prompt template for the judge evaluation (supports Jinja2).
311
+ model_alias: Alias of the model configuration to use.
312
+ system_prompt: Optional system prompt (supports Jinja2).
313
+ multi_modal_context: Optional image contexts for multi-modal generation.
314
+ tool_alias: Optional tool configuration alias for MCP tool calls.
315
+ with_trace: Trace capture mode (a TraceType value); any option other than TraceType.NONE creates a `{column_name}__trace` column with message history.
316
+ extract_reasoning_content: If True, creates a `{column_name}__reasoning_content`
317
+ column containing the reasoning content from the final assistant response.
303
318
  """
304
319
 
305
320
  scores: list[Score] = Field(..., min_length=1)
@@ -468,3 +483,80 @@ class EmbeddingColumnConfig(SingleColumnConfig):
468
483
  @property
469
484
  def side_effect_columns(self) -> list[str]:
470
485
  return []
486
+
487
+
488
+ class CustomColumnConfig(SingleColumnConfig):
489
+ """Configuration for custom user-defined column generators.
490
+
491
+ Custom columns allow users to provide their own generation logic via a callable function
492
+ decorated with `@custom_column_generator`. Two strategies are supported: cell_by_cell
493
+ (default, row-based) and full_column (batch-based with DataFrame access).
494
+
495
+ Attributes:
496
+ generator_function: A callable decorated with @custom_column_generator.
497
+ generation_strategy: "cell_by_cell" (row-based) or "full_column" (batch-based).
498
+ generator_params: Optional typed configuration object (Pydantic BaseModel) passed
499
+ as the second argument to the generator function.
500
+ column_type: Discriminator field, always "custom" for this configuration type.
501
+ """
502
+
503
+ generator_function: Any = Field(description="Function decorated with @custom_column_generator")
504
+ generation_strategy: GenerationStrategy = Field(
505
+ default=GenerationStrategy.CELL_BY_CELL,
506
+ description="Generation strategy: 'cell_by_cell' for row-based or 'full_column' for batch-based",
507
+ )
508
+ generator_params: BaseModel | None = Field(
509
+ default=None,
510
+ description="Optional typed configuration object passed as second argument to generator function",
511
+ )
512
+ column_type: Literal["custom"] = "custom"
513
+
514
+ @field_validator("generator_function")
515
+ @classmethod
516
+ def _validate_generator_function(cls, v: Any) -> Any:
517
+ if not callable(v):
518
+ raise ValueError("generator_function must be callable")
519
+ if not hasattr(v, "custom_column_metadata"):
520
+ raise ValueError("generator_function must be decorated with @custom_column_generator")
521
+ return v
522
+
523
+ @staticmethod
524
+ def get_column_emoji() -> str:
525
+ return "🔧"
526
+
527
+ @property
528
+ def required_columns(self) -> list[str]:
529
+ """Returns the columns required for custom generation (from decorator metadata)."""
530
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
531
+ return metadata.get("required_columns", [])
532
+
533
+ @property
534
+ def side_effect_columns(self) -> list[str]:
535
+ """Returns additional columns created by this generator (from decorator metadata)."""
536
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
537
+ return metadata.get("side_effect_columns", [])
538
+
539
+ @property
540
+ def model_aliases(self) -> list[str]:
541
+ """Returns model aliases for LLM access and health checks (from decorator metadata)."""
542
+ metadata = getattr(self.generator_function, "custom_column_metadata", {})
543
+ return metadata.get("model_aliases", [])
544
+
545
+ @field_serializer("generator_function")
546
+ def serialize_generator_function(self, v: Any) -> str:
547
+ return getattr(v, "__name__", repr(v))
548
+
549
+ @field_serializer("generator_params")
550
+ def serialize_generator_params(self, v: BaseModel | None) -> dict[str, Any] | None:
551
+ if v is None:
552
+ return None
553
+ return v.model_dump()
554
+
555
+ @model_validator(mode="after")
556
+ def validate_generator_function(self) -> Self:
557
+ if not callable(self.generator_function):
558
+ raise InvalidConfigError(
559
+ f"🛑 `generator_function` must be a callable for custom column '{self.name}'. "
560
+ f"Expected a function decorated with @custom_column_generator."
561
+ )
562
+ return self
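
For illustration, a sketch of wiring a decorated generator into the new CustomColumnConfig. The decorator's import path is not visible in this excerpt, so the import below is an assumption, as are the column and parameter names:

    from pydantic import BaseModel

    from data_designer.config.column_configs import CustomColumnConfig, GenerationStrategy
    from data_designer.config.custom_column_utils import custom_column_generator  # assumed path

    class PrefixParams(BaseModel):
        prefix: str = ">> "

    @custom_column_generator(required_columns=["title"])
    def make_summary(row, generator_params):
        # cell_by_cell strategy: called once per row with that row's values.
        return f"{generator_params.prefix}{row['title']}"

    summary_column = CustomColumnConfig(
        name="summary",
        generator_function=make_summary,
        generation_strategy=GenerationStrategy.CELL_BY_CELL,  # or "full_column"
        generator_params=PrefixParams(prefix="* "),
    )
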
@@ -6,6 +6,7 @@ from __future__ import annotations
6
6
  from typing_extensions import TypeAlias
7
7
 
8
8
  from data_designer.config.column_configs import (
9
+ CustomColumnConfig,
9
10
  EmbeddingColumnConfig,
10
11
  ExpressionColumnConfig,
11
12
  LLMCodeColumnConfig,
@@ -28,7 +29,8 @@ from data_designer.plugin_manager import PluginManager
28
29
  plugin_manager = PluginManager()
29
30
 
30
31
  ColumnConfigT: TypeAlias = (
31
- ExpressionColumnConfig
32
+ CustomColumnConfig
33
+ | ExpressionColumnConfig
32
34
  | LLMCodeColumnConfig
33
35
  | LLMJudgeColumnConfig
34
36
  | LLMStructuredColumnConfig
@@ -87,6 +89,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
87
89
  DataDesignerColumnType.EMBEDDING,
88
90
  DataDesignerColumnType.VALIDATION,
89
91
  DataDesignerColumnType.EXPRESSION,
92
+ DataDesignerColumnType.CUSTOM,
90
93
  ]
91
94
  display_order.extend(plugin_manager.get_plugin_column_types(DataDesignerColumnType))
92
95
  return display_order
@@ -129,6 +132,7 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
129
132
 
130
133
 
131
134
  _COLUMN_TYPE_CONFIG_CLS_MAP = {
135
+ DataDesignerColumnType.CUSTOM: CustomColumnConfig,
132
136
  DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
133
137
  DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
134
138
  DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
@@ -13,7 +13,6 @@ from pygments.lexers import PythonLexer
13
13
  from typing_extensions import Self
14
14
 
15
15
  from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
16
- from data_designer.config.base import ExportableConfigBase
17
16
  from data_designer.config.column_configs import SeedDatasetColumnConfig
18
17
  from data_designer.config.column_types import (
19
18
  ColumnConfigT,
@@ -25,6 +24,8 @@ from data_designer.config.data_designer_config import DataDesignerConfig
25
24
  from data_designer.config.dataset_builders import BuildStage
26
25
  from data_designer.config.default_model_settings import get_default_model_configs
27
26
  from data_designer.config.errors import BuilderConfigurationError, BuilderSerializationError, InvalidColumnTypeError
27
+ from data_designer.config.exportable_config import ExportableConfigBase
28
+ from data_designer.config.mcp import ToolConfig
28
29
  from data_designer.config.models import ModelConfig, load_model_configs
29
30
  from data_designer.config.processors import ProcessorConfigT, ProcessorType, get_processor_config_from_kwargs
30
31
  from data_designer.config.sampler_constraints import (
@@ -93,7 +94,10 @@ class DataDesignerConfigBuilder:
93
94
  json_config = json.loads(serialize_data(smart_load_yaml(config)))
94
95
  builder_config = BuilderConfig.model_validate(json_config)
95
96
 
96
- builder = cls(model_configs=builder_config.data_designer.model_configs)
97
+ builder = cls(
98
+ model_configs=builder_config.data_designer.model_configs,
99
+ tool_configs=builder_config.data_designer.tool_configs,
100
+ )
97
101
  data_designer_config = builder_config.data_designer
98
102
 
99
103
  for col in data_designer_config.columns:
@@ -111,7 +115,11 @@ class DataDesignerConfigBuilder:
111
115
 
112
116
  return builder
113
117
 
114
- def __init__(self, model_configs: list[ModelConfig] | str | Path | None = None):
118
+ def __init__(
119
+ self,
120
+ model_configs: list[ModelConfig] | str | Path | None = None,
121
+ tool_configs: list[ToolConfig] | None = None,
122
+ ):
115
123
  """Initialize a new DataDesignerConfigBuilder instance.
116
124
 
117
125
  Args:
@@ -119,9 +127,13 @@ class DataDesignerConfigBuilder:
119
127
  - None to use default model configurations in local mode
120
128
  - A list of ModelConfig objects
121
129
  - A string or Path to a model configuration file
130
+ tool_configs: Tool configurations for MCP tool calling. Can be:
131
+ - None if no tool configs are needed
132
+ - A list of ToolConfig objects
122
133
  """
123
134
  self._column_configs = {}
124
135
  self._model_configs = _load_model_configs(model_configs)
136
+ self._tool_configs: list[ToolConfig] = tool_configs or []
125
137
  self._processor_configs: list[ProcessorConfigT] = []
126
138
  self._seed_config: SeedConfig | None = None
127
139
  self._constraints: list[ColumnConstraintT] = []
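
For illustration, a hedged sketch of constructing the builder with the new `tool_configs` argument (ToolConfig fields beyond `tool_alias` and `allow_tools` are not shown in this excerpt, so the construction is an assumption):

    from data_designer.config.mcp import ToolConfig

    builder = DataDesignerConfigBuilder(
        model_configs=None,  # fall back to the default model configurations
        tool_configs=[
            ToolConfig(tool_alias="web", allow_tools=["search", "fetch"]),  # assumed fields only
        ],
    )
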
@@ -136,6 +148,15 @@ class DataDesignerConfigBuilder:
136
148
  """
137
149
  return self._model_configs
138
150
 
151
+ @property
152
+ def tool_configs(self) -> list[ToolConfig]:
153
+ """Get the tool configurations for this builder.
154
+
155
+ Returns:
156
+ A list of ToolConfig objects used for MCP tool calling.
157
+ """
158
+ return self._tool_configs
159
+
139
160
  @property
140
161
  def allowed_references(self) -> list[str]:
141
162
  """Get all referenceable variables allowed in prompt templates and expressions.
@@ -184,6 +205,38 @@ class DataDesignerConfigBuilder:
184
205
  )
185
206
  return self
186
207
 
208
+ def add_tool_config(self, tool_config: ToolConfig) -> Self:
209
+ """Add a tool configuration to the current Data Designer configuration.
210
+
211
+ Args:
212
+ tool_config: The tool configuration to add.
213
+
214
+ Returns:
215
+ The current Data Designer config builder instance.
216
+
217
+ Raises:
218
+ BuilderConfigurationError: If a tool configuration with the same alias already exists.
219
+ """
220
+ if tool_config.tool_alias in {tc.tool_alias for tc in self._tool_configs}:
221
+ raise BuilderConfigurationError(
222
+ f"Tool configuration with alias {tool_config.tool_alias} already exists. "
223
+ "Please delete the existing tool configuration or choose a different alias."
224
+ )
225
+ self._tool_configs.append(tool_config)
226
+ return self
227
+
228
+ def delete_tool_config(self, alias: str) -> Self:
229
+ """Delete a tool configuration from the current Data Designer configuration by alias.
230
+
231
+ Args:
232
+ alias: The alias of the tool configuration to delete.
233
+
234
+ Returns:
235
+ The current Data Designer config builder instance.
236
+ """
237
+ self._tool_configs = [tc for tc in self._tool_configs if tc.tool_alias != alias]
238
+ return self
239
+
187
240
  def add_column(
188
241
  self,
189
242
  column_config: ColumnConfigT | None = None,
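
Continuing the sketch above, the new tool-config helpers might be used like this (aliases are placeholders; note that `allow_tools` must not repeat a tool name, which is checked when the final config is built):

    # Aliases must be unique; re-adding "web" would raise BuilderConfigurationError.
    builder.add_tool_config(ToolConfig(tool_alias="files", allow_tools=None))

    web_tools = builder.get_tool_config("web")  # KeyError if the alias is unknown
    builder.delete_tool_config("files")         # silently drops the matching alias
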
@@ -350,9 +403,14 @@ class DataDesignerConfigBuilder:
350
403
 
351
404
  Returns:
352
405
  The current Data Designer config object.
406
+
407
+ Raises:
408
+ BuilderConfigurationError: If any ToolConfig has duplicate tool names in its allow_tools list.
353
409
  """
410
+ self._validate_tool_configs_no_duplicates()
354
411
  return DataDesignerConfig(
355
412
  model_configs=self._model_configs,
413
+ tool_configs=self._tool_configs,
356
414
  seed_config=self._seed_config,
357
415
  columns=list(self._column_configs.values()),
358
416
  constraints=self._constraints or None,
@@ -360,6 +418,31 @@ class DataDesignerConfigBuilder:
360
418
  processors=self._processor_configs or None,
361
419
  )
362
420
 
421
+ def _validate_tool_configs_no_duplicates(self) -> None:
422
+ """Validate that no ToolConfig has duplicate tool names in its allow_tools list.
423
+
424
+ This is a static validation that catches obvious duplicates at config build time,
425
+ before providers are queried. Full validation (including duplicates across providers)
426
+ happens at resource provider creation time.
427
+
428
+ Raises:
429
+ BuilderConfigurationError: If any ToolConfig has duplicate tool names in allow_tools.
430
+ """
431
+ for tool_config in self._tool_configs:
432
+ if tool_config.allow_tools is None:
433
+ continue
434
+ seen: set[str] = set()
435
+ duplicates: list[str] = []
436
+ for tool_name in tool_config.allow_tools:
437
+ if tool_name in seen:
438
+ duplicates.append(tool_name)
439
+ seen.add(tool_name)
440
+ if duplicates:
441
+ raise BuilderConfigurationError(
442
+ f"🛑 ToolConfig '{tool_config.tool_alias}' has duplicate tool names in allow_tools: "
443
+ f"{sorted(set(duplicates))!r}. Each tool name must be unique within a ToolConfig."
444
+ )
445
+
363
446
  def delete_constraints(self, target_column: str) -> Self:
364
447
  """Delete all constraints for the given target column.
365
448
 
@@ -411,6 +494,23 @@ class DataDesignerConfigBuilder:
411
494
  """
412
495
  return list(self._column_configs.values())
413
496
 
497
+ def get_tool_config(self, alias: str) -> ToolConfig:
498
+ """Get a tool configuration by alias.
499
+
500
+ Args:
501
+ alias: The alias of the tool configuration to retrieve.
502
+
503
+ Returns:
504
+ The tool configuration object.
505
+
506
+ Raises:
507
+ KeyError: If no tool configuration with the given alias exists.
508
+ """
509
+ for tc in self._tool_configs:
510
+ if tc.tool_alias == alias:
511
+ return tc
512
+ raise KeyError(f"No tool configuration with alias {alias!r} found")
513
+
414
514
  def get_constraints(self, target_column: str) -> list[ColumnConstraintT]:
415
515
  """Get all constraints for the given target column.
416
516
 
@@ -0,0 +1,64 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """User-facing utilities for custom column generation."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import functools
9
+ import inspect
10
+ from typing import Any, Callable, TypeVar
11
+
12
+ F = TypeVar("F", bound=Callable[..., Any])
13
+
14
+ # Expected parameter names by position (first param validated as row/df at runtime based on strategy)
15
+ EXPECTED_PARAMS = ({"row", "df"}, {"generator_params"}, {"models"})
16
+
17
+
18
+ def validate_generator_signature(fn: Callable[..., Any]) -> list[inspect.Parameter]:
19
+ """Validate generator function signature. Returns positional params if valid."""
20
+ params = [
21
+ p
22
+ for p in inspect.signature(fn).parameters.values()
23
+ if p.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
24
+ ]
25
+ n = len(params)
26
+ if n == 0 or n > 3:
27
+ raise TypeError(f"Generator '{fn.__name__}' must have 1-3 parameters, got {n}.")
28
+ for i, param in enumerate(params):
29
+ if param.name not in EXPECTED_PARAMS[i]:
30
+ expected = " or ".join(f"'{p}'" for p in sorted(EXPECTED_PARAMS[i]))
31
+ raise TypeError(f"Generator '{fn.__name__}' param {i + 1} must be {expected}, got '{param.name}'.")
32
+ return params
33
+
34
+
35
+ def custom_column_generator(
36
+ required_columns: list[str] | None = None,
37
+ side_effect_columns: list[str] | None = None,
38
+ model_aliases: list[str] | None = None,
39
+ ) -> Callable[[F], F]:
40
+ """Decorator to define metadata for a custom column generator function.
41
+
42
+ Args:
43
+ required_columns: Columns that must exist before this column runs (DAG ordering).
44
+ side_effect_columns: Additional columns the function will create.
45
+ model_aliases: Model aliases to include in the `models` dict (required for LLM access).
46
+ """
47
+
48
+ def decorator(fn: F) -> F:
49
+ validate_generator_signature(fn)
50
+ fn.custom_column_metadata = { # type: ignore[attr-defined]
51
+ "required_columns": required_columns or [],
52
+ "side_effect_columns": side_effect_columns or [],
53
+ "model_aliases": model_aliases or [],
54
+ }
55
+
56
+ @functools.wraps(fn)
57
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
58
+ return fn(*args, **kwargs)
59
+
60
+ # Copy metadata to wrapper
61
+ wrapper.custom_column_metadata = fn.custom_column_metadata # type: ignore[attr-defined]
62
+ return wrapper # type: ignore[return-value]
63
+
64
+ return decorator
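
As an illustrative sketch of the signature rules enforced above (positional parameters must be named `row` or `df`, then `generator_params`, then `models`); the pandas usage and the return contract are assumptions, since this excerpt does not show how the runner consumes the result:

    import pandas as pd

    @custom_column_generator(
        required_columns=["question"],
        side_effect_columns=["question_length"],
        model_aliases=["my-model"],  # assumed alias, surfaced to the function via `models`
    )
    def question_stats(df: pd.DataFrame, generator_params, models):
        # full_column strategy: operates on the whole batch at once.
        df["question_length"] = df["question"].str.len()
        return df
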
@@ -8,8 +8,9 @@ from typing import Annotated
8
8
  from pydantic import Field
9
9
 
10
10
  from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
11
- from data_designer.config.base import ExportableConfigBase
12
11
  from data_designer.config.column_types import ColumnConfigT
12
+ from data_designer.config.exportable_config import ExportableConfigBase
13
+ from data_designer.config.mcp import ToolConfig
13
14
  from data_designer.config.models import ModelConfig
14
15
  from data_designer.config.processors import ProcessorConfigT
15
16
  from data_designer.config.sampler_constraints import ColumnConstraintT
@@ -27,6 +28,8 @@ class DataDesignerConfig(ExportableConfigBase):
27
28
  should be generated. Must contain at least one column.
28
29
  model_configs: Optional list of model configurations for LLM-based generation.
29
30
  Each model config defines the model, provider, and inference parameters.
31
+ tool_configs: Optional list of tool configurations for MCP tool calling.
32
+ Each tool config defines the provider, allowed tools, and execution limits.
30
33
  seed_config: Optional seed dataset settings to use for generation.
31
34
  constraints: Optional list of column constraints.
32
35
  profilers: Optional list of column profilers for analyzing generated data characteristics.
@@ -34,6 +37,7 @@ class DataDesignerConfig(ExportableConfigBase):
34
37
 
35
38
  columns: list[Annotated[ColumnConfigT, Field(discriminator="column_type")]] = Field(min_length=1)
36
39
  model_configs: list[ModelConfig] | None = None
40
+ tool_configs: list[ToolConfig] | None = None
37
41
  seed_config: SeedConfig | None = None
38
42
  constraints: list[ColumnConstraintT] | None = None
39
43
  profilers: list[ColumnProfilerConfigT] | None = None
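
Tying it together, a hedged sketch of a DataDesignerConfig carrying the new `tool_configs` field (the column and tool objects are the placeholders from the earlier sketches):

    config = DataDesignerConfig(
        columns=[answer_column],  # at least one column is required
        tool_configs=[ToolConfig(tool_alias="web", allow_tools=["search"])],
    )
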
@@ -0,0 +1,59 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+
11
+ from data_designer.config.base import ConfigBase
12
+ from data_designer.config.utils.io_helpers import serialize_data
13
+
14
+
15
+ class ExportableConfigBase(ConfigBase):
16
+ def to_dict(self) -> dict[str, Any]:
17
+ """Convert the configuration to a dictionary.
18
+
19
+ Returns:
20
+ A dictionary representation of the configuration using JSON-compatible
21
+ serialization.
22
+ """
23
+ return self.model_dump(mode="json")
24
+
25
+ def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
26
+ """Convert the configuration to a YAML string or file.
27
+
28
+ Args:
29
+ path: Optional file path to write the YAML to. If None, returns the
30
+ YAML string instead of writing to file.
31
+ indent: Number of spaces for YAML indentation. Defaults to 2.
32
+ **kwargs: Additional keyword arguments passed to yaml.dump().
33
+
34
+ Returns:
35
+ The YAML string if path is None, otherwise None (file is written).
36
+ """
37
+ yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
38
+ if path is None:
39
+ return yaml_str
40
+ with open(path, "w") as f:
41
+ f.write(yaml_str)
42
+
43
+ def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
44
+ """Convert the configuration to a JSON string or file.
45
+
46
+ Args:
47
+ path: Optional file path to write the JSON to. If None, returns the
48
+ JSON string instead of writing to file.
49
+ indent: Number of spaces for JSON indentation. Defaults to 2.
50
+ **kwargs: Additional keyword arguments passed to json.dumps().
51
+
52
+ Returns:
53
+ The JSON string if path is None, otherwise None (file is written).
54
+ """
55
+ json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
56
+ if path is None:
57
+ return json_str
58
+ with open(path, "w") as f:
59
+ f.write(json_str)
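
For illustration, the export helpers inherited by DataDesignerConfig (and any other ExportableConfigBase subclass) might be used as follows; the `config` object is the placeholder from the sketch above:

    # In-memory exports
    as_dict = config.to_dict()
    as_yaml = config.to_yaml()          # returns the YAML string when no path is given
    as_json = config.to_json(indent=4)  # returns the JSON string when no path is given

    # Write directly to disk (returns None when a path is supplied)
    config.to_yaml("data_designer_config.yaml")
    config.to_json("data_designer_config.json")
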