data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0rc1.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
@@ -1,470 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from abc import ABC, abstractmethod
7
- from typing import Annotated, Literal
8
-
9
- from pydantic import BaseModel, Discriminator, Field, model_validator
10
- from typing_extensions import Self
11
-
12
- from data_designer.config.base import ConfigBase
13
- from data_designer.config.errors import InvalidConfigError
14
- from data_designer.config.models import ImageContext
15
- from data_designer.config.sampler_params import SamplerParamsT, SamplerType
16
- from data_designer.config.utils.code_lang import CodeLang
17
- from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
18
- from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
19
- from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
20
-
21
-
22
class SingleColumnConfig(ConfigBase, ABC):
    """Abstract base class for all single-column configuration types.

    This class serves as the foundation for all column configurations in DataDesigner,
    defining shared fields and properties across all column types.

    Attributes:
        name: Unique name of the column to be generated.
        drop: If True, the column will be generated but removed from the final dataset.
            Useful for intermediate columns that are dependencies for other columns.
        column_type: Discriminator field that identifies the specific column type.
            Subclasses must override this field to specify the column type with a `Literal` value.
    """

    name: str
    drop: bool = False
    # Typed as a plain `str` on the base; every concrete subclass in this module
    # narrows it to a single `Literal[...]` value with a matching default.
    column_type: str

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with this column type (base default)."""
        return "🎨"

    @property
    @abstractmethod
    def required_columns(self) -> list[str]:
        """Returns a list of column names that must exist before this column can be generated.

        Returns:
            List of column names that this column depends on. Empty list indicates
            no dependencies. Override in subclasses to specify dependencies.
        """

    @property
    @abstractmethod
    def side_effect_columns(self) -> list[str]:
        """Returns a list of additional columns that this column will create as a side effect.

        Some column types generate additional metadata or auxiliary columns alongside
        the primary column (e.g., reasoning traces for LLM columns).

        Returns:
            List of column names that this column will create as a side effect. Empty list
            indicates no side effect columns. Override in subclasses to specify side effects.
        """
66
-
67
-
68
class SamplerColumnConfig(SingleColumnConfig):
    """Configuration for columns generated using numerical samplers.

    Sampler columns provide efficient data generation using numerical samplers for
    common data types and distributions. Supported samplers include UUID generation,
    datetime/timedelta sampling, person generation, category / subcategory sampling,
    and various statistical distributions (uniform, gaussian, binomial, poisson, scipy).

    Attributes:
        sampler_type: Type of sampler to use. Available types include:
            "uuid", "category", "subcategory", "uniform", "gaussian", "bernoulli",
            "bernoulli_mixture", "binomial", "poisson", "scipy", "person", "datetime", "timedelta".
        params: Parameters specific to the chosen sampler type. Type varies based on the `sampler_type`
            (e.g., `CategorySamplerParams`, `UniformSamplerParams`, `PersonSamplerParams`).
        conditional_params: Optional dictionary for conditional parameters. The dict keys
            are the conditions that must be met (e.g., "age > 21") for the conditional parameters
            to be used. The values of dict are the parameters to use when the condition is met.
        convert_to: Optional type conversion to apply after sampling. Must be one of "float", "int", or "str".
            Useful for converting numerical samples to strings or other types.
        column_type: Discriminator field, always "sampler" for this configuration type.

    !!! tip "Displaying available samplers and their parameters"
        The config builder has an `info` attribute that can be used to display the
        available samplers and their parameters:
        ```python
        config_builder.info.display("samplers")
        ```
    """

    sampler_type: SamplerType
    params: Annotated[SamplerParamsT, Discriminator("sampler_type")]
    conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {}
    convert_to: str | None = None
    column_type: Literal["sampler"] = "sampler"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with sampler columns."""
        return "🎲"

    @property
    def required_columns(self) -> list[str]:
        """Sampler columns declare no dependencies on other columns."""
        return []

    @property
    def side_effect_columns(self) -> list[str]:
        """Sampler columns create no additional columns."""
        return []

    @model_validator(mode="before")
    @classmethod
    def inject_sampler_type_into_params(cls, data: dict) -> dict:
        """Inject sampler_type into params dicts to enable discriminated union resolution.

        This allows users to pass `params` (and each `conditional_params` value) as a
        simple dict without the `sampler_type` field, which is added automatically from
        the outer `sampler_type` field.

        The input mapping and its nested dicts are never modified in place: a caller
        that reuses the same dict (for example across several `model_validate` calls)
        keeps its original, un-injected data.

        Args:
            data: Raw input handed to the model before field validation. Anything that
                is not a dict is passed through untouched.

        Returns:
            `data` itself when nothing needs injecting, otherwise a shallow copy with
            `sampler_type` added to every params dict that lacked it.
        """
        if not isinstance(data, dict):
            return data

        sampler_type = data.get("sampler_type")
        if not sampler_type:
            # Without an outer sampler_type there is nothing meaningful to inject;
            # field validation reports the missing required field.
            return data

        # Shallow copy so that rebinding keys below does not leak into the caller's dict.
        patched = dict(data)

        params = patched.get("params")
        if isinstance(params, dict) and "sampler_type" not in params:
            patched["params"] = {"sampler_type": sampler_type, **params}

        # Handle conditional_params the same way, building a fresh mapping instead of
        # writing into the caller-owned nested dict.
        conditional_params = patched.get("conditional_params")
        if conditional_params and isinstance(conditional_params, dict):
            patched["conditional_params"] = {
                condition: (
                    {"sampler_type": sampler_type, **cond_params}
                    if isinstance(cond_params, dict) and "sampler_type" not in cond_params
                    else cond_params
                )
                for condition, cond_params in conditional_params.items()
            }

        return patched
139
-
140
-
141
class LLMTextColumnConfig(SingleColumnConfig):
    """Column configuration for free-form text produced by a Large Language Model.

    The text is generated by a language model via LiteLLM. Both prompts are Jinja2
    templates, so they can pull in values from other columns for context-aware
    generation. When the model supports extended thinking, a reasoning trace may be
    emitted next to the generated text.

    Attributes:
        prompt: Prompt template for text generation. Supports Jinja2 syntax to
            reference other columns (e.g., "Write a story about {{ character_name }}").
            Must be a valid Jinja2 template.
        model_alias: Alias of the model configuration to use for generation.
            Must match a model alias defined when initializing the DataDesignerConfigBuilder.
        system_prompt: Optional system prompt to set model behavior and constraints.
            Also supports Jinja2 templating. If provided, must be a valid Jinja2 template.
            Do not put any output parsing instructions in the system prompt. Instead,
            use the appropriate column type for the output you want to generate - e.g.,
            `LLMStructuredColumnConfig` for structured output, `LLMCodeColumnConfig` for code.
        multi_modal_context: Optional list of image contexts for multi-modal generation.
            Enables vision-capable models to generate text based on image inputs.
        column_type: Discriminator field, always "llm-text" for this configuration type.
    """

    prompt: str
    model_alias: str
    system_prompt: str | None = None
    multi_modal_context: list[ImageContext] | None = None
    column_type: Literal["llm-text"] = "llm-text"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with LLM text columns."""
        return "📝"

    @property
    def required_columns(self) -> list[str]:
        """Collect the column names referenced by the prompt templates.

        Returns:
            De-duplicated list of names found in `prompt` and, when set, `system_prompt`.
        """
        templates = [self.prompt]
        if self.system_prompt:
            templates.append(self.system_prompt)

        referenced: list[str] = []
        for template in templates:
            referenced.extend(extract_keywords_from_jinja2_template(template))
        return list(set(referenced))

    @property
    def side_effect_columns(self) -> list[str]:
        """Name the reasoning-trace column that can accompany the main column.

        A trace is only produced when the served model parses and returns reasoning content.

        Returns:
            Single-element list holding the reasoning trace column name.
        """
        trace_column = f"{self.name}{REASONING_TRACE_COLUMN_POSTFIX}"
        return [trace_column]

    @model_validator(mode="after")
    def assert_prompt_valid_jinja(self) -> Self:
        """Check that `prompt` and, when set, `system_prompt` are valid Jinja2 templates.

        Returns:
            The validated instance.

        Raises:
            InvalidConfigError: If prompt or system_prompt contains invalid Jinja2 syntax.
        """
        assert_valid_jinja2_template(self.prompt)
        if not self.system_prompt:
            return self
        assert_valid_jinja2_template(self.system_prompt)
        return self
212
-
213
-
214
class LLMCodeColumnConfig(LLMTextColumnConfig):
    """Configuration for code generation columns using Large Language Models.

    Extends LLMTextColumnConfig to generate code snippets in specific programming languages
    or SQL dialects. The generated code is automatically extracted from markdown code blocks
    for the specified language. Inherits all prompt templating capabilities.

    Attributes:
        code_lang: Programming language or SQL dialect for code generation. Supported
            values include: "python", "javascript", "typescript", "java", "kotlin", "go",
            "rust", "ruby", "scala", "swift", "sql:sqlite", "sql:postgres", "sql:mysql",
            "sql:tsql", "sql:bigquery", "sql:ansi". See CodeLang enum for complete list.
        column_type: Discriminator field, always "llm-code" for this configuration type.
    """

    # Required: there is no default language.
    code_lang: CodeLang
    column_type: Literal["llm-code"] = "llm-code"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with LLM code columns."""
        return "💻"
235
-
236
-
237
class LLMStructuredColumnConfig(LLMTextColumnConfig):
    """Column configuration for structured JSON output produced by a Large Language Model.

    Builds on LLMTextColumnConfig to generate data that conforms to a given schema.
    The expected structure is described with a JSON schema or a Pydantic model, which
    enables type-safe, validated structured output. Prompt templating is inherited.

    Attributes:
        output_format: The schema defining the expected output structure. Can be either:
            - A Pydantic BaseModel class (recommended)
            - A JSON schema dictionary
        column_type: Discriminator field, always "llm-structured" for this configuration type.
    """

    output_format: dict | type[BaseModel]
    column_type: Literal["llm-structured"] = "llm-structured"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with LLM structured columns."""
        return "🗂️"

    @model_validator(mode="after")
    def validate_output_format(self) -> Self:
        """Normalize `output_format` to a JSON schema dict.

        A dict is left untouched; a Pydantic model class is replaced by its
        `model_json_schema()` output.

        Returns:
            The validated instance with output_format as a JSON schema dict.
        """
        schema_source = self.output_format
        if isinstance(schema_source, dict):
            return self
        if issubclass(schema_source, BaseModel):
            self.output_format = schema_source.model_json_schema()
        return self
268
-
269
-
270
class Score(ConfigBase):
    """Configuration for a "score" in an LLM judge evaluation.

    Defines a single scoring criterion with its possible values and descriptions. Multiple
    Score objects can be combined in an LLMJudgeColumnConfig to create multi-dimensional
    quality assessments.

    Attributes:
        name: A clear, concise name for this scoring dimension (e.g., "Relevance", "Fluency").
        description: An informative and detailed assessment guide explaining how to evaluate
            this dimension. Should provide clear criteria for scoring.
        options: Dictionary mapping score values to their descriptions. Keys can be integers
            (e.g., 1-5 scale) or strings (e.g., "Poor", "Good", "Excellent"). Values are
            descriptions explaining what each score level means.
    """

    # All three fields are required (`...` as the Field default).
    name: str = Field(..., description="A clear name for this score.")
    description: str = Field(..., description="An informative and detailed assessment guide for using this score.")
    options: dict[int | str, str] = Field(..., description="Score options in the format of {score: description}.")
289
-
290
-
291
class LLMJudgeColumnConfig(LLMTextColumnConfig):
    """Configuration for LLM-as-a-judge quality assessment and scoring columns.

    Extends LLMTextColumnConfig to create judge columns that evaluate and score other
    generated content based on the defined criteria. Useful for quality assessment, preference
    ranking, and multi-dimensional evaluation of generated data.

    Attributes:
        scores: List of Score objects defining the evaluation dimensions. Each score
            represents a different aspect to evaluate (e.g., accuracy, relevance, fluency).
            Must contain at least one score.
        column_type: Discriminator field, always "llm-judge" for this configuration type.
    """

    # Required; `min_length=1` rejects an empty list of scores.
    scores: list[Score] = Field(..., min_length=1)
    column_type: Literal["llm-judge"] = "llm-judge"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with LLM judge columns."""
        return "⚖️"
311
-
312
-
313
class ExpressionColumnConfig(SingleColumnConfig):
    """Configuration for derived columns using Jinja2 expressions.

    Expression columns compute values by evaluating Jinja2 templates that reference other
    columns. Useful for transformations, concatenations, conditional logic, and derived
    features without requiring LLM generation. The expression is evaluated row-by-row.

    Attributes:
        expr: Jinja2 expression to evaluate. Can reference other column values using
            {{ column_name }} syntax. Supports filters, conditionals, and arithmetic.
            Must be a valid, non-empty Jinja2 template.
        dtype: Data type to cast the result to. Must be one of "int", "float", "str", or "bool".
            Defaults to "str". Type conversion is applied after expression evaluation.
        column_type: Discriminator field, always "expression" for this configuration type.
    """

    name: str
    expr: str
    dtype: Literal["int", "float", "str", "bool"] = "str"
    column_type: Literal["expression"] = "expression"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with expression columns."""
        return "🧩"

    @property
    def required_columns(self) -> list[str]:
        """Returns the columns referenced in the expression template."""
        return list(extract_keywords_from_jinja2_template(self.expr))

    @property
    def side_effect_columns(self) -> list[str]:
        """Expression columns create no additional columns."""
        return []

    @model_validator(mode="after")
    def assert_expression_valid_jinja(self) -> Self:
        """Validate that the expression is a valid, non-empty Jinja2 template.

        Returns:
            The validated instance.

        Raises:
            InvalidConfigError: If expression is empty or contains invalid Jinja2 syntax.
        """
        if not self.expr.strip():
            raise InvalidConfigError(
                f"🛑 Expression column '{self.name}' has an empty or whitespace-only expression. "
                # Deliberately a plain string, not an f-string: inside an f-string `{{`/`}}`
                # collapse to single braces, so the example would read '{ column_name }'
                # instead of showing real Jinja2 syntax.
                "Please provide a valid Jinja2 expression (e.g., '{{ column_name }}' or '{{ col1 }} + {{ col2 }}') "
                "or remove this column if not needed."
            )
        assert_valid_jinja2_template(self.expr)
        return self
365
-
366
-
367
class ValidationColumnConfig(SingleColumnConfig):
    """Configuration for validation columns that validate existing columns.

    Validation columns execute validation logic against specified target columns and return
    structured results indicating pass/fail status with validation details. Supports multiple
    validation strategies: code execution (Python/SQL), local callable functions (library only),
    and remote HTTP endpoints.

    Attributes:
        target_columns: List of column names to validate. These columns are passed to the
            validator for validation. All target columns must exist in the dataset
            before validation runs.
        validator_type: The type of validator to use. Options:
            - "code": Execute code (Python or SQL) for validation. The code receives a
              DataFrame with target columns and must return a DataFrame with validation results.
            - "local_callable": Call a local Python function with the data. Only supported
              when running DataDesigner locally.
            - "remote": Send data to a remote HTTP endpoint for validation.
        validator_params: Parameters specific to the validator type. Type varies by validator:
            - CodeValidatorParams: Specifies code language (python or SQL dialect like
              "sql:postgres", "sql:mysql").
            - LocalCallableValidatorParams: Provides validation function (Callable[[pd.DataFrame],
              pd.DataFrame]) and optional output schema for validation results.
            - RemoteValidatorParams: Configures endpoint URL, HTTP timeout, retry behavior
              (max_retries, retry_backoff), and parallel request limits (max_parallel_requests).
        batch_size: Number of records to process in each validation batch. Defaults to 10.
            Larger batches are more efficient but use more memory. Adjust based on validator
            complexity and available resources.
        column_type: Discriminator field, always "validation" for this configuration type.
    """

    target_columns: list[str]
    validator_type: ValidatorType
    validator_params: ValidatorParamsT
    # `ge=1` rejects zero or negative batch sizes at validation time.
    batch_size: int = Field(default=10, ge=1, description="Number of records to process in each batch")
    column_type: Literal["validation"] = "validation"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with validation columns."""
        return "🔍"

    @property
    def required_columns(self) -> list[str]:
        """Returns the columns that need to be validated."""
        # Returns the `target_columns` list itself, not a copy.
        return self.target_columns

    @property
    def side_effect_columns(self) -> list[str]:
        """Validation columns create no additional columns."""
        return []
416
-
417
-
418
class SeedDatasetColumnConfig(SingleColumnConfig):
    """Configuration for columns sourced from seed datasets.

    This config marks columns that come from seed data. It is typically created
    automatically when calling `with_seed_dataset()` on the builder, rather than
    being instantiated directly by users.

    Attributes:
        column_type: Discriminator field, always "seed-dataset" for this configuration type.
    """

    column_type: Literal["seed-dataset"] = "seed-dataset"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with seed dataset columns."""
        return "🌱"

    @property
    def required_columns(self) -> list[str]:
        """Seed dataset columns declare no dependencies on other columns."""
        return []

    @property
    def side_effect_columns(self) -> list[str]:
        """Seed dataset columns create no additional columns."""
        return []
442
-
443
-
444
class EmbeddingColumnConfig(SingleColumnConfig):
    """Configuration for embedding generation columns.

    Embedding columns generate embeddings for text input using a specified model.

    Attributes:
        target_column: The column to generate embeddings for. The column could be a single
            text string or a list of text strings in stringified JSON format. If it is a list
            of text strings in stringified JSON format, the embeddings will be generated for
            each text string.
        model_alias: The model to use for embedding generation.
        column_type: Discriminator field, always "embedding" for this configuration type.
    """

    target_column: str
    model_alias: str
    column_type: Literal["embedding"] = "embedding"

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji associated with embedding columns."""
        return "🧬"

    @property
    def required_columns(self) -> list[str]:
        """Returns a one-element list holding the column to embed."""
        return [self.target_column]

    @property
    def side_effect_columns(self) -> list[str]:
        """Embedding columns create no additional columns."""
        return []
@@ -1,141 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from typing_extensions import TypeAlias
7
-
8
- from data_designer.config.column_configs import (
9
- EmbeddingColumnConfig,
10
- ExpressionColumnConfig,
11
- LLMCodeColumnConfig,
12
- LLMJudgeColumnConfig,
13
- LLMStructuredColumnConfig,
14
- LLMTextColumnConfig,
15
- SamplerColumnConfig,
16
- SeedDatasetColumnConfig,
17
- ValidationColumnConfig,
18
- )
19
- from data_designer.config.errors import InvalidConfigError
20
- from data_designer.config.sampler_params import SamplerType
21
- from data_designer.config.utils.type_helpers import (
22
- SAMPLER_PARAMS,
23
- create_str_enum_from_discriminated_type_union,
24
- resolve_string_enum,
25
- )
26
- from data_designer.plugin_manager import PluginManager
27
-
28
# Module-level plugin manager shared by the helpers in this module.
plugin_manager = PluginManager()

# Union of the built-in column config classes imported above.
ColumnConfigT: TypeAlias = (
    ExpressionColumnConfig
    | LLMCodeColumnConfig
    | LLMJudgeColumnConfig
    | LLMStructuredColumnConfig
    | LLMTextColumnConfig
    | SamplerColumnConfig
    | SeedDatasetColumnConfig
    | ValidationColumnConfig
    | EmbeddingColumnConfig
)
# Rebind the alias to whatever the plugin manager returns for the built-in union.
# NOTE(review): presumably this adds plugin-provided config classes to the union; confirm in PluginManager.
ColumnConfigT = plugin_manager.inject_into_column_config_type_union(ColumnConfigT)

# Enum of column types derived from the (possibly plugin-extended) union, using
# each config class's "column_type" field as the discriminator.
DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
    enum_name="DataDesignerColumnType",
    type_union=ColumnConfigT,
    discriminator_field_name="column_type",
)
48
-
49
-
50
def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
    """Build the Data Designer column config matching ``column_type`` from keyword arguments.

    Args:
        name: Name of the column.
        column_type: Type of the column; it is passed through
            `resolve_string_enum` before use.
        **kwargs: Keyword arguments forwarded to the config class constructor.
            For sampler columns they are first processed by
            `_resolve_sampler_kwargs`.

    Returns:
        An instance of the config class selected for ``column_type``.
    """
    resolved_type = resolve_string_enum(column_type, DataDesignerColumnType)
    constructor = get_column_config_cls_from_type(resolved_type)

    # Sampler columns get their `sampler_type` / `params` resolved up front.
    is_sampler = resolved_type == DataDesignerColumnType.SAMPLER
    constructor_kwargs = _resolve_sampler_kwargs(name, kwargs) if is_sampler else kwargs

    return constructor(name=name, **constructor_kwargs)
66
-
67
-
68
def get_column_config_cls_from_type(column_type: DataDesignerColumnType) -> type[ColumnConfigT]:
    """Look up the column config class registered for a column type.

    The built-in mapping is consulted first; if the type is not there, the
    plugin manager is asked for a column generator plugin with that type value.

    Raises:
        InvalidConfigError: If neither the built-in mapping nor a plugin
            provides a config class for ``column_type``.
    """
    resolved_type = resolve_string_enum(column_type, DataDesignerColumnType)

    builtin_cls = _COLUMN_TYPE_CONFIG_CLS_MAP.get(resolved_type)
    if builtin_cls is not None:
        return builtin_cls

    plugin = plugin_manager.get_column_generator_plugin_if_exists(resolved_type.value)
    if plugin:
        return plugin.config_cls

    raise InvalidConfigError(f"🛑 {resolved_type} is not a valid column type.")
76
-
77
-
78
def get_column_display_order() -> list[DataDesignerColumnType]:
    """Return the column types in their preferred display order.

    Built-in types come first in a fixed order, followed by the column types
    reported by the plugin manager.
    """
    return [
        DataDesignerColumnType.SEED_DATASET,
        DataDesignerColumnType.SAMPLER,
        DataDesignerColumnType.LLM_TEXT,
        DataDesignerColumnType.LLM_CODE,
        DataDesignerColumnType.LLM_STRUCTURED,
        DataDesignerColumnType.LLM_JUDGE,
        DataDesignerColumnType.EMBEDDING,
        DataDesignerColumnType.VALIDATION,
        DataDesignerColumnType.EXPRESSION,
        # Plugin-provided column types are appended after the built-ins.
        *plugin_manager.get_plugin_column_types(DataDesignerColumnType),
    ]
93
-
94
-
95
def get_column_emoji_from_type(column_type: DataDesignerColumnType) -> str:
    """Return the emoji of the config class associated with ``column_type``."""
    resolved_type = resolve_string_enum(column_type, DataDesignerColumnType)
    config_cls = get_column_config_cls_from_type(resolved_type)
    return config_cls.get_column_emoji()
99
-
100
-
101
def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
    """Resolve the `sampler_type` and `params` entries of sampler column kwargs.

    Args:
        name: Name of the sampler column; used only in error messages.
        kwargs: Raw keyword arguments for the sampler column. Must contain
            `sampler_type`; `params` is optional and defaults to an empty dict.

    Returns:
        A new dict with the resolved `sampler_type` and a `params` object of
        the class registered for that sampler type, followed by all remaining
        entries of ``kwargs`` unchanged.

    Raises:
        InvalidConfigError: If `sampler_type` is missing, or `params` is
            neither a dict nor an instance of the expected params class.
    """
    reserved_keys = ("sampler_type", "params")

    if "sampler_type" not in kwargs:
        raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")

    sampler_type = resolve_string_enum(kwargs["sampler_type"], SamplerType)
    params_cls = SAMPLER_PARAMS[sampler_type.value]

    # NOTE: this local must stay named `params_value`, because the `{params_value=}`
    # specifier below embeds the variable name in the error message.
    params_value = kwargs.get("params", {})

    # Reject anything that is neither a ready-made params object nor a dict.
    if not isinstance(params_value, (params_cls, dict)):
        raise InvalidConfigError(
            f"🛑 Invalid params for sampler column '{name}'. "
            f"Expected a dictionary or an instance of {params_cls.__name__}. "
            f"You provided {params_value=}."
        )

    # Reuse an existing params object as-is; otherwise build one from the dict.
    params = params_value if isinstance(params_value, params_cls) else params_cls(**params_value)

    resolved = {"sampler_type": sampler_type, "params": params}
    resolved.update((key, value) for key, value in kwargs.items() if key not in reserved_keys)
    return resolved
129
-
130
-
131
# Mapping from each built-in column type to its config class. Plugin-provided
# column types are not listed here; `get_column_config_cls_from_type` falls
# back to the plugin manager for any type missing from this mapping.
_COLUMN_TYPE_CONFIG_CLS_MAP = {
    DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
    DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
    DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
    DataDesignerColumnType.LLM_JUDGE: LLMJudgeColumnConfig,
    DataDesignerColumnType.VALIDATION: ValidationColumnConfig,
    DataDesignerColumnType.EXPRESSION: ExpressionColumnConfig,
    DataDesignerColumnType.SAMPLER: SamplerColumnConfig,
    DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig,
    DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig,
}