data-designer-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. data_designer/engine/__init__.py +2 -0
  2. data_designer/engine/_version.py +34 -0
  3. data_designer/engine/analysis/column_profilers/base.py +49 -0
  4. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +153 -0
  5. data_designer/engine/analysis/column_profilers/registry.py +22 -0
  6. data_designer/engine/analysis/column_statistics.py +145 -0
  7. data_designer/engine/analysis/dataset_profiler.py +149 -0
  8. data_designer/engine/analysis/errors.py +9 -0
  9. data_designer/engine/analysis/utils/column_statistics_calculations.py +234 -0
  10. data_designer/engine/analysis/utils/judge_score_processing.py +132 -0
  11. data_designer/engine/column_generators/__init__.py +2 -0
  12. data_designer/engine/column_generators/generators/__init__.py +2 -0
  13. data_designer/engine/column_generators/generators/base.py +122 -0
  14. data_designer/engine/column_generators/generators/embedding.py +35 -0
  15. data_designer/engine/column_generators/generators/expression.py +55 -0
  16. data_designer/engine/column_generators/generators/llm_completion.py +116 -0
  17. data_designer/engine/column_generators/generators/samplers.py +69 -0
  18. data_designer/engine/column_generators/generators/seed_dataset.py +144 -0
  19. data_designer/engine/column_generators/generators/validation.py +140 -0
  20. data_designer/engine/column_generators/registry.py +60 -0
  21. data_designer/engine/column_generators/utils/errors.py +15 -0
  22. data_designer/engine/column_generators/utils/generator_classification.py +43 -0
  23. data_designer/engine/column_generators/utils/judge_score_factory.py +58 -0
  24. data_designer/engine/column_generators/utils/prompt_renderer.py +100 -0
  25. data_designer/engine/compiler.py +97 -0
  26. data_designer/engine/configurable_task.py +71 -0
  27. data_designer/engine/dataset_builders/artifact_storage.py +283 -0
  28. data_designer/engine/dataset_builders/column_wise_builder.py +354 -0
  29. data_designer/engine/dataset_builders/errors.py +15 -0
  30. data_designer/engine/dataset_builders/multi_column_configs.py +46 -0
  31. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  32. data_designer/engine/dataset_builders/utils/concurrency.py +212 -0
  33. data_designer/engine/dataset_builders/utils/config_compiler.py +62 -0
  34. data_designer/engine/dataset_builders/utils/dag.py +62 -0
  35. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +200 -0
  36. data_designer/engine/dataset_builders/utils/errors.py +15 -0
  37. data_designer/engine/dataset_builders/utils/progress_tracker.py +122 -0
  38. data_designer/engine/errors.py +51 -0
  39. data_designer/engine/model_provider.py +77 -0
  40. data_designer/engine/models/__init__.py +2 -0
  41. data_designer/engine/models/errors.py +300 -0
  42. data_designer/engine/models/facade.py +284 -0
  43. data_designer/engine/models/factory.py +42 -0
  44. data_designer/engine/models/litellm_overrides.py +179 -0
  45. data_designer/engine/models/parsers/__init__.py +2 -0
  46. data_designer/engine/models/parsers/errors.py +34 -0
  47. data_designer/engine/models/parsers/parser.py +235 -0
  48. data_designer/engine/models/parsers/postprocessors.py +93 -0
  49. data_designer/engine/models/parsers/tag_parsers.py +62 -0
  50. data_designer/engine/models/parsers/types.py +84 -0
  51. data_designer/engine/models/recipes/base.py +81 -0
  52. data_designer/engine/models/recipes/response_recipes.py +293 -0
  53. data_designer/engine/models/registry.py +151 -0
  54. data_designer/engine/models/telemetry.py +362 -0
  55. data_designer/engine/models/usage.py +73 -0
  56. data_designer/engine/models/utils.py +101 -0
  57. data_designer/engine/processing/ginja/__init__.py +2 -0
  58. data_designer/engine/processing/ginja/ast.py +65 -0
  59. data_designer/engine/processing/ginja/environment.py +463 -0
  60. data_designer/engine/processing/ginja/exceptions.py +56 -0
  61. data_designer/engine/processing/ginja/record.py +32 -0
  62. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  63. data_designer/engine/processing/gsonschema/exceptions.py +15 -0
  64. data_designer/engine/processing/gsonschema/schema_transformers.py +83 -0
  65. data_designer/engine/processing/gsonschema/types.py +10 -0
  66. data_designer/engine/processing/gsonschema/validators.py +202 -0
  67. data_designer/engine/processing/processors/base.py +13 -0
  68. data_designer/engine/processing/processors/drop_columns.py +42 -0
  69. data_designer/engine/processing/processors/registry.py +25 -0
  70. data_designer/engine/processing/processors/schema_transform.py +71 -0
  71. data_designer/engine/processing/utils.py +169 -0
  72. data_designer/engine/registry/base.py +99 -0
  73. data_designer/engine/registry/data_designer_registry.py +39 -0
  74. data_designer/engine/registry/errors.py +12 -0
  75. data_designer/engine/resources/managed_dataset_generator.py +39 -0
  76. data_designer/engine/resources/managed_dataset_repository.py +197 -0
  77. data_designer/engine/resources/managed_storage.py +65 -0
  78. data_designer/engine/resources/resource_provider.py +77 -0
  79. data_designer/engine/resources/seed_reader.py +154 -0
  80. data_designer/engine/sampling_gen/column.py +91 -0
  81. data_designer/engine/sampling_gen/constraints.py +100 -0
  82. data_designer/engine/sampling_gen/data_sources/base.py +217 -0
  83. data_designer/engine/sampling_gen/data_sources/errors.py +12 -0
  84. data_designer/engine/sampling_gen/data_sources/sources.py +347 -0
  85. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  86. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  87. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +90 -0
  88. data_designer/engine/sampling_gen/entities/email_address_utils.py +171 -0
  89. data_designer/engine/sampling_gen/entities/errors.py +10 -0
  90. data_designer/engine/sampling_gen/entities/national_id_utils.py +102 -0
  91. data_designer/engine/sampling_gen/entities/person.py +144 -0
  92. data_designer/engine/sampling_gen/entities/phone_number.py +128 -0
  93. data_designer/engine/sampling_gen/errors.py +26 -0
  94. data_designer/engine/sampling_gen/generator.py +122 -0
  95. data_designer/engine/sampling_gen/jinja_utils.py +64 -0
  96. data_designer/engine/sampling_gen/people_gen.py +199 -0
  97. data_designer/engine/sampling_gen/person_constants.py +56 -0
  98. data_designer/engine/sampling_gen/schema.py +147 -0
  99. data_designer/engine/sampling_gen/schema_builder.py +61 -0
  100. data_designer/engine/sampling_gen/utils.py +46 -0
  101. data_designer/engine/secret_resolver.py +82 -0
  102. data_designer/engine/testing/__init__.py +12 -0
  103. data_designer/engine/testing/stubs.py +133 -0
  104. data_designer/engine/testing/utils.py +20 -0
  105. data_designer/engine/validation.py +367 -0
  106. data_designer/engine/validators/__init__.py +19 -0
  107. data_designer/engine/validators/base.py +38 -0
  108. data_designer/engine/validators/local_callable.py +39 -0
  109. data_designer/engine/validators/python.py +254 -0
  110. data_designer/engine/validators/remote.py +89 -0
  111. data_designer/engine/validators/sql.py +65 -0
  112. data_designer_engine-0.4.0.dist-info/METADATA +50 -0
  113. data_designer_engine-0.4.0.dist-info/RECORD +114 -0
  114. data_designer_engine-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,46 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import numbers
7
+ from typing import TYPE_CHECKING
8
+
9
+ from data_designer.lazy_heavy_imports import np
10
+
11
+ if TYPE_CHECKING:
12
+ import numpy as np
13
+
14
+
15
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    This function was taken from scikit-learn's utils module.
    Source GitHub: https://github.com/scikit-learn/scikit-learn

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None (or the ``np.random`` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.

    Raises
    ------
    ValueError
        If `seed` is none of the supported kinds listed above.

    Examples
    --------
    >>> from data_designer.engine.sampling_gen.utils import check_random_state
    >>> check_random_state(42)
    RandomState(MT19937) at 0x...
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # An f-string is used instead of `"%r ..." % seed`: with %-formatting a
    # tuple seed is unpacked as the argument list and raises TypeError, which
    # would mask the documented ValueError.
    raise ValueError(f"{seed!r} cannot be used to seed a numpy.random.RandomState instance")
@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ from collections.abc import Sequence
10
+ from pathlib import Path
11
+ from typing import Protocol
12
+
13
+ from data_designer.engine.errors import SecretResolutionError
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class SecretResolver(Protocol):
    """Structural interface for objects that map a secret reference to a string value."""

    def resolve(self, secret: str) -> str:
        """Return the resolved value for ``secret``."""
        ...
20
+
21
+
22
class SecretsFileResolver(SecretResolver):
    """Resolve secrets by key from a JSON file that is read once at construction.

    A file that does not exist is treated as an empty set of secrets rather
    than an error, so every lookup then fails with ``SecretResolutionError``.
    """

    _secrets: dict[str, str]

    def __init__(self, filepath: Path):
        """Load the secrets from ``filepath`` if the file exists.

        Args:
            filepath: Location of the JSON secrets file.
                NOTE(review): presumably a flat JSON object of name -> value; the
                parsed content is stored as-is without validation — confirm.
        """
        if not filepath.exists():
            self._secrets = {}
        else:
            # JSON text is UTF-8 by specification; do not rely on the platform's
            # locale-dependent default encoding.
            with open(filepath, encoding="utf-8") as f:
                self._secrets = json.load(f)

    def resolve(self, secret: str) -> str:
        """Return the value stored under the key ``secret``.

        Raises:
            SecretResolutionError: If the key is not present in the loaded secrets.
        """
        try:
            return self._secrets[secret]
        except KeyError:
            # The message already names the key; suppress the KeyError context so
            # the traceback is not reported as an error-while-handling-an-error.
            raise SecretResolutionError(f"No secret found in secrets file with key {secret!r}") from None
37
+
38
+
39
class EnvironmentResolver(SecretResolver):
    """Resolve secrets from the process environment (``os.environ``)."""

    def resolve(self, secret: str) -> str:
        """Return the value of the environment variable named ``secret``.

        Raises:
            SecretResolutionError: If no such environment variable is set.
        """
        try:
            return os.environ[secret]
        except KeyError:
            # The message already names the variable; suppress the KeyError
            # context so users see a single, actionable error.
            raise SecretResolutionError(
                f"Environment variable with name {secret!r} is required but not set. Please set it in your environment and try again."
            ) from None
47
+
48
+
49
class PlaintextResolver(SecretResolver):
    """Resolver that treats the given string as the secret value itself."""

    def resolve(self, secret: str) -> str:
        """Return ``secret`` unchanged; this resolver never raises."""
        plaintext = secret
        return plaintext
52
+
53
+
54
class CompositeResolver(SecretResolver):
    """Resolver that delegates to an ordered sequence of resolvers.

    The first resolver that succeeds wins; a resolver signals failure by
    raising ``SecretResolutionError``.
    """

    _resolvers: Sequence[SecretResolver]

    def __init__(self, resolvers: Sequence[SecretResolver]):
        """Store ``resolvers``, rejecting an empty sequence.

        Raises:
            SecretResolutionError: If ``resolvers`` is empty.
        """
        if len(resolvers) < 1:
            raise SecretResolutionError("Must provide at least one SecretResolver to CompositeResolver")
        self._resolvers = resolvers

    @property
    def resolvers(self) -> Sequence[SecretResolver]:
        """Get the sequence of resolvers in this composite resolver.

        Returns:
            Sequence of SecretResolver instances used to resolve secrets.
        """
        return self._resolvers

    def resolve(self, secret: str) -> str:
        """Return the first successful resolution of ``secret``.

        Resolvers are tried in order. Each ``SecretResolutionError`` is recorded
        and the next resolver is tried; any other exception propagates.

        Raises:
            SecretResolutionError: If every resolver fails; the message joins
                the individual failure messages.
        """
        failures: list[str] = []
        for candidate in self._resolvers:
            try:
                value = candidate.resolve(secret)
            except SecretResolutionError as exc:
                failures.append(str(exc))
            else:
                return value

        details = ", ".join(failures)
        raise SecretResolutionError(f"No configured resolvers were able to resolve secret {secret!r}: {details}")
@@ -0,0 +1,12 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from data_designer.engine.testing.stubs import StubHuggingFaceSeedReader
7
+ from data_designer.engine.testing.utils import assert_valid_plugin
8
+
9
# Names re-exported by this package. Entries are plain string literals so that
# linters and type checkers can resolve them statically.
__all__ = [
    "StubHuggingFaceSeedReader",
    "assert_valid_plugin",
]
@@ -0,0 +1,133 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Literal
7
+
8
+ from data_designer.config.base import ConfigBase
9
+ from data_designer.config.column_configs import SingleColumnConfig
10
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorCellByCell
11
+ from data_designer.engine.resources.seed_reader import SeedReader
12
+ from data_designer.plugins.plugin import Plugin, PluginType
13
+
14
# Import name of this module; used below to build the qualified class names
# passed to Plugin(...).
MODULE_NAME = __name__
15
+
16
+
17
class StubHuggingFaceSeedReader(SeedReader):
    """Stub seed reader for testing.

    Every method returns a fixed value; nothing is read from any data source.
    """

    def get_seed_type(self) -> str:
        """Return the fixed seed type identifier ``"hf"``."""
        return "hf"

    def get_dataset_uri(self) -> str:
        """Return a placeholder string; the URI is not meant to be used."""
        return "unused in these tests"

    def get_column_names(self) -> list[str]:
        """Return the fixed column names ``["age", "city"]``."""
        return ["age", "city"]

    def create_duckdb_connection(self):
        """Do nothing and return ``None``; no connection is created."""
        return None
31
+
32
+
33
class ValidTestConfig(SingleColumnConfig):
    """Valid config for testing plugin creation."""

    # Literal-typed discriminator whose default matches the literal value.
    column_type: Literal["test-generator"] = "test-generator"
    # Required field with no default.
    name: str
38
+
39
+
40
class ValidTestTask(ColumnGeneratorCellByCell[ValidTestConfig]):
    """Valid task for testing plugin creation."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
45
+
46
+
47
class ConfigWithoutDiscriminator(ConfigBase):
    """Config fixture that declares no ``column_type`` field of its own."""

    some_field: str
49
+
50
+
51
class ConfigWithStringField(ConfigBase):
    """Config fixture whose ``column_type`` is annotated as plain ``str`` rather than a ``Literal``."""

    column_type: str = "test-generator"
53
+
54
+
55
class ConfigWithNonStringDefault(ConfigBase):
    """Config fixture whose ``column_type`` default is an int that does not match its ``Literal`` annotation."""

    # The mismatch is deliberate, hence the type-checker suppression.
    column_type: Literal["test-generator"] = 123  # type: ignore
57
+
58
+
59
class ConfigWithInvalidKey(ConfigBase):
    """Config fixture whose ``column_type`` literal contains the characters ``!@#``.

    NOTE(review): presumably used to exercise rejection of such keys by the
    plugin machinery — confirm against the Plugin validation rules.
    """

    column_type: Literal["invalid-key-!@#"] = "invalid-key-!@#"
61
+
62
+
63
class StubPluginConfigA(SingleColumnConfig):
    """Stub column config whose discriminator is ``"test-plugin-a"``."""

    column_type: Literal["test-plugin-a"] = "test-plugin-a"
65
+
66
+
67
class StubPluginConfigB(SingleColumnConfig):
    """Stub column config whose discriminator is ``"test-plugin-b"``."""

    column_type: Literal["test-plugin-b"] = "test-plugin-b"
69
+
70
+
71
class StubPluginTaskA(ColumnGeneratorCellByCell[StubPluginConfigA]):
    """Stub cell-by-cell generator parameterized by ``StubPluginConfigA``."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
74
+
75
+
76
class StubPluginTaskB(ColumnGeneratorCellByCell[StubPluginConfigB]):
    """Stub cell-by-cell generator parameterized by ``StubPluginConfigB``."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
79
+
80
+
81
+ # Stub plugins requiring different combinations of resources
82
+
83
+
84
class StubPluginConfigModels(SingleColumnConfig):
    """Stub column config whose discriminator is ``"test-plugin-models"``."""

    column_type: Literal["test-plugin-models"] = "test-plugin-models"
86
+
87
+
88
class StubPluginConfigModelsAndBlobs(SingleColumnConfig):
    """Stub column config whose discriminator is ``"test-plugin-models-and-blobs"``."""

    column_type: Literal["test-plugin-models-and-blobs"] = "test-plugin-models-and-blobs"
90
+
91
+
92
class StubPluginConfigBlobsAndSeeds(SingleColumnConfig):
    """Stub column config whose discriminator is ``"test-plugin-blobs-and-seeds"``."""

    column_type: Literal["test-plugin-blobs-and-seeds"] = "test-plugin-blobs-and-seeds"
94
+
95
+
96
class StubPluginTaskModels(ColumnGeneratorCellByCell[StubPluginConfigModels]):
    """Stub cell-by-cell generator parameterized by ``StubPluginConfigModels``."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
99
+
100
+
101
class StubPluginTaskModelsAndBlobs(ColumnGeneratorCellByCell[StubPluginConfigModelsAndBlobs]):
    """Stub cell-by-cell generator parameterized by ``StubPluginConfigModelsAndBlobs``."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
104
+
105
+
106
class StubPluginTaskBlobsAndSeeds(ColumnGeneratorCellByCell[StubPluginConfigBlobsAndSeeds]):
    """Stub cell-by-cell generator parameterized by ``StubPluginConfigBlobsAndSeeds``."""

    def generate(self, data: dict) -> dict:
        """Return ``data`` unchanged (identity stub)."""
        return data
109
+
110
+
111
# Module-level column-generator Plugin instances. Each one pairs a stub config
# class with its stub task class, referenced by qualified name within this module.

# Pairs StubPluginConfigA with StubPluginTaskA.
plugin_none = Plugin(
    config_qualified_name=f"{MODULE_NAME}.StubPluginConfigA",
    impl_qualified_name=f"{MODULE_NAME}.StubPluginTaskA",
    plugin_type=PluginType.COLUMN_GENERATOR,
)

# Pairs StubPluginConfigModels with StubPluginTaskModels.
plugin_models = Plugin(
    config_qualified_name=f"{MODULE_NAME}.StubPluginConfigModels",
    impl_qualified_name=f"{MODULE_NAME}.StubPluginTaskModels",
    plugin_type=PluginType.COLUMN_GENERATOR,
)

# Pairs StubPluginConfigModelsAndBlobs with StubPluginTaskModelsAndBlobs.
plugin_models_and_blobs = Plugin(
    config_qualified_name=f"{MODULE_NAME}.StubPluginConfigModelsAndBlobs",
    impl_qualified_name=f"{MODULE_NAME}.StubPluginTaskModelsAndBlobs",
    plugin_type=PluginType.COLUMN_GENERATOR,
)

# Pairs StubPluginConfigBlobsAndSeeds with StubPluginTaskBlobsAndSeeds.
plugin_blobs_and_seeds = Plugin(
    config_qualified_name=f"{MODULE_NAME}.StubPluginConfigBlobsAndSeeds",
    impl_qualified_name=f"{MODULE_NAME}.StubPluginTaskBlobsAndSeeds",
    plugin_type=PluginType.COLUMN_GENERATOR,
)
@@ -0,0 +1,20 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from data_designer.config.base import ConfigBase
7
+ from data_designer.engine.configurable_task import ConfigurableTask
8
+ from data_designer.engine.resources.seed_reader import SeedReader
9
+ from data_designer.plugins.plugin import Plugin, PluginType
10
+
11
+
12
def assert_valid_plugin(plugin: Plugin) -> None:
    """Assert that a plugin's config and implementation classes have the expected bases.

    The config class must subclass ``ConfigBase``. The implementation class must
    subclass ``ConfigurableTask`` for column-generator plugins and ``SeedReader``
    for seed-reader plugins. Other plugin types get no implementation check.

    Args:
        plugin: The plugin to check.

    Raises:
        AssertionError: If any of the checks above fails.
    """
    assert issubclass(plugin.config_cls, ConfigBase), "Plugin config class is not a subclass of ConfigBase"

    if plugin.plugin_type == PluginType.COLUMN_GENERATOR:
        expected_base = ConfigurableTask
        failure = "Column generator plugin impl class must be a subclass of ConfigurableTask"
    elif plugin.plugin_type == PluginType.SEED_READER:
        expected_base = SeedReader
        failure = "Seed reader plugin impl class must be a subclass of SeedReader"
    else:
        return

    assert issubclass(plugin.impl_cls, expected_base), failure
@@ -0,0 +1,367 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import Enum
7
+ from string import Formatter
8
+
9
+ from jinja2 import meta
10
+ from jinja2.sandbox import ImmutableSandboxedEnvironment
11
+ from pydantic import BaseModel
12
+ from rich import box
13
+ from rich.console import Console, Group
14
+ from rich.padding import Padding
15
+ from rich.panel import Panel
16
+
17
+ from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
18
+ from data_designer.config.processors import ProcessorConfigT, ProcessorType
19
+ from data_designer.config.utils.constants import RICH_CONSOLE_THEME
20
+ from data_designer.config.utils.misc import (
21
+ can_run_data_designer_locally,
22
+ extract_keywords_from_jinja2_template,
23
+ )
24
+ from data_designer.config.validator_params import ValidatorType
25
+ from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated
26
+
27
+
28
class ViolationType(str, Enum):
    """String-valued identifiers for the kinds of configuration violation."""

    ALL_COLUMNS_DROPPED = "all_columns_dropped"
    CODE_COLUMN_MISSING = "code_column_missing"
    CODE_COLUMN_NOT_CODE = "code_column_not_code"
    CODE_LANG_MISMATCH = "code_lang_mismatch"
    EXPRESSION_REFERENCE_MISSING = "expression_reference_missing"
    F_STRING_SYNTAX = "f_string_syntax"
    LOCAL_ONLY_COLUMN = "local_only_column"
    INVALID_COLUMN = "invalid_column"
    INVALID_MODEL_CONFIG = "invalid_model_config"
    INVALID_REFERENCE = "invalid_reference"
    PROMPT_WITHOUT_REFERENCES = "prompt_without_references"
40
+
41
+
42
class ViolationLevel(str, Enum):
    """Severity attached to a violation: ``ERROR`` or ``WARNING``."""

    ERROR = "ERROR"
    WARNING = "WARNING"
45
+
46
+
47
class Violation(BaseModel):
    """A single validation finding produced by the validators in this module."""

    # Name of the column the finding relates to; None when it is not tied to one column.
    column: str | None = None
    # Kind of violation.
    type: ViolationType
    # Human-readable description of the finding.
    message: str
    # Severity of the finding.
    level: ViolationLevel

    @property
    def has_column(self) -> bool:
        """Whether this violation is associated with a column (``column`` is not None)."""
        return self.column is not None
56
+
57
+
58
def validate_data_designer_config(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Run every configuration validator and return the combined violations.

    The validators run in a fixed order and their results are concatenated in
    that order. The local-only column check is added only when
    ``can_run_data_designer_locally()`` returns a falsy value.

    Args:
        columns: Column configurations to validate.
        processor_configs: Processor configurations to validate.
        allowed_references: Names that templates and expressions may reference.

    Returns:
        All violations found; empty when the configuration is clean.
    """
    # List displays evaluate left to right, so the validators run in this order.
    violations = [
        *validate_prompt_templates(columns=columns, allowed_references=allowed_references),
        *validate_code_validation(columns=columns),
        *validate_expression_references(columns=columns, allowed_references=allowed_references),
        *validate_columns_not_all_dropped(columns=columns),
        *validate_drop_columns_processor(columns=columns, processor_configs=processor_configs),
        *validate_schema_transform_processor(columns=columns, processor_configs=processor_configs),
    ]
    if can_run_data_designer_locally():
        return violations
    violations += validate_local_only_columns(columns=columns)
    return violations
73
+
74
+
75
def rich_print_violations(violations: list[Violation]) -> None:
    """Print violations to the console as a group of rich panels.

    Nothing is printed when ``violations`` is empty. Otherwise a summary panel
    with the number of issues is printed, followed by one panel per violation
    showing its level, type, message and, when present, its column as the title.

    Args:
        violations: The violations to render.
    """
    total = len(violations)
    if total == 0:
        return

    console = Console(theme=RICH_CONSOLE_THEME)
    bottom_pad = (0, 0, 1, 0)

    plural = "" if total == 1 else "s"
    summary = Panel(
        f"🔎 Identified {total} validation issue{plural} in your Data Designer column definitions",
        box=box.SIMPLE,
        highlight=True,
    )
    renderables = [Padding(summary, bottom_pad)]

    for violation in violations:
        if violation.level == ViolationLevel.ERROR:
            emoji = "🛑"
        else:
            emoji = "⚠️"
        heading = f"{emoji} {violation.level.upper()} | {violation.type.value.upper()}"
        panel_title = f"Column: {violation.column}" if violation.has_column else ""
        detail = Panel(
            f"{heading}\n\n{violation.message}",
            box=box.HORIZONTALS,
            title=panel_title,
            padding=(1, 0, 1, 1),
            highlight=True,
        )
        renderables.append(Padding(detail, bottom_pad))

    # markup=False: message text is printed literally, not parsed as rich markup.
    console.print(Group(*renderables), markup=False)
114
+
115
+
116
def validate_prompt_templates(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Validate the ``prompt`` and ``system_prompt`` templates of model-generated columns.

    For every such template that is set, up to three findings are produced:

    * ``INVALID_REFERENCE`` (error): the template uses Jinja2 variables that are
      not in ``allowed_references``.
    * ``PROMPT_WITHOUT_REFERENCES`` (warning): a ``prompt`` uses no variables at
      all and the column has no ``multi_modal_context``.
    * ``F_STRING_SYNTAX`` (warning): the template contains ``{name}``
      placeholders whose name is an allowed reference.

    Args:
        columns: Column configurations; only those for which
            ``column_type_is_model_generated`` is true are inspected.
        allowed_references: Names a template may reference.

    Returns:
        The violations found, in column order.
    """
    env = ImmutableSandboxedEnvironment()
    allowed = set(allowed_references)

    columns_with_prompts = [c for c in columns if column_type_is_model_generated(c.column_type)]

    violations = []
    for column in columns_with_prompts:
        for prompt_type in ["prompt", "system_prompt"]:
            prompt = getattr(column, prompt_type, None)
            if prompt is None:
                continue

            # check for invalid references
            prompt_references = set(meta.find_undeclared_variables(env.parse(prompt)))
            # Sorted so the message is reproducible; set order varies between runs.
            invalid_references = sorted(prompt_references - allowed)
            num_invalid = len(invalid_references)
            if num_invalid > 0:
                ref_msg = (
                    f"references {num_invalid} columns that do not exist"
                    if num_invalid > 1
                    else "references a column that does not exist"
                )
                invalid_list = ", ".join([f"'{r}'" for r in invalid_references])
                message = f"The {prompt_type} template for '{column.name}' {ref_msg}: {invalid_list}."
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.INVALID_REFERENCE,
                        message=message,
                        level=ViolationLevel.ERROR,
                    )
                )

            # check for prompts without references
            if (
                prompt_type == "prompt"
                and len(prompt_references) == 0
                and getattr(column, "multi_modal_context", None) is None
            ):
                message = (
                    f"The {prompt_type} template for '{column.name}' does not reference any columns. "
                    "This means the same prompt will be used for every row in the dataset. To increase "
                    "the diversity of the generated data, consider adding references to other columns "
                    "in the prompt template."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.PROMPT_WITHOUT_REFERENCES,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )

            # check for f-string syntax
            f_string_references = _get_string_formatter_references(prompt, allowed_references)
            if len(f_string_references) > 0:
                f_string_list = ", ".join([f"'{r}'" for r in f_string_references])
                message = (
                    f"The {prompt_type} template for '{column.name}' references the "
                    f"following columns using f-string syntax: {f_string_list}. "
                    # Plain (non-f) string: the braces below are literal text.
                    "Please use jinja2 syntax to reference columns: {reference} -> {{ reference }}."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.F_STRING_SYNTAX,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )
    return violations
194
+
195
+
196
def validate_code_validation(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Check that code-validation columns target existing, matching code columns.

    For each validation column whose ``validator_type`` is ``"code"``, every
    target column name is checked in turn:

    * missing from ``columns`` -> ``CODE_COLUMN_MISSING`` (error);
    * present but not an LLM code column -> ``CODE_COLUMN_NOT_CODE`` (warning);
    * LLM code column with a different ``code_lang`` than the validator
      expects -> ``CODE_LANG_MISMATCH`` (error).

    Args:
        columns: Column configurations to inspect.

    Returns:
        The violations found, each attributed to the validation column.
    """
    by_name = {c.name: c for c in columns}
    violations = []

    for validation_column in columns:
        is_code_validator = (
            validation_column.column_type == DataDesignerColumnType.VALIDATION
            and validation_column.validator_type == "code"
        )
        if not is_code_validator:
            continue

        for target_column_name in validation_column.target_columns:
            # The target must exist before its type and language can be compared.
            if target_column_name not in by_name:
                violations.append(
                    Violation(
                        column=validation_column.name,
                        type=ViolationType.CODE_COLUMN_MISSING,
                        message=f"Target code column '{target_column_name}' not found in column list.",
                        level=ViolationLevel.ERROR,
                    )
                )
                continue

            target_column = by_name[target_column_name]
            if target_column.column_type != DataDesignerColumnType.LLM_CODE:
                violation_type = ViolationType.CODE_COLUMN_NOT_CODE
                level = ViolationLevel.WARNING
                message = (
                    f"Code validation column '{validation_column.name}' is set to validate "
                    f"code, but the target column was generated as {target_column.column_type}."
                )
            elif target_column.code_lang != validation_column.validator_params.code_lang:
                violation_type = ViolationType.CODE_LANG_MISMATCH
                level = ViolationLevel.ERROR
                message = (
                    f"Code validation column '{validation_column.name}' is set to validate "
                    f"{validation_column.validator_params.code_lang}, but the target column was generated as "
                    f"{target_column.code_lang}."
                )
            else:
                continue

            violations.append(
                Violation(
                    column=validation_column.name,
                    type=violation_type,
                    message=message,
                    level=level,
                )
            )

    return violations
251
+
252
+
253
def validate_columns_not_all_dropped(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Report an error when no non-seed column is kept in the output.

    A column counts as kept when it is not a seed-dataset column and its
    ``drop`` flag is falsy.

    Args:
        columns: Column configurations to inspect.

    Returns:
        A single ``ALL_COLUMNS_DROPPED`` violation (not tied to any column) when
        nothing is kept, otherwise an empty list.
    """
    kept = [
        column
        for column in columns
        if not column.drop and column.column_type != DataDesignerColumnType.SEED_DATASET
    ]
    if kept:
        return []

    message = "All generated columns are configured to be dropped. Please mark at least one column with `drop=False`."
    return [
        Violation(
            column=None,
            type=ViolationType.ALL_COLUMNS_DROPPED,
            message=message,
            level=ViolationLevel.ERROR,
        )
    ]
272
+
273
+
274
def validate_drop_columns_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
) -> list[Violation]:
    """Check that every drop-columns processor only names defined columns.

    All ``DROP_COLUMNS`` processors are inspected, not just the first one with a
    problem, and one ``INVALID_COLUMN`` error is reported per undefined column.

    Args:
        columns: Column configurations that define the known column names.
        processor_configs: Processor configurations; only ``DROP_COLUMNS``
            processors are inspected.

    Returns:
        The violations found, in processor order and sorted by column name
        within each processor.
    """
    all_column_names = {c.name for c in columns}
    violations = []
    for processor_config in processor_configs:
        if processor_config.processor_type != ProcessorType.DROP_COLUMNS:
            continue
        # Sorted so the output is reproducible; set order varies between runs.
        for missing in sorted(set(processor_config.column_names) - all_column_names):
            violations.append(
                Violation(
                    column=missing,
                    type=ViolationType.INVALID_COLUMN,
                    # Quote the name once; `'{c!r}'` rendered it as ''name''.
                    message=f"Drop columns processor is configured to drop column '{missing}', but the column is not defined.",
                    level=ViolationLevel.ERROR,
                )
            )
    return violations
293
+
294
+
295
def validate_schema_transform_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
) -> list[Violation]:
    """Check that schema-transform templates only reference defined columns.

    For each ``SCHEMA_TRANSFORM`` processor, the keywords extracted from every
    entry of its ``template`` mapping are compared with the defined column
    names. One ``INVALID_REFERENCE`` error is reported per offending entry.

    Args:
        columns: Column configurations that define the known column names.
        processor_configs: Processor configurations; only ``SCHEMA_TRANSFORM``
            processors are inspected.

    Returns:
        The violations found; none of them is tied to a specific column.
    """
    violations = []

    all_column_names = {c.name for c in columns}
    for processor_config in processor_configs:
        if processor_config.processor_type != ProcessorType.SCHEMA_TRANSFORM:
            continue
        for col, template in processor_config.template.items():
            template_keywords = extract_keywords_from_jinja2_template(template)
            # Sorted so the message is reproducible; set order varies between runs.
            invalid_keywords = sorted(set(template_keywords) - all_column_names)
            if len(invalid_keywords) == 0:
                continue
            invalid_list = ", ".join([f"'{k}'" for k in invalid_keywords])
            message = f"Ancillary dataset processor attempts to reference columns {invalid_list} in the template for '{col}', but the columns are not defined in the dataset."
            violations.append(
                Violation(
                    column=None,
                    type=ViolationType.INVALID_REFERENCE,
                    message=message,
                    level=ViolationLevel.ERROR,
                )
            )

    return violations
320
+
321
+
322
def validate_expression_references(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Report expression columns that require columns outside ``allowed_references``.

    Args:
        columns: Column configurations; only expression columns are inspected.
        allowed_references: Names an expression may reference.

    Returns:
        One ``EXPRESSION_REFERENCE_MISSING`` error per (expression column,
        missing reference) pair, in column order.
    """
    return [
        Violation(
            column=expression_column.name,
            type=ViolationType.EXPRESSION_REFERENCE_MISSING,
            message=f"Expression column '{expression_column.name}' references missing column '{reference}'.",
            level=ViolationLevel.ERROR,
        )
        for expression_column in columns
        if expression_column.column_type == DataDesignerColumnType.EXPRESSION
        for reference in expression_column.required_columns
        if reference not in allowed_references
    ]
340
+
341
+
342
def validate_local_only_columns(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Report validation columns that use a local callable validator.

    Args:
        columns: Column configurations; only validation columns are inspected.

    Returns:
        One ``LOCAL_ONLY_COLUMN`` error per validation column whose
        ``validator_type`` is ``ValidatorType.LOCAL_CALLABLE``.
    """
    return [
        Violation(
            column=candidate.name,
            type=ViolationType.LOCAL_ONLY_COLUMN,
            message="Validation using functions are only supported when running Data Designer locally",
            level=ViolationLevel.ERROR,
        )
        for candidate in columns
        if candidate.column_type == DataDesignerColumnType.VALIDATION
        and candidate.validator_type == ValidatorType.LOCAL_CALLABLE
    ]
360
+
361
+
362
def _get_string_formatter_references(template: str, allowed_references: list[str]) -> list[str]:
    """Find ``str.format``-style placeholders in ``template`` that name allowed references.

    The template is scanned with ``string.Formatter().parse``. A placeholder
    counts when its field name, stripped of surrounding whitespace, is in
    ``allowed_references``. Doubled braces such as ``{{ name }}`` are format
    escapes and never count.

    A template with unbalanced braces is not a valid format string. Scanning
    stops at the first such error and the matches found so far are returned, so
    a stray ``{`` or ``}`` in a prompt cannot crash this advisory check.

    Args:
        template: The template text to scan.
        allowed_references: Names that count as column references.

    Returns:
        Matching field names in order of appearance; duplicates are kept.
    """
    references: list[str] = []
    try:
        # parse() is lazy: a malformed segment raises only when it is reached.
        for _literal, field_name, _spec, _conversion in Formatter().parse(template):
            if field_name is None:
                continue
            name = field_name.strip()
            if name in allowed_references:
                references.append(name)
    except ValueError:
        # Unbalanced braces: keep what was found before the malformed part.
        pass
    return references
@@ -0,0 +1,19 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from data_designer.engine.validators.base import BaseValidator, ValidationResult
7
+ from data_designer.engine.validators.local_callable import LocalCallableValidator
8
+ from data_designer.engine.validators.python import PythonValidator
9
+ from data_designer.engine.validators.remote import RemoteValidator
10
+ from data_designer.engine.validators.sql import SQLValidator
11
+
12
# Names exported by `from data_designer.engine.validators import *`.
__all__ = [
    "BaseValidator",
    "LocalCallableValidator",
    "RemoteValidator",
    "ValidationResult",
    "PythonValidator",
    "SQLValidator",
]