data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,46 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import numbers
7
- from typing import TYPE_CHECKING
8
-
9
- from data_designer.lazy_heavy_imports import np
10
-
11
- if TYPE_CHECKING:
12
- import numpy as np
13
-
14
-
15
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    This function was taken from scikit-learn's utils module.
    Source GitHub: https://github.com/scikit-learn/scikit-learn

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.

    Raises
    ------
    ValueError
        If `seed` is not None, the ``np.random`` module, an integer, or a
        RandomState instance.

    Examples
    --------
    >>> from data_designer.engine.sampling_gen.utils import check_random_state
    >>> check_random_state(42)
    RandomState(MT19937) at 0x...
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Use {seed!r} rather than "%r" % seed: the % operator unpacks tuple seeds,
    # raising TypeError for multi-element tuples and misreporting 1-tuples.
    raise ValueError(f"{seed!r} cannot be used to seed a numpy.random.RandomState instance")
@@ -1,82 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import json
7
- import logging
8
- import os
9
- from collections.abc import Sequence
10
- from pathlib import Path
11
- from typing import Protocol
12
-
13
- from data_designer.engine.errors import SecretResolutionError
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
class SecretResolver(Protocol):
    """Structural interface for objects that map a secret reference to a string value."""

    def resolve(self, secret: str) -> str:
        """Return the value associated with ``secret``."""
        ...
20
-
21
-
22
class SecretsFileResolver(SecretResolver):
    """Resolve secrets by key from a JSON file that is read once at construction."""

    _secrets: dict[str, str]

    def __init__(self, filepath: Path):
        """Load the secrets mapping from ``filepath``.

        A missing file is not an error: the resolver simply starts with no
        secrets, so every lookup fails with ``SecretResolutionError``.

        Args:
            filepath: Location of the JSON secrets file.
        """
        if not filepath.exists():
            self._secrets = {}
        else:
            # JSON text is UTF-8; do not depend on the platform's locale encoding.
            with open(filepath, encoding="utf-8") as f:
                self._secrets = json.load(f)

    def resolve(self, secret: str) -> str:
        """Return the value stored under ``secret`` in the loaded file.

        Raises:
            SecretResolutionError: If the file has no entry for ``secret``.
        """
        try:
            return self._secrets[secret]
        except KeyError:
            raise SecretResolutionError(f"No secret found in secrets file with key {secret!r}")
37
-
38
-
39
class EnvironmentResolver(SecretResolver):
    """Resolve a secret by treating it as the name of an environment variable."""

    def resolve(self, secret: str) -> str:
        """Return the value of the environment variable named ``secret``.

        Raises:
            SecretResolutionError: If the variable is not set.
        """
        try:
            value = os.environ[secret]
        except KeyError:
            message = (
                f"Environment variable with name {secret!r} is required but not set. "
                "Please set it in your environment and try again."
            )
            raise SecretResolutionError(message)
        return value
47
-
48
-
49
class PlaintextResolver(SecretResolver):
    """Pass-through resolver: the given string is itself the secret value."""

    def resolve(self, secret: str) -> str:
        """Return ``secret`` unchanged."""
        return secret
52
-
53
-
54
class CompositeResolver(SecretResolver):
    """Try a sequence of resolvers in order and return the first successful result."""

    _resolvers: Sequence[SecretResolver]

    def __init__(self, resolvers: Sequence[SecretResolver]):
        """Store the resolvers to consult.

        Raises:
            SecretResolutionError: If ``resolvers`` is empty.
        """
        if len(resolvers) < 1:
            raise SecretResolutionError("Must provide at least one SecretResolver to CompositeResolver")
        self._resolvers = resolvers

    @property
    def resolvers(self) -> Sequence[SecretResolver]:
        """Get the sequence of resolvers in this composite resolver.

        Returns:
            Sequence of SecretResolver instances used to resolve secrets.
        """
        return self._resolvers

    def resolve(self, secret: str) -> str:
        """Return the first value any resolver produces for ``secret``.

        Raises:
            SecretResolutionError: If every resolver fails; the message joins
                the individual failure messages.
        """
        failures = []
        for candidate in self._resolvers:
            try:
                resolved = candidate.resolve(secret)
            except SecretResolutionError as exc:
                # Remember why this resolver failed and move on to the next one.
                failures.append(str(exc))
            else:
                return resolved

        raise SecretResolutionError(
            f"No configured resolvers were able to resolve secret {secret!r}: {', '.join(failures)}"
        )
@@ -1,367 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from enum import Enum
7
- from string import Formatter
8
-
9
- from jinja2 import meta
10
- from jinja2.sandbox import ImmutableSandboxedEnvironment
11
- from pydantic import BaseModel
12
- from rich import box
13
- from rich.console import Console, Group
14
- from rich.padding import Padding
15
- from rich.panel import Panel
16
-
17
- from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
18
- from data_designer.config.processors import ProcessorConfigT, ProcessorType
19
- from data_designer.config.utils.constants import RICH_CONSOLE_THEME
20
- from data_designer.config.utils.misc import (
21
- can_run_data_designer_locally,
22
- extract_keywords_from_jinja2_template,
23
- )
24
- from data_designer.config.validator_params import ValidatorType
25
- from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated
26
-
27
-
28
class ViolationType(str, Enum):
    """Identifiers for the kinds of configuration problems the validators report."""

    # Column / processor level problems
    ALL_COLUMNS_DROPPED = "all_columns_dropped"
    CODE_COLUMN_MISSING = "code_column_missing"
    CODE_COLUMN_NOT_CODE = "code_column_not_code"
    CODE_LANG_MISMATCH = "code_lang_mismatch"
    EXPRESSION_REFERENCE_MISSING = "expression_reference_missing"
    F_STRING_SYNTAX = "f_string_syntax"
    LOCAL_ONLY_COLUMN = "local_only_column"
    INVALID_COLUMN = "invalid_column"
    INVALID_MODEL_CONFIG = "invalid_model_config"
    INVALID_REFERENCE = "invalid_reference"
    PROMPT_WITHOUT_REFERENCES = "prompt_without_references"
40
-
41
-
42
class ViolationLevel(str, Enum):
    """Severity attached to a reported violation."""

    ERROR = "ERROR"
    WARNING = "WARNING"
45
-
46
-
47
class Violation(BaseModel):
    """A single validation finding, optionally tied to one column."""

    # Name of the offending column; None for findings not tied to a column.
    column: str | None = None
    # Kind of problem found.
    type: ViolationType
    # Human-readable explanation.
    message: str
    # Severity of the finding.
    level: ViolationLevel

    @property
    def has_column(self) -> bool:
        """Whether this violation is attached to a specific column."""
        return self.column is not None
56
-
57
-
58
def validate_data_designer_config(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Run every configuration validator and return the combined findings.

    Args:
        columns: Column configurations to validate.
        processor_configs: Processor configurations to validate.
        allowed_references: Names that templates and expressions may reference.

    Returns:
        All violations, in the order the individual validators are run.
    """
    found: list[Violation] = [
        *validate_prompt_templates(columns=columns, allowed_references=allowed_references),
        *validate_code_validation(columns=columns),
        *validate_expression_references(columns=columns, allowed_references=allowed_references),
        *validate_columns_not_all_dropped(columns=columns),
        *validate_drop_columns_processor(columns=columns, processor_configs=processor_configs),
        *validate_schema_transform_processor(columns=columns, processor_configs=processor_configs),
    ]
    # Local-only column checks apply only when local execution is unavailable.
    if not can_run_data_designer_locally():
        found.extend(validate_local_only_columns(columns=columns))
    return found
73
-
74
-
75
def rich_print_violations(violations: list[Violation]) -> None:
    """Print a summary panel followed by one panel per violation.

    Prints nothing when ``violations`` is empty.
    """
    if not violations:
        return

    console = Console(theme=RICH_CONSOLE_THEME)

    total = len(violations)
    plural_suffix = "" if total == 1 else "s"
    summary = Padding(
        Panel(
            f"🔎 Identified {total} validation issue{plural_suffix} in your Data Designer column definitions",
            box=box.SIMPLE,
            highlight=True,
        ),
        (0, 0, 1, 0),
    )

    def _render(violation: Violation) -> Padding:
        # One padded panel per violation; the title names the column when there is one.
        icon = "🛑" if violation.level == ViolationLevel.ERROR else "⚠️"
        heading = f"{icon} {violation.level.upper()} | {violation.type.value.upper()}"
        return Padding(
            Panel(
                f"{heading}\n\n{violation.message}",
                box=box.HORIZONTALS,
                title=f"Column: {violation.column}" if violation.has_column else "",
                padding=(1, 0, 1, 1),
                highlight=True,
            ),
            (0, 0, 1, 0),
        )

    renderables = [summary, *(_render(v) for v in violations)]
    console.print(Group(*renderables), markup=False)
114
-
115
-
116
def validate_prompt_templates(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Check the ``prompt`` / ``system_prompt`` templates of model-generated columns.

    Three checks are made per template:

    * ERROR when the Jinja2 template references a name not in ``allowed_references``.
    * WARNING when a ``prompt`` references nothing and the column has no
      ``multi_modal_context``.
    * WARNING when an allowed name is referenced with ``{name}`` instead of
      ``{{ name }}``.

    Args:
        columns: Column configurations; only model-generated ones are inspected.
        allowed_references: Names a template may reference.

    Returns:
        Violations in column order; names inside a message are sorted so the
        output is stable between runs.
    """
    env = ImmutableSandboxedEnvironment()
    allowed = set(allowed_references)  # built once, reused for every template

    columns_with_prompts = [c for c in columns if column_type_is_model_generated(c.column_type)]

    violations = []
    for column in columns_with_prompts:
        for prompt_type in ("prompt", "system_prompt"):
            prompt = getattr(column, prompt_type, None)
            if prompt is None:
                continue

            # check for invalid references
            prompt_references = set(meta.find_undeclared_variables(env.parse(prompt)))
            # sorted(): set order varies between runs, which made messages unstable
            invalid_references = sorted(prompt_references - allowed)
            num_invalid = len(invalid_references)
            if num_invalid > 0:
                ref_msg = (
                    f"references {num_invalid} columns that do not exist"
                    if num_invalid > 1
                    else "references a column that does not exist"
                )
                quoted_invalid = ", ".join([f"'{r}'" for r in invalid_references])
                message = f"The {prompt_type} template for '{column.name}' {ref_msg}: {quoted_invalid}."
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.INVALID_REFERENCE,
                        message=message,
                        level=ViolationLevel.ERROR,
                    )
                )

            # check for prompts without references
            if (
                prompt_type == "prompt"
                and len(prompt_references) == 0
                and getattr(column, "multi_modal_context", None) is None
            ):
                message = (
                    f"The {prompt_type} template for '{column.name}' does not reference any columns. "
                    "This means the same prompt will be used for every row in the dataset. To increase "
                    "the diversity of the generated data, consider adding references to other columns "
                    "in the prompt template."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.PROMPT_WITHOUT_REFERENCES,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )

            # check for f-string syntax
            f_string_references = _get_string_formatter_references(prompt, allowed_references)
            if len(f_string_references) > 0:
                quoted_f_string = ", ".join([f"'{r}'" for r in f_string_references])
                message = (
                    f"The {prompt_type} template for '{column.name}' references the "
                    f"following columns using f-string syntax: {quoted_f_string}. "
                    "Please use jinja2 syntax to reference columns: {reference} -> {{ reference }}."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.F_STRING_SYNTAX,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )
    return violations
194
-
195
-
196
def validate_code_validation(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Check that code-validation columns point at compatible code columns.

    For each validation column whose ``validator_type`` is ``"code"``, every
    target column must exist (ERROR otherwise), should be an LLM code column
    (WARNING otherwise), and must use the same ``code_lang`` as the validator
    (ERROR otherwise).
    """
    by_name = {col.name: col for col in columns}
    code_validators = [
        col for col in columns if col.column_type == DataDesignerColumnType.VALIDATION and col.validator_type == "code"
    ]

    def _report(owner: str, kind: ViolationType, text: str, severity: ViolationLevel) -> Violation:
        # Every finding of this validator is attached to the validation column.
        return Violation(column=owner, type=kind, message=text, level=severity)

    found = []
    for validation_column in code_validators:
        for target_column_name in validation_column.target_columns:
            # the target has to be one of the configured columns
            if target_column_name not in by_name:
                found.append(
                    _report(
                        validation_column.name,
                        ViolationType.CODE_COLUMN_MISSING,
                        f"Target code column '{target_column_name}' not found in column list.",
                        ViolationLevel.ERROR,
                    )
                )
                continue

            target_column = by_name[target_column_name]
            if target_column.column_type != DataDesignerColumnType.LLM_CODE:
                # target exists but was not generated as code
                found.append(
                    _report(
                        validation_column.name,
                        ViolationType.CODE_COLUMN_NOT_CODE,
                        f"Code validation column '{validation_column.name}' is set to validate "
                        f"code, but the target column was generated as {target_column.column_type}.",
                        ViolationLevel.WARNING,
                    )
                )
            elif target_column.code_lang != validation_column.validator_params.code_lang:
                # both sides are code, but in different languages
                found.append(
                    _report(
                        validation_column.name,
                        ViolationType.CODE_LANG_MISMATCH,
                        f"Code validation column '{validation_column.name}' is set to validate "
                        f"{validation_column.validator_params.code_lang}, but the target column was generated as "
                        f"{target_column.code_lang}.",
                        ViolationLevel.ERROR,
                    )
                )

    return found
251
-
252
-
253
def validate_columns_not_all_dropped(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Report an ERROR when no non-seed column survives dropping.

    Seed-dataset columns are ignored; of the rest, at least one must have a
    falsy ``drop`` flag.
    """
    kept = [col for col in columns if col.column_type != DataDesignerColumnType.SEED_DATASET and not col.drop]
    if kept:
        return []

    explanation = "All generated columns are configured to be dropped. Please mark at least one column with `drop=False`."
    return [
        Violation(
            column=None,
            type=ViolationType.ALL_COLUMNS_DROPPED,
            message=explanation,
            level=ViolationLevel.ERROR,
        )
    ]
272
-
273
-
274
def validate_drop_columns_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
) -> list[Violation]:
    """Report drop-columns processors that name columns which are not defined.

    Every drop-columns processor is checked; each unknown column name yields
    one ERROR violation attached to that name.

    Args:
        columns: Column configurations that define the known column names.
        processor_configs: Processor configurations to inspect.

    Returns:
        One violation per (processor, unknown column) pair, in processor order
        and sorted by column name within a processor.
    """
    all_column_names = {c.name for c in columns}
    violations = []
    for processor_config in processor_configs:
        if processor_config.processor_type != ProcessorType.DROP_COLUMNS:
            continue
        # Keep going after the first offending processor so later ones are
        # checked too; sorted() keeps the report order stable between runs.
        for missing in sorted(set(processor_config.column_names) - all_column_names):
            violations.append(
                Violation(
                    column=missing,
                    type=ViolationType.INVALID_COLUMN,
                    # {missing!r} already adds quotes; wrapping it in '...' doubled them.
                    message=f"Drop columns processor is configured to drop column {missing!r}, but the column is not defined.",
                    level=ViolationLevel.ERROR,
                )
            )
    return violations
293
-
294
-
295
def validate_schema_transform_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfigT],
) -> list[Violation]:
    """Report schema-transform templates that reference undefined columns.

    Each entry of a schema-transform processor's ``template`` mapping is
    scanned for Jinja2 keywords; any keyword that is not a defined column name
    yields one ERROR violation for that entry.
    """
    known_names = {c.name for c in columns}
    found = []

    transform_configs = (p for p in processor_configs if p.processor_type == ProcessorType.SCHEMA_TRANSFORM)
    for config in transform_configs:
        for col, template in config.template.items():
            unknown = set(extract_keywords_from_jinja2_template(template)) - known_names
            if not unknown:
                continue
            quoted_unknown = ", ".join(f"'{k}'" for k in unknown)
            message = f"Ancillary dataset processor attempts to reference columns {quoted_unknown} in the template for '{col}', but the columns are not defined in the dataset."
            found.append(
                Violation(
                    column=None,
                    type=ViolationType.INVALID_REFERENCE,
                    message=message,
                    level=ViolationLevel.ERROR,
                )
            )

    return found
320
-
321
-
322
def validate_expression_references(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Report expression columns that require names outside ``allowed_references``.

    Returns one ERROR violation per (expression column, missing reference) pair.
    """
    expression_columns = [col for col in columns if col.column_type == DataDesignerColumnType.EXPRESSION]
    return [
        Violation(
            column=expr.name,
            type=ViolationType.EXPRESSION_REFERENCE_MISSING,
            message=f"Expression column '{expr.name}' references missing column '{ref}'.",
            level=ViolationLevel.ERROR,
        )
        for expr in expression_columns
        for ref in expr.required_columns
        if ref not in allowed_references
    ]
340
-
341
-
342
def validate_local_only_columns(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Report validation columns that use a local callable validator.

    Returns one ERROR violation per such column.
    """
    validation_columns = [col for col in columns if col.column_type == DataDesignerColumnType.VALIDATION]

    # Local validation columns
    return [
        Violation(
            column=col.name,
            type=ViolationType.LOCAL_ONLY_COLUMN,
            message="Validation using functions are only supported when running Data Designer locally",
            level=ViolationLevel.ERROR,
        )
        for col in validation_columns
        if col.validator_type == ValidatorType.LOCAL_CALLABLE
    ]
360
-
361
-
362
- def _get_string_formatter_references(template: str, allowed_references: list[str]) -> list[str]:
363
- return [
364
- k[1].strip()
365
- for k in Formatter().parse(template)
366
- if len(k) > 1 and k[1] is not None and k[1].strip() in allowed_references
367
- ]
@@ -1,19 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from data_designer.engine.validators.base import BaseValidator, ValidationResult
7
- from data_designer.engine.validators.local_callable import LocalCallableValidator
8
- from data_designer.engine.validators.python import PythonValidator
9
- from data_designer.engine.validators.remote import RemoteValidator
10
- from data_designer.engine.validators.sql import SQLValidator
11
-
12
- __all__ = [
13
- "BaseValidator",
14
- "LocalCallableValidator",
15
- "RemoteValidator",
16
- "ValidationResult",
17
- "PythonValidator",
18
- "SQLValidator",
19
- ]
@@ -1,38 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from abc import ABC, abstractmethod
7
- from typing import Iterator
8
-
9
- from pydantic import BaseModel, ConfigDict
10
- from typing_extensions import Self
11
-
12
-
13
class ValidationOutput(BaseModel):
    """Result of validating a single record."""

    # Fields beyond ``is_valid`` are accepted on the model.
    model_config = ConfigDict(extra="allow")

    # Verdict for the record; may be None.
    is_valid: bool | None
16
-
17
-
18
class ValidationResult(BaseModel):
    """Ordered collection of per-record validation outputs."""

    data: list[ValidationOutput]

    def __len__(self) -> int:
        """Number of outputs held."""
        return len(self.data)

    def __getitem__(self, index: int) -> ValidationOutput:
        """Return the output at position ``index``."""
        return self.data[index]

    def __iter__(self) -> Iterator[ValidationOutput]:
        """Iterate over the outputs in order."""
        return iter(self.data)

    @classmethod
    def empty(cls, size: int) -> Self:
        """Build a result of ``size`` outputs whose ``is_valid`` is None."""
        placeholders = [ValidationOutput(is_valid=None) for _ in range(size)]
        return cls(data=placeholders)
33
-
34
-
35
class BaseValidator(ABC):
    """Abstract base class for validators that check a batch of records."""

    @abstractmethod
    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Validate ``data`` and return the result; subclasses must implement this."""
        ...
@@ -1,39 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- from typing import TYPE_CHECKING
8
-
9
- from data_designer.config.validator_params import LocalCallableValidatorParams
10
- from data_designer.engine.errors import LocalCallableValidationError
11
- from data_designer.engine.processing.gsonschema.validators import validate
12
- from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
13
- from data_designer.lazy_heavy_imports import pd
14
-
15
- if TYPE_CHECKING:
16
- import pandas as pd
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
-
21
class LocalCallableValidator(BaseValidator):
    """Validator that delegates to a user-supplied function operating on a DataFrame."""

    def __init__(self, config: LocalCallableValidatorParams):
        """Keep the validation function and optional output schema from ``config``."""
        self.output_schema = config.output_schema
        self.validation_function = config.validation_function

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Run the configured function over ``data`` and wrap its output.

        The records are turned into a DataFrame and passed to the function.
        The returned frame is converted back to records, each validated as a
        ``ValidationOutput``. When an output schema is configured, the dumped
        result is additionally checked against it.

        Raises:
            LocalCallableValidationError: If the validation function raises.
        """
        frame = pd.DataFrame(data)

        try:
            outcome_frame = self.validation_function(frame)
        except Exception as e:
            logger.error(f"Callback validator failed: {e}")
            raise LocalCallableValidationError(str(e))

        outputs = [ValidationOutput.model_validate(row) for row in outcome_frame.to_dict(orient="records")]
        result = ValidationResult(data=outputs)
        if self.output_schema:
            # Check the dumped result against the configured schema.
            validate(result.model_dump(mode="json"), self.output_schema, no_extra_properties=True)
        return result