data-designer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +15 -0
- data_designer/_version.py +34 -0
- data_designer/cli/README.md +236 -0
- data_designer/cli/__init__.py +6 -0
- data_designer/cli/commands/__init__.py +2 -0
- data_designer/cli/commands/list.py +130 -0
- data_designer/cli/commands/models.py +10 -0
- data_designer/cli/commands/providers.py +11 -0
- data_designer/cli/commands/reset.py +100 -0
- data_designer/cli/controllers/__init__.py +7 -0
- data_designer/cli/controllers/model_controller.py +246 -0
- data_designer/cli/controllers/provider_controller.py +317 -0
- data_designer/cli/forms/__init__.py +20 -0
- data_designer/cli/forms/builder.py +51 -0
- data_designer/cli/forms/field.py +180 -0
- data_designer/cli/forms/form.py +59 -0
- data_designer/cli/forms/model_builder.py +125 -0
- data_designer/cli/forms/provider_builder.py +76 -0
- data_designer/cli/main.py +44 -0
- data_designer/cli/repositories/__init__.py +8 -0
- data_designer/cli/repositories/base.py +39 -0
- data_designer/cli/repositories/model_repository.py +42 -0
- data_designer/cli/repositories/provider_repository.py +43 -0
- data_designer/cli/services/__init__.py +7 -0
- data_designer/cli/services/model_service.py +116 -0
- data_designer/cli/services/provider_service.py +111 -0
- data_designer/cli/ui.py +448 -0
- data_designer/cli/utils.py +47 -0
- data_designer/config/__init__.py +2 -0
- data_designer/config/analysis/column_profilers.py +89 -0
- data_designer/config/analysis/column_statistics.py +274 -0
- data_designer/config/analysis/dataset_profiler.py +60 -0
- data_designer/config/analysis/utils/errors.py +8 -0
- data_designer/config/analysis/utils/reporting.py +188 -0
- data_designer/config/base.py +68 -0
- data_designer/config/column_configs.py +354 -0
- data_designer/config/column_types.py +168 -0
- data_designer/config/config_builder.py +660 -0
- data_designer/config/data_designer_config.py +40 -0
- data_designer/config/dataset_builders.py +11 -0
- data_designer/config/datastore.py +151 -0
- data_designer/config/default_model_settings.py +123 -0
- data_designer/config/errors.py +19 -0
- data_designer/config/interface.py +54 -0
- data_designer/config/models.py +231 -0
- data_designer/config/preview_results.py +32 -0
- data_designer/config/processors.py +41 -0
- data_designer/config/sampler_constraints.py +51 -0
- data_designer/config/sampler_params.py +604 -0
- data_designer/config/seed.py +145 -0
- data_designer/config/utils/code_lang.py +83 -0
- data_designer/config/utils/constants.py +313 -0
- data_designer/config/utils/errors.py +19 -0
- data_designer/config/utils/info.py +88 -0
- data_designer/config/utils/io_helpers.py +273 -0
- data_designer/config/utils/misc.py +81 -0
- data_designer/config/utils/numerical_helpers.py +28 -0
- data_designer/config/utils/type_helpers.py +100 -0
- data_designer/config/utils/validation.py +336 -0
- data_designer/config/utils/visualization.py +427 -0
- data_designer/config/validator_params.py +96 -0
- data_designer/engine/__init__.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +55 -0
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
- data_designer/engine/analysis/column_profilers/registry.py +20 -0
- data_designer/engine/analysis/column_statistics.py +142 -0
- data_designer/engine/analysis/dataset_profiler.py +125 -0
- data_designer/engine/analysis/errors.py +7 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
- data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
- data_designer/engine/column_generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/base.py +61 -0
- data_designer/engine/column_generators/generators/expression.py +63 -0
- data_designer/engine/column_generators/generators/llm_generators.py +172 -0
- data_designer/engine/column_generators/generators/samplers.py +75 -0
- data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
- data_designer/engine/column_generators/generators/validation.py +147 -0
- data_designer/engine/column_generators/registry.py +56 -0
- data_designer/engine/column_generators/utils/errors.py +13 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
- data_designer/engine/configurable_task.py +82 -0
- data_designer/engine/dataset_builders/artifact_storage.py +181 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
- data_designer/engine/dataset_builders/errors.py +13 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
- data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
- data_designer/engine/dataset_builders/utils/dag.py +56 -0
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
- data_designer/engine/dataset_builders/utils/errors.py +13 -0
- data_designer/engine/errors.py +49 -0
- data_designer/engine/model_provider.py +75 -0
- data_designer/engine/models/__init__.py +2 -0
- data_designer/engine/models/errors.py +308 -0
- data_designer/engine/models/facade.py +225 -0
- data_designer/engine/models/litellm_overrides.py +162 -0
- data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer/engine/models/parsers/errors.py +34 -0
- data_designer/engine/models/parsers/parser.py +236 -0
- data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer/engine/models/parsers/tag_parsers.py +60 -0
- data_designer/engine/models/parsers/types.py +82 -0
- data_designer/engine/models/recipes/base.py +79 -0
- data_designer/engine/models/recipes/response_recipes.py +291 -0
- data_designer/engine/models/registry.py +118 -0
- data_designer/engine/models/usage.py +75 -0
- data_designer/engine/models/utils.py +38 -0
- data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer/engine/processing/ginja/ast.py +64 -0
- data_designer/engine/processing/ginja/environment.py +461 -0
- data_designer/engine/processing/ginja/exceptions.py +54 -0
- data_designer/engine/processing/ginja/record.py +30 -0
- data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +8 -0
- data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
- data_designer/engine/processing/gsonschema/types.py +8 -0
- data_designer/engine/processing/gsonschema/validators.py +143 -0
- data_designer/engine/processing/processors/base.py +15 -0
- data_designer/engine/processing/processors/drop_columns.py +46 -0
- data_designer/engine/processing/processors/registry.py +20 -0
- data_designer/engine/processing/utils.py +120 -0
- data_designer/engine/registry/base.py +97 -0
- data_designer/engine/registry/data_designer_registry.py +37 -0
- data_designer/engine/registry/errors.py +10 -0
- data_designer/engine/resources/managed_dataset_generator.py +35 -0
- data_designer/engine/resources/managed_dataset_repository.py +194 -0
- data_designer/engine/resources/managed_storage.py +63 -0
- data_designer/engine/resources/resource_provider.py +46 -0
- data_designer/engine/resources/seed_dataset_data_store.py +66 -0
- data_designer/engine/sampling_gen/column.py +89 -0
- data_designer/engine/sampling_gen/constraints.py +95 -0
- data_designer/engine/sampling_gen/data_sources/base.py +214 -0
- data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
- data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
- data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
- data_designer/engine/sampling_gen/entities/errors.py +8 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
- data_designer/engine/sampling_gen/entities/person.py +142 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
- data_designer/engine/sampling_gen/errors.py +24 -0
- data_designer/engine/sampling_gen/generator.py +121 -0
- data_designer/engine/sampling_gen/jinja_utils.py +60 -0
- data_designer/engine/sampling_gen/people_gen.py +203 -0
- data_designer/engine/sampling_gen/person_constants.py +54 -0
- data_designer/engine/sampling_gen/schema.py +143 -0
- data_designer/engine/sampling_gen/schema_builder.py +59 -0
- data_designer/engine/sampling_gen/utils.py +40 -0
- data_designer/engine/secret_resolver.py +80 -0
- data_designer/engine/validators/__init__.py +17 -0
- data_designer/engine/validators/base.py +36 -0
- data_designer/engine/validators/local_callable.py +34 -0
- data_designer/engine/validators/python.py +245 -0
- data_designer/engine/validators/remote.py +83 -0
- data_designer/engine/validators/sql.py +60 -0
- data_designer/errors.py +5 -0
- data_designer/essentials/__init__.py +137 -0
- data_designer/interface/__init__.py +2 -0
- data_designer/interface/data_designer.py +351 -0
- data_designer/interface/errors.py +16 -0
- data_designer/interface/results.py +55 -0
- data_designer/logging.py +161 -0
- data_designer/plugin_manager.py +83 -0
- data_designer/plugins/__init__.py +6 -0
- data_designer/plugins/errors.py +10 -0
- data_designer/plugins/plugin.py +69 -0
- data_designer/plugins/registry.py +86 -0
- data_designer-0.1.0.dist-info/METADATA +173 -0
- data_designer-0.1.0.dist-info/RECORD +177 -0
- data_designer-0.1.0.dist-info/WHEEL +4 -0
- data_designer-0.1.0.dist-info/entry_points.txt +2 -0
- data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from string import Formatter
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from jinja2 import meta
|
|
11
|
+
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
from rich import box
|
|
14
|
+
from rich.console import Console, Group
|
|
15
|
+
from rich.padding import Padding
|
|
16
|
+
from rich.panel import Panel
|
|
17
|
+
|
|
18
|
+
from ..column_types import ColumnConfigT, DataDesignerColumnType, column_type_is_llm_generated
|
|
19
|
+
from ..processors import ProcessorConfig, ProcessorType
|
|
20
|
+
from ..validator_params import ValidatorType
|
|
21
|
+
from .constants import RICH_CONSOLE_THEME
|
|
22
|
+
from .misc import can_run_data_designer_locally
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ViolationType(str, Enum):
    """Machine-readable category attached to every validation finding.

    Members subclass ``str``, so each one compares equal to its raw string value.
    """

    # No non-seed column would remain after honouring the ``drop`` flags.
    ALL_COLUMNS_DROPPED = "all_columns_dropped"
    # A code validation column targets a column name that is not defined.
    CODE_COLUMN_MISSING = "code_column_missing"
    # A code validation column targets a column that is not an LLM code column.
    CODE_COLUMN_NOT_CODE = "code_column_not_code"
    # Validator and target column disagree on the code language.
    CODE_LANG_MISMATCH = "code_lang_mismatch"
    # An expression column requires a column that is not an allowed reference.
    EXPRESSION_REFERENCE_MISSING = "expression_reference_missing"
    # A prompt references a column with ``{name}`` instead of ``{{ name }}``.
    F_STRING_SYNTAX = "f_string_syntax"
    # The column relies on a feature that is only usable when running locally.
    LOCAL_ONLY_COLUMN = "local_only_column"
    # A drop-columns processor names a column that is not defined.
    INVALID_COLUMN = "invalid_column"
    # Not emitted by the validators in this module.
    INVALID_MODEL_CONFIG = "invalid_model_config"
    # A prompt template references a variable that is not an allowed reference.
    INVALID_REFERENCE = "invalid_reference"
    # A prompt template references no variables at all.
    PROMPT_WITHOUT_REFERENCES = "prompt_without_references"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ViolationLevel(str, Enum):
    """Severity attached to a validation finding.

    Members subclass ``str``, so string methods such as ``upper()`` work on them directly.
    """

    ERROR = "ERROR"
    WARNING = "WARNING"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Violation(BaseModel):
    """One finding reported by the configuration validators in this module."""

    # Name of the column the finding is attached to; ``None`` when it is not tied to one column.
    column: Optional[str] = None
    # Category of the finding.
    type: ViolationType
    # Human-readable explanation shown to the user.
    message: str
    # Severity of the finding.
    level: ViolationLevel

    @property
    def has_column(self) -> bool:
        """Return ``True`` when this violation is attached to a specific column."""
        return not (self.column is None)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def validate_data_designer_config(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfig],
    allowed_references: list[str],
) -> list[Violation]:
    """Run every configuration validator and gather their findings into one list.

    Args:
        columns: Column configurations to validate.
        processor_configs: Processor configurations to validate against the columns.
        allowed_references: Names that templates and expressions may reference.

    Returns:
        All violations, in the order the individual validators produced them.
    """
    found: list[Violation] = []
    found += validate_prompt_templates(columns=columns, allowed_references=allowed_references)
    found += validate_code_validation(columns=columns)
    found += validate_expression_references(columns=columns, allowed_references=allowed_references)
    found += validate_columns_not_all_dropped(columns=columns)
    found += validate_drop_columns_processor(columns=columns, processor_configs=processor_configs)

    # The local-only check is only relevant when local execution is unavailable.
    if can_run_data_designer_locally():
        return found

    found += validate_local_only_columns(columns=columns)
    return found
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def rich_print_violations(violations: list[Violation]) -> None:
    """Pretty-print validation findings to the console using ``rich``.

    Prints nothing when ``violations`` is empty. Otherwise a summary panel is
    printed first, followed by one panel per violation.

    Args:
        violations: Findings to display.
    """
    if not violations:
        return

    total = len(violations)
    plural_suffix = "" if total == 1 else "s"
    bottom_margin = (0, 0, 1, 0)

    summary = Panel(
        f"🔎 Identified {total} validation issue{plural_suffix} in your Data Designer column definitions",
        box=box.SIMPLE,
        highlight=True,
    )
    renderables = [Padding(summary, bottom_margin)]

    for violation in violations:
        icon = "🛑" if violation.level == ViolationLevel.ERROR else "⚠️"
        heading = f"{icon} {violation.level.upper()} | {violation.type.value.upper()}"
        detail = Panel(
            f"{heading}\n\n{violation.message}",
            box=box.HORIZONTALS,
            title=f"Column: {violation.column}" if violation.has_column else "",
            padding=(1, 0, 1, 1),
            highlight=True,
        )
        renderables.append(Padding(detail, bottom_margin))

    # markup=False so square brackets inside messages are printed literally.
    Console(theme=RICH_CONSOLE_THEME).print(Group(*renderables), markup=False)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def validate_prompt_templates(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Check the Jinja2 prompt templates of LLM-generated columns.

    For every LLM-generated column, the ``prompt`` and ``system_prompt`` attributes
    (when present and not ``None``) are inspected for three problems:

    * variables that are not listed in ``allowed_references`` (ERROR);
    * a ``prompt`` that references no variables while the column has no
      ``multi_modal_context`` (WARNING);
    * allowed references written in ``str.format`` style, ``{name}``, rather than
      Jinja2 style, ``{{ name }}`` (WARNING).

    Args:
        columns: Column configurations to inspect; non-LLM columns are skipped.
        allowed_references: Names a template is allowed to reference.

    Returns:
        The violations found, in column order and then prompt-type order.
    """
    env = ImmutableSandboxedEnvironment()
    # Built once; it is the same for every column and prompt.
    allowed = set(allowed_references)

    violations = []
    for column in columns:
        if not column_type_is_llm_generated(column.column_type):
            continue

        for prompt_type in ("prompt", "system_prompt"):
            prompt = getattr(column, prompt_type, None)
            if prompt is None:
                continue

            prompt_references = set(meta.find_undeclared_variables(env.parse(prompt)))

            # check for invalid references
            # Sorted so the message is stable: iteration order of a set of strings
            # can differ from one interpreter run to the next.
            invalid_references = sorted(prompt_references - allowed)
            num_invalid = len(invalid_references)
            if num_invalid > 0:
                ref_msg = (
                    f"references {num_invalid} columns that do not exist"
                    if num_invalid > 1
                    else "references a column that does not exist"
                )
                invalid_listing = ", ".join(f"'{r}'" for r in invalid_references)
                message = f"The {prompt_type} template for '{column.name}' {ref_msg}: {invalid_listing}."
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.INVALID_REFERENCE,
                        message=message,
                        level=ViolationLevel.ERROR,
                    )
                )

            # check for prompts without references
            if (
                prompt_type == "prompt"
                and not prompt_references
                and getattr(column, "multi_modal_context", None) is None
            ):
                message = (
                    f"The {prompt_type} template for '{column.name}' does not reference any columns. "
                    "This means the same prompt will be used for every row in the dataset. To increase "
                    "the diversity of the generated data, consider adding references to other columns "
                    "in the prompt template."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.PROMPT_WITHOUT_REFERENCES,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )

            # check for f-string syntax
            f_string_references = _get_string_formatter_references(prompt, allowed_references)
            if len(f_string_references) > 0:
                f_string_listing = ", ".join(f"'{r}'" for r in f_string_references)
                # The last fragment is deliberately NOT an f-string: its braces are literal text.
                message = (
                    f"The {prompt_type} template for '{column.name}' references the "
                    f"following columns using f-string syntax: {f_string_listing}. "
                    "Please use jinja2 syntax to reference columns: {reference} -> {{ reference }}."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.F_STRING_SYNTAX,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )
    return violations
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def validate_code_validation(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Check that code validation columns point at compatible code columns.

    For each target of every validation column whose ``validator_type`` is ``"code"``:

    * the target must be a defined column (ERROR otherwise);
    * the target should be an LLM code column (WARNING otherwise);
    * the target's ``code_lang`` must match the validator's ``code_lang`` (ERROR otherwise).

    Args:
        columns: All column configurations.

    Returns:
        The violations found, at most one per (validation column, target) pair.
    """
    by_name = {col.name: col for col in columns}

    found = []
    for validator in columns:
        is_code_validator = (
            validator.column_type == DataDesignerColumnType.VALIDATION and validator.validator_type == "code"
        )
        if not is_code_validator:
            continue

        for target_name in validator.target_columns:
            if target_name not in by_name:
                # The target does not exist, so nothing else can be checked for it.
                kind = ViolationType.CODE_COLUMN_MISSING
                severity = ViolationLevel.ERROR
                text = f"Target code column '{target_name}' not found in column list."
            else:
                target = by_name[target_name]
                if target.column_type != DataDesignerColumnType.LLM_CODE:
                    kind = ViolationType.CODE_COLUMN_NOT_CODE
                    severity = ViolationLevel.WARNING
                    text = (
                        f"Code validation column '{validator.name}' is set to validate "
                        f"code, but the target column was generated as {target.column_type}."
                    )
                elif target.code_lang != validator.validator_params.code_lang:
                    kind = ViolationType.CODE_LANG_MISMATCH
                    severity = ViolationLevel.ERROR
                    text = (
                        f"Code validation column '{validator.name}' is set to validate "
                        f"{validator.validator_params.code_lang}, but the target column was generated as "
                        f"{target.code_lang}."
                    )
                else:
                    # Target exists, is a code column, and the languages agree.
                    continue

            found.append(
                Violation(
                    column=validator.name,
                    type=kind,
                    message=text,
                    level=severity,
                )
            )

    return found
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def validate_columns_not_all_dropped(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Ensure at least one non-seed column survives the ``drop`` flags.

    Args:
        columns: All column configurations.

    Returns:
        A single ERROR violation (not tied to any column) when no non-seed
        column is kept, otherwise an empty list.
    """
    kept = [
        col for col in columns if not (col.column_type == DataDesignerColumnType.SEED_DATASET or col.drop)
    ]
    if kept:
        return []

    return [
        Violation(
            column=None,
            type=ViolationType.ALL_COLUMNS_DROPPED,
            message="All generated columns are configured to be dropped. Please mark at least one column with `drop=False`.",
            level=ViolationLevel.ERROR,
        )
    ]
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def validate_drop_columns_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfig],
) -> list[Violation]:
    """Check that drop-columns processors only name columns that are defined.

    Every processor whose ``processor_type`` is ``ProcessorType.DROP_COLUMNS`` is
    inspected, and one ERROR violation is reported per unknown column name per
    processor. Other processor types are ignored.

    Args:
        columns: All column configurations; their names form the set of known columns.
        processor_configs: Processor configurations to check.

    Returns:
        The violations found, in processor order and then sorted by column name.
    """
    defined_names = {c.name for c in columns}

    violations = []
    for processor_config in processor_configs:
        if processor_config.processor_type != ProcessorType.DROP_COLUMNS:
            continue

        # Keep going after a failing processor so that every misconfigured
        # processor is reported in one pass. Sorting keeps the output order stable.
        unknown_names = sorted(set(processor_config.column_names) - defined_names)
        violations.extend(
            Violation(
                column=name,
                type=ViolationType.INVALID_COLUMN,
                message=f"Drop columns processor is configured to drop column '{name}', but the column is not defined.",
                level=ViolationLevel.ERROR,
            )
            for name in unknown_names
        )
    return violations
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def validate_expression_references(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Check that expression columns only depend on allowed references.

    Args:
        columns: All column configurations; only expression columns are inspected.
        allowed_references: Names an expression is allowed to reference.

    Returns:
        One ERROR violation per (expression column, missing reference) pair.
    """
    return [
        Violation(
            column=expr_col.name,
            type=ViolationType.EXPRESSION_REFERENCE_MISSING,
            message=f"Expression column '{expr_col.name}' references missing column '{needed}'.",
            level=ViolationLevel.ERROR,
        )
        for expr_col in columns
        if expr_col.column_type == DataDesignerColumnType.EXPRESSION
        for needed in expr_col.required_columns
        if needed not in allowed_references
    ]
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def validate_local_only_columns(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Flag validation columns that use a local callable validator.

    Args:
        columns: All column configurations; only validation columns are inspected.

    Returns:
        One ERROR violation per validation column whose validator type is
        ``ValidatorType.LOCAL_CALLABLE``.
    """
    return [
        Violation(
            column=col.name,
            type=ViolationType.LOCAL_ONLY_COLUMN,
            message="Validation using functions are only supported when running Data Designer locally",
            level=ViolationLevel.ERROR,
        )
        for col in columns
        if col.column_type == DataDesignerColumnType.VALIDATION
        and col.validator_type == ValidatorType.LOCAL_CALLABLE
    ]
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _get_string_formatter_references(template: str, allowed_references: list[str]) -> list[str]:
|
|
332
|
+
return [
|
|
333
|
+
k[1].strip()
|
|
334
|
+
for k in Formatter().parse(template)
|
|
335
|
+
if len(k) > 1 and k[1] is not None and k[1].strip() in allowed_references
|
|
336
|
+
]
|