data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +8 -11
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
- data_designer-0.4.0.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -121
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -48
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc1.dist-info/RECORD +0 -196
- data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
data_designer/engine/validators/python.py
DELETED

@@ -1,254 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import ast
-import json
-import logging
-import subprocess
-import tempfile
-from collections import defaultdict
-from pathlib import Path
-from typing import TYPE_CHECKING
-from uuid import uuid4
-
-from pydantic import BaseModel
-from ruff.__main__ import find_ruff_bin
-
-from data_designer.config.validator_params import CodeValidatorParams
-from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
-from data_designer.lazy_heavy_imports import pd
-
-if TYPE_CHECKING:
-    import pandas as pd
-
-logger = logging.getLogger(__name__)
-
-PYLINT_ERROR_CATEGORIES_ORDERED = [
-    "fatal",
-    "error",
-    "warning",
-    "convention",
-    "refactor",
-]
-PYLINT_VALID_LEVELS = {"none", "warning", "convention", "refactor"}
-
-TYPE_FROM_SYMBOL = {
-    "E": "refactor",
-    "F": "error",
-    "SIM": "refactor",
-    "PLC": "convention",
-    "PLE": "error",
-    "PLR": "refactor",
-    "PLW": "warning",
-    "SyntaxError": "fatal",
-}
-
-PYTHON_MESSAGES_FIELD = "python_linter_messages"
-RECORD_ID_COLUMN_NAME = "internal_code_record_id"
-
-
-class PythonValidationStat(BaseModel):
-    fatal: int = 0
-    error: int = 0
-    warning: int = 0
-    refactor: int = 0
-    convention: int = 0
-    statement: int = 0
-
-    @property
-    def score(self) -> float:
-        # https://pylint.pycqa.org/en/latest/user_guide/configuration/all-options.html#evaluation
-        if self.statement == 0:  # prevent division by zero down below
-            self.statement = max(1, self.statement)
-        return max(
-            0,
-            (
-                0
-                if self.fatal
-                else 10.0
-                - ((float(5 * self.error + self.warning + self.refactor + self.convention) / self.statement) * 10)
-            ),
-        )
-
-
-class PythonLinterMessage(BaseModel):
-    type: str
-    symbol: str
-    line: int
-    column: int
-    message: str
-
-    @property
-    def type_sort_order(self) -> int:
-        return PYLINT_ERROR_CATEGORIES_ORDERED.index(self.type)
-
-
-class PythonLinterMessages(BaseModel):
-    _messages: list[PythonLinterMessage] = []
-
-    @property
-    def messages(self) -> list[PythonLinterMessage]:
-        # Ordered by severity first then by line number
-        return sorted(self._messages, key=lambda msg: (msg.type_sort_order, msg.line))
-
-    def add(self, message: PythonLinterMessage) -> None:
-        self._messages.append(message)
-
-    def get_count_by_type(self) -> dict[str, int]:
-        count_by_type = defaultdict(int)
-        for message in self.messages:
-            count_by_type[message.type] += 1
-        return dict(count_by_type)
-
-    @property
-    def is_empty(self) -> bool:
-        return len(self.messages) == 0
-
-    @property
-    def severity(self) -> str:
-        if self.is_empty:
-            return "none"
-        return self.messages[0].type
-
-    @property
-    def is_valid(self) -> bool:
-        return self.is_empty or self.messages[0].type in PYLINT_VALID_LEVELS
-
-
-class PythonValidator(BaseValidator):
-    def __init__(self, config: CodeValidatorParams):
-        self.config = config
-
-    def run_validation(self, data: list[dict]) -> ValidationResult:
-        df = pd.DataFrame(data)
-
-        if len(df.columns) > 1:
-            raise ValueError("Python validator assumes single column input")
-        target_column = df.columns[0]
-
-        df.loc[:, RECORD_ID_COLUMN_NAME] = [uuid4() for _ in range(df.shape[0])]
-        with tempfile.TemporaryDirectory() as temp_dir:
-            _ = df.apply(
-                self._write_code_to_file,
-                args=(target_column, temp_dir),
-                axis=1,
-            )
-            results = self._validate_files_in_path(path=temp_dir)
-
-        records = df.to_dict(orient="records")
-
-        ordered_results = []
-        for record in records:
-            module_id = self._get_module_name(record[RECORD_ID_COLUMN_NAME], target_column)
-            result = results.get(module_id)
-            if result is not None:
-                ordered_results.append(result)
-
-        return ValidationResult(data=ordered_results)
-
-    def _validate_files_in_path(self, path: str) -> dict[str, ValidationOutput]:
-        lint_results = self._run_linter(path)
-
-        scores_by_module = self._get_scores(
-            {
-                module: messages.get_count_by_type()
-                | {"statement": self._count_python_statements(f"{path}/{module}.py")}
-                for module, messages in lint_results.items()
-            }
-        )
-
-        validation_result = {}
-        for module, score in scores_by_module.items():
-            messages = lint_results.get(module, PythonLinterMessages())
-            metadata = {
-                "python_linter_score": score,
-                "python_linter_severity": messages.severity,
-                PYTHON_MESSAGES_FIELD: [m.model_dump() for m in messages.messages],
-            }
-            validation_result[module] = ValidationOutput(is_valid=messages.is_valid, **metadata)
-        return validation_result
-
-    def _write_code_to_file(self, row: pd.Series, code_column: str, path: str) -> None:
-        with open(f"{path}/{self._get_module_name(row[RECORD_ID_COLUMN_NAME], code_column)}.py", "w") as file:
-            file.write(row[code_column])
-
-    @staticmethod
-    def _get_module_name(record_id: str, column_name: str) -> str:
-        return f"{record_id}_{column_name}"
-
-    @staticmethod
-    def _run_linter(codebase_path: str) -> dict[str, PythonLinterMessages]:
-        # Create empty dict for output
-        processed = {}
-        for file in Path(codebase_path).glob("*.py"):
-            processed[file.stem] = PythonLinterMessages()
-
-        # Run ruff linter with JSON output
-        ruff_bin = find_ruff_bin()
-
-        ruff_exec = subprocess.run(
-            [
-                ruff_bin,
-                "check",
-                "--select",
-                "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
-                "--output-format=json",
-                codebase_path,
-            ],
-            text=True,
-            capture_output=True,
-            check=False,
-            cwd=Path.cwd(),
-        )
-        ruff_output = ruff_exec.stdout
-
-        # Parse JSON output
-        try:
-            diagnostics = json.loads(ruff_output)
-        except json.JSONDecodeError as e:
-            raise RuntimeError(f"Failed to parse ruff JSON output: {e}")
-
-        if not diagnostics:
-            return processed  # no errors or warnings
-
-        for diagnostic in diagnostics:
-            filename = diagnostic["filename"]
-            code = diagnostic["code"]
-            location = diagnostic["location"]
-            message = diagnostic["message"]
-
-            # Extract alphabetic prefix from code for type mapping
-            alpha_prefix = "".join(c for c in code if c.isalpha())
-            error_type = TYPE_FROM_SYMBOL.get(alpha_prefix, "warning")
-
-            processed[Path(filename).stem].add(
-                PythonLinterMessage(
-                    type=error_type,
-                    symbol=code,
-                    line=location["row"],
-                    column=location["column"],
-                    message=message,
-                )
-            )
-
-        return processed
-
-    @staticmethod
-    def _get_scores(stats_by_module: dict[str, dict[str, int]]) -> dict[str, float]:
-        scores = {}
-        for key, item in stats_by_module.items():
-            stat = PythonValidationStat(**item)
-            scores[key] = stat.score
-        return scores
-
-    @staticmethod
-    def _count_python_statements(file_path: str) -> int:
-        """Count the number of statements in a Python file."""
-        try:
-            with open(file_path, "r", encoding="utf-8") as f:
-                tree = ast.parse(f.read())
-            return sum(1 for node in ast.walk(tree) if isinstance(node, ast.stmt))
-        except Exception:
-            return 0
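For context, `PythonValidationStat.score` reimplements pylint's default evaluation formula on top of ruff's message counts: 10 minus ten times the weighted violation rate per statement, clamped at 0, and forced to 0 whenever a fatal message is present. A minimal worked example with hypothetical counts (not taken from the diff):

```python
# Hypothetical message counts for one generated module.
error, warning, refactor, convention, statement = 1, 2, 0, 3, 50

# Same arithmetic as PythonValidationStat.score when fatal == 0.
score = max(0.0, 10.0 - (float(5 * error + warning + refactor + convention) / statement) * 10)
print(score)  # 8.0 -> one error and a few style nits in 50 statements still scores well
```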
data_designer/engine/validators/remote.py
DELETED

@@ -1,89 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import logging
-from typing import TYPE_CHECKING
-
-from httpx_retries import Retry, RetryTransport
-
-from data_designer.config.validator_params import RemoteValidatorParams
-from data_designer.engine.errors import RemoteValidationSchemaError
-from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
-from data_designer.engine.processing.gsonschema.validators import validate
-from data_designer.engine.validators.base import BaseValidator, ValidationResult
-from data_designer.lazy_heavy_imports import httpx
-
-if TYPE_CHECKING:
-    import httpx
-
-logger = logging.getLogger(__name__)
-
-
-class RemoteEndpointClient:
-    """Client for making parallel HTTP requests to remote endpoints with retry, timeout, and auth support."""
-
-    def __init__(
-        self,
-        config: RemoteValidatorParams,
-    ):
-        """
-        Initialize the remote endpoint client.
-
-        Args:
-            config: Remote validator parameters
-        """
-        self.endpoint_url = config.endpoint_url
-        self.output_schema = config.output_schema
-        self.timeout = config.timeout
-        self.max_retries = config.max_retries
-        self.retry_backoff = config.retry_backoff
-
-    def post_to_remote_endpoint(self, content: dict) -> dict:
-        """
-        Make a single HTTP request with retry logic.
-
-        Args:
-            content: The content to be posted to the remote endpoint
-
-        Returns:
-            The JSON response from the remote endpoint
-
-        Raises:
-            httpx.RequestError: If all retry attempts fail
-            httpx.HTTPStatusError: If the server returns an error status
-        """
-        retry = Retry(
-            total=self.max_retries,
-            backoff_factor=self.retry_backoff,
-            status_forcelist=[429, 500, 502, 503, 504],
-        )
-        transport = RetryTransport(retry=retry)
-
-        with httpx.Client(
-            timeout=httpx.Timeout(self.timeout),
-            transport=transport,
-        ) as http_client:
-            response = http_client.post(
-                self.endpoint_url,
-                json=content,
-            )
-            response.raise_for_status()
-
-            response_json = response.json()
-            if self.output_schema:
-                try:
-                    validate(response_json, self.output_schema, no_extra_properties=True)
-                except JSONSchemaValidationError as exc:
-                    raise RemoteValidationSchemaError(str(exc)) from exc
-            return response_json
-
-
-class RemoteValidator(BaseValidator):
-    def __init__(self, config: RemoteValidatorParams):
-        self.remote_endpoint_client = RemoteEndpointClient(config=config)
-
-    def run_validation(self, data: list[dict]) -> ValidationResult:
-        result = self.remote_endpoint_client.post_to_remote_endpoint(content={"data": data})
-        return ValidationResult.model_validate(result)
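The retry behavior above comes entirely from `httpx_retries`: `RetryTransport` wraps the client's transport and replays requests that fail with one of the listed status codes, backing off between attempts. A minimal standalone sketch of the same pattern (the endpoint URL, payload, and values here are placeholders, not from the package):

```python
import httpx
from httpx_retries import Retry, RetryTransport

# Retry up to 3 times on transient failures, with exponential backoff.
retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[429, 500, 502, 503, 504])

with httpx.Client(timeout=httpx.Timeout(30.0), transport=RetryTransport(retry=retry)) as client:
    response = client.post("https://validator.example.com/check", json={"data": [{"sql": "SELECT 1"}]})
    response.raise_for_status()  # raises httpx.HTTPStatusError once retries are exhausted
    print(response.json())
```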
data_designer/engine/validators/sql.py
DELETED

@@ -1,65 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import logging
-import re
-from typing import TYPE_CHECKING
-
-from data_designer.config.utils.code_lang import CodeLang
-from data_designer.config.validator_params import CodeValidatorParams
-from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
-from data_designer.lazy_heavy_imports import pd, sqlfluff
-
-if TYPE_CHECKING:
-    import pandas as pd
-    import sqlfluff
-
-sqlfluff_logger = logging.getLogger("sqlfluff")
-sqlfluff_logger.setLevel(logging.WARNING)
-
-
-class SQLValidator(BaseValidator):
-    def __init__(self, config: CodeValidatorParams):
-        self.config = config
-
-    def run_validation(self, data: list[dict]) -> ValidationResult:
-        df = pd.DataFrame(data)
-
-        if len(df.columns) > 1:
-            raise ValueError("SQL validator assumes single column input")
-        target_column = df.columns[0]
-
-        records = df.to_dict(orient="records")
-
-        results = []
-        for record in records:
-            result = self._validate_query(record[target_column])
-            results.append(result)
-
-        return ValidationResult(data=results)
-
-    def _validate_query(self, content: str) -> ValidationResult:
-        try:
-            result = sqlfluff.lint(
-                content,
-                dialect=CodeLang.parse_dialect(self.config.code_lang),
-            )
-            prs_errors = [res for res in result if res["code"].startswith("PRS")]
-            error_messages = "\n".join([f"{error['code']}: {error['description']}" for error in prs_errors])
-            decimal_pattern = re.compile(r"DECIMAL\(\d+\)")
-            decimal_issues = decimal_pattern.findall(content)
-            if decimal_issues:
-                error_messages += "\nCustom Check: Found DECIMAL definitions without a scale, which may be incorrect."
-            if error_messages:
-                return ValidationOutput(
-                    is_valid=False,
-                    error_messages=error_messages,
-                )
-            return ValidationOutput(is_valid=True, error_messages="")
-        except Exception as e:
-            return ValidationOutput(
-                is_valid=False,
-                error_messages=f"Exception during SQL parsing: {e}",
-            )
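Note that `SQLValidator` treats only sqlfluff's parse-level findings (codes prefixed `PRS`) as failures, plus one custom regex check for `DECIMAL(n)` declared without a scale. A minimal sketch of the underlying sqlfluff call, with an illustrative query and the dialect hard-coded to `ansi` instead of being resolved via `CodeLang`:

```python
import sqlfluff

# sqlfluff.lint returns a list of dicts; parse errors carry a "PRS"-prefixed code.
violations = sqlfluff.lint("SELEC id FROM users", dialect="ansi")
for v in violations:
    if v["code"].startswith("PRS"):
        print(f"{v['code']}: {v['description']}")
```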
data_designer/essentials/__init__.py
DELETED

@@ -1,33 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-from data_designer.config.default_model_settings import resolve_seed_default_model_settings
-from data_designer.config.exports import *  # noqa: F403
-from data_designer.config.run_config import RunConfig
-from data_designer.config.validator_params import LocalCallableValidatorParams
-from data_designer.interface.data_designer import DataDesigner
-from data_designer.logging import LoggingConfig, configure_logging
-
-configure_logging(LoggingConfig.default())
-
-# Resolve default model settings on import to ensure they are available when the library is used.
-resolve_seed_default_model_settings()
-
-
-def get_essentials_exports() -> list[str]:
-    logging = [
-        configure_logging.__name__,
-        LoggingConfig.__name__,
-    ]
-    local = [
-        DataDesigner.__name__,
-        LocalCallableValidatorParams.__name__,
-        RunConfig.__name__,
-    ]
-
-    return logging + local + get_config_exports()  # noqa: F405
-
-
-__all__ = get_essentials_exports()
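A standalone sketch of the export-list pattern used above: `__all__` is assembled from each re-exported object's `__name__`, so renaming a class cannot silently desynchronize the export list. The class bodies here are placeholders, not the package's real definitions:

```python
class RunConfig: ...
class DataDesigner: ...

def get_exports() -> list[str]:
    # Referencing __name__ ties the export list to the actual objects.
    return [obj.__name__ for obj in (RunConfig, DataDesigner)]

__all__ = get_exports()
```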
data_designer/lazy_heavy_imports.py
DELETED

@@ -1,54 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Lazy imports facade for heavy third-party dependencies.
-
-This module provides a centralized facade that lazily imports heavy dependencies
-only when accessed, significantly improving import performance.
-
-Usage:
-    from data_designer.lazy_heavy_imports import pd, np, faker, litellm
-
-    df = pd.DataFrame(...)
-    arr = np.array([1, 2, 3])
-    fake = faker.Faker()
-"""
-
-from __future__ import annotations
-
-import importlib
-
-# Mapping of lazy import names to their actual module paths
-_LAZY_IMPORTS = {
-    "pd": "pandas",
-    "np": "numpy",
-    "pq": "pyarrow.parquet",
-    "pa": "pyarrow",
-    "faker": "faker",
-    "litellm": "litellm",
-    "sqlfluff": "sqlfluff",
-    "httpx": "httpx",
-    "duckdb": "duckdb",
-    "nx": "networkx",
-    "scipy": "scipy",
-    "jsonschema": "jsonschema",
-}
-
-
-def __getattr__(name: str) -> object:
-    """Lazily import heavy third-party dependencies when accessed.
-
-    This allows fast imports of data_designer while deferring loading of heavy
-    libraries until they're actually needed.
-    """
-    if name in _LAZY_IMPORTS:
-        module_name = _LAZY_IMPORTS[name]
-        return importlib.import_module(module_name)
-
-    raise AttributeError(f"module 'data_designer.lazy_heavy_imports' has no attribute {name!r}")
-
-
-def __dir__() -> list[str]:
-    """Return list of available lazy imports."""
-    return list(_LAZY_IMPORTS.keys())
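This is the PEP 562 module-level `__getattr__` pattern. One way to observe the laziness against the 0.3.8rc1 wheel (illustrative, and assuming nothing else in the process has already imported pandas): the heavy module only appears in `sys.modules` after the facade attribute is first touched.

```python
import sys

import data_designer.lazy_heavy_imports as lazy

print("pandas" in sys.modules)   # False: importing the facade loads nothing heavy
df = lazy.pd.DataFrame({"a": [1, 2]})  # first access runs importlib.import_module("pandas")
print("pandas" in sys.modules)   # True
```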
data_designer/logging.py
DELETED

@@ -1,163 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-from __future__ import annotations
-
-import logging
-import random
-import sys
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import TextIO
-
-from pythonjsonlogger import jsonlogger
-
-
-@dataclass
-class LoggerConfig:
-    name: str
-    level: str
-
-
-@dataclass
-class OutputConfig:
-    destination: TextIO | Path
-    structured: bool
-
-
-@dataclass
-class LoggingConfig:
-    logger_configs: list[LoggerConfig]
-    output_configs: list[OutputConfig]
-    root_level: str = "INFO"
-    to_silence: list[str] = field(default_factory=lambda: _DEFAULT_NOISY_LOGGERS)
-
-    @classmethod
-    def default(cls):
-        return LoggingConfig(
-            logger_configs=[LoggerConfig(name="data_designer", level="INFO")],
-            output_configs=[OutputConfig(destination=sys.stderr, structured=False)],
-        )
-
-    @classmethod
-    def debug(cls):
-        return LoggingConfig(
-            logger_configs=[LoggerConfig(name="data_designer", level="DEBUG")],
-            output_configs=[OutputConfig(destination=sys.stderr, structured=False)],
-        )
-
-
-class RandomEmoji:
-    """A generator for various themed emoji collections."""
-
-    @staticmethod
-    def cooking() -> str:
-        """Get a random cooking or food preparation emoji."""
-        return random.choice(["👨‍🍳", "👩‍🍳", "🍳", "🥘", "🍲", "🔪", "🥄", "🍴", "⏲️", "🥗"])
-
-    @staticmethod
-    def data() -> str:
-        """Get a random data or analytics emoji."""
-        return random.choice(["📊", "📈", "📉", "💾", "💿", "📀", "🗄️", "📁", "📂", "🗃️"])
-
-    @staticmethod
-    def generating() -> str:
-        """Get a random generating or creating emoji."""
-        return random.choice(["🏭", "⚙️", "🔨", "🛠️", "🏗️", "🎨", "✍️", "📝", "🔧", "⚒️"])
-
-    @staticmethod
-    def loading() -> str:
-        """Get a random loading or waiting emoji."""
-        return random.choice(["⏳", "⌛", "🔄", "♻️", "🔃", "⏰", "⏱️", "⏲️", "📡", "🌀"])
-
-    @staticmethod
-    def magic() -> str:
-        """Get a random magical or special effect emoji."""
-        return random.choice(["✨", "⭐", "🌟", "💫", "🪄", "🔮", "🎩", "🌈", "💎", "🦄"])
-
-    @staticmethod
-    def previewing() -> str:
-        """Get a random previewing or looking ahead emoji."""
-        return random.choice(["👀", "📺", "🔁", "👁️", "🔭", "🕵️", "🧐", "📸", "🎥", "🖼️"])
-
-    @staticmethod
-    def speed() -> str:
-        """Get a random speed or fast emoji."""
-        return random.choice(["⚡", "💨", "🏃", "🏎️", "🚄", "✈️", "💥", "⏩", "🏃‍♂️", "🏃‍♀️"])
-
-    @staticmethod
-    def start() -> str:
-        """Get a random emoji representing starting or launching something."""
-        return random.choice(["🚀", "▶️", "🎬", "🌅", "🏁", "🎯", "🚦", "🔔", "📣", "🎺"])
-
-    @staticmethod
-    def success() -> str:
-        """Get a random success or celebration emoji."""
-        return random.choice(["🎉", "🎊", "👏", "🙌", "🎆", "🍾", "☀️", "🏆", "✅", "🥳"])
-
-    @staticmethod
-    def thinking() -> str:
-        """Get a random thinking or processing emoji."""
-        return random.choice(["🤔", "💭", "🧠", "💡", "🔍", "🔎", "🤨", "🧐", "📝", "🧮"])
-
-    @staticmethod
-    def working() -> str:
-        """Get a random working or in-progress emoji."""
-        return random.choice(["⚙️", "🔧", "🔨", "⚒️", "🛠️", "💼", "👷", "🏗️", "🪛", "👨‍💻"])
-
-
-def configure_logging(config: LoggingConfig) -> None:
-    root_logger = logging.getLogger()
-
-    # Remove all handlers
-    root_logger.handlers.clear()
-
-    # Create and attach handler(s)
-    handlers = [_create_handler(output_config) for output_config in config.output_configs]
-    for handler in handlers:
-        root_logger.addHandler(handler)
-
-    # Set levels
-    root_logger.setLevel(config.root_level)
-    for logger_config in config.logger_configs:
-        logger = logging.getLogger(logger_config.name)
-        logger.setLevel(logger_config.level)
-
-    # Adjust noisy loggers
-    for name in config.to_silence:
-        quiet_noisy_logger(name)
-
-
-def quiet_noisy_logger(name: str) -> None:
-    logger = logging.getLogger(name)
-    logger.handlers.clear()
-    logger.setLevel(logging.WARNING)
-
-
-def _create_handler(output_config: OutputConfig) -> logging.Handler:
-    if isinstance(output_config.destination, Path):
-        handler = logging.FileHandler(str(output_config.destination))
-    else:
-        handler = logging.StreamHandler()
-
-    if output_config.structured:
-        formatter = _make_json_formatter()
-    else:
-        formatter = _make_stream_formatter()
-
-    handler.setFormatter(formatter)
-    return handler
-
-
-def _make_json_formatter() -> logging.Formatter:
-    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
-    return jsonlogger.JsonFormatter(log_format)
-
-
-def _make_stream_formatter() -> logging.Formatter:
-    log_format = "[%(asctime)s] [%(levelname)s] %(message)s"
-    time_format = "%H:%M:%S"
-    return logging.Formatter(log_format, time_format)
-
-
-_DEFAULT_NOISY_LOGGERS = ["httpx", "matplotlib"]
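As it existed in 0.3.8rc1, this module let one `configure_logging` call mix plain-text stream output with structured JSON file output: each `OutputConfig` becomes a `StreamHandler` or `FileHandler`, formatted as JSON lines when `structured=True`. A minimal usage sketch (the log-file path is a placeholder):

```python
import sys
from pathlib import Path

from data_designer.logging import LoggerConfig, LoggingConfig, OutputConfig, configure_logging

configure_logging(
    LoggingConfig(
        logger_configs=[LoggerConfig(name="data_designer", level="DEBUG")],
        output_configs=[
            OutputConfig(destination=sys.stderr, structured=False),   # human-readable stream
            OutputConfig(destination=Path("run.log"), structured=True),  # JSON lines to a file
        ],
    )
)
```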