data-designer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +15 -0
- data_designer/_version.py +34 -0
- data_designer/cli/README.md +236 -0
- data_designer/cli/__init__.py +6 -0
- data_designer/cli/commands/__init__.py +2 -0
- data_designer/cli/commands/list.py +130 -0
- data_designer/cli/commands/models.py +10 -0
- data_designer/cli/commands/providers.py +11 -0
- data_designer/cli/commands/reset.py +100 -0
- data_designer/cli/controllers/__init__.py +7 -0
- data_designer/cli/controllers/model_controller.py +246 -0
- data_designer/cli/controllers/provider_controller.py +317 -0
- data_designer/cli/forms/__init__.py +20 -0
- data_designer/cli/forms/builder.py +51 -0
- data_designer/cli/forms/field.py +180 -0
- data_designer/cli/forms/form.py +59 -0
- data_designer/cli/forms/model_builder.py +125 -0
- data_designer/cli/forms/provider_builder.py +76 -0
- data_designer/cli/main.py +44 -0
- data_designer/cli/repositories/__init__.py +8 -0
- data_designer/cli/repositories/base.py +39 -0
- data_designer/cli/repositories/model_repository.py +42 -0
- data_designer/cli/repositories/provider_repository.py +43 -0
- data_designer/cli/services/__init__.py +7 -0
- data_designer/cli/services/model_service.py +116 -0
- data_designer/cli/services/provider_service.py +111 -0
- data_designer/cli/ui.py +448 -0
- data_designer/cli/utils.py +47 -0
- data_designer/config/__init__.py +2 -0
- data_designer/config/analysis/column_profilers.py +89 -0
- data_designer/config/analysis/column_statistics.py +274 -0
- data_designer/config/analysis/dataset_profiler.py +60 -0
- data_designer/config/analysis/utils/errors.py +8 -0
- data_designer/config/analysis/utils/reporting.py +188 -0
- data_designer/config/base.py +68 -0
- data_designer/config/column_configs.py +354 -0
- data_designer/config/column_types.py +168 -0
- data_designer/config/config_builder.py +660 -0
- data_designer/config/data_designer_config.py +40 -0
- data_designer/config/dataset_builders.py +11 -0
- data_designer/config/datastore.py +151 -0
- data_designer/config/default_model_settings.py +123 -0
- data_designer/config/errors.py +19 -0
- data_designer/config/interface.py +54 -0
- data_designer/config/models.py +231 -0
- data_designer/config/preview_results.py +32 -0
- data_designer/config/processors.py +41 -0
- data_designer/config/sampler_constraints.py +51 -0
- data_designer/config/sampler_params.py +604 -0
- data_designer/config/seed.py +145 -0
- data_designer/config/utils/code_lang.py +83 -0
- data_designer/config/utils/constants.py +313 -0
- data_designer/config/utils/errors.py +19 -0
- data_designer/config/utils/info.py +88 -0
- data_designer/config/utils/io_helpers.py +273 -0
- data_designer/config/utils/misc.py +81 -0
- data_designer/config/utils/numerical_helpers.py +28 -0
- data_designer/config/utils/type_helpers.py +100 -0
- data_designer/config/utils/validation.py +336 -0
- data_designer/config/utils/visualization.py +427 -0
- data_designer/config/validator_params.py +96 -0
- data_designer/engine/__init__.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +55 -0
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
- data_designer/engine/analysis/column_profilers/registry.py +20 -0
- data_designer/engine/analysis/column_statistics.py +142 -0
- data_designer/engine/analysis/dataset_profiler.py +125 -0
- data_designer/engine/analysis/errors.py +7 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
- data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
- data_designer/engine/column_generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/base.py +61 -0
- data_designer/engine/column_generators/generators/expression.py +63 -0
- data_designer/engine/column_generators/generators/llm_generators.py +172 -0
- data_designer/engine/column_generators/generators/samplers.py +75 -0
- data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
- data_designer/engine/column_generators/generators/validation.py +147 -0
- data_designer/engine/column_generators/registry.py +56 -0
- data_designer/engine/column_generators/utils/errors.py +13 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
- data_designer/engine/configurable_task.py +82 -0
- data_designer/engine/dataset_builders/artifact_storage.py +181 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
- data_designer/engine/dataset_builders/errors.py +13 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
- data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
- data_designer/engine/dataset_builders/utils/dag.py +56 -0
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
- data_designer/engine/dataset_builders/utils/errors.py +13 -0
- data_designer/engine/errors.py +49 -0
- data_designer/engine/model_provider.py +75 -0
- data_designer/engine/models/__init__.py +2 -0
- data_designer/engine/models/errors.py +308 -0
- data_designer/engine/models/facade.py +225 -0
- data_designer/engine/models/litellm_overrides.py +162 -0
- data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer/engine/models/parsers/errors.py +34 -0
- data_designer/engine/models/parsers/parser.py +236 -0
- data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer/engine/models/parsers/tag_parsers.py +60 -0
- data_designer/engine/models/parsers/types.py +82 -0
- data_designer/engine/models/recipes/base.py +79 -0
- data_designer/engine/models/recipes/response_recipes.py +291 -0
- data_designer/engine/models/registry.py +118 -0
- data_designer/engine/models/usage.py +75 -0
- data_designer/engine/models/utils.py +38 -0
- data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer/engine/processing/ginja/ast.py +64 -0
- data_designer/engine/processing/ginja/environment.py +461 -0
- data_designer/engine/processing/ginja/exceptions.py +54 -0
- data_designer/engine/processing/ginja/record.py +30 -0
- data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +8 -0
- data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
- data_designer/engine/processing/gsonschema/types.py +8 -0
- data_designer/engine/processing/gsonschema/validators.py +143 -0
- data_designer/engine/processing/processors/base.py +15 -0
- data_designer/engine/processing/processors/drop_columns.py +46 -0
- data_designer/engine/processing/processors/registry.py +20 -0
- data_designer/engine/processing/utils.py +120 -0
- data_designer/engine/registry/base.py +97 -0
- data_designer/engine/registry/data_designer_registry.py +37 -0
- data_designer/engine/registry/errors.py +10 -0
- data_designer/engine/resources/managed_dataset_generator.py +35 -0
- data_designer/engine/resources/managed_dataset_repository.py +194 -0
- data_designer/engine/resources/managed_storage.py +63 -0
- data_designer/engine/resources/resource_provider.py +46 -0
- data_designer/engine/resources/seed_dataset_data_store.py +66 -0
- data_designer/engine/sampling_gen/column.py +89 -0
- data_designer/engine/sampling_gen/constraints.py +95 -0
- data_designer/engine/sampling_gen/data_sources/base.py +214 -0
- data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
- data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
- data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
- data_designer/engine/sampling_gen/entities/errors.py +8 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
- data_designer/engine/sampling_gen/entities/person.py +142 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
- data_designer/engine/sampling_gen/errors.py +24 -0
- data_designer/engine/sampling_gen/generator.py +121 -0
- data_designer/engine/sampling_gen/jinja_utils.py +60 -0
- data_designer/engine/sampling_gen/people_gen.py +203 -0
- data_designer/engine/sampling_gen/person_constants.py +54 -0
- data_designer/engine/sampling_gen/schema.py +143 -0
- data_designer/engine/sampling_gen/schema_builder.py +59 -0
- data_designer/engine/sampling_gen/utils.py +40 -0
- data_designer/engine/secret_resolver.py +80 -0
- data_designer/engine/validators/__init__.py +17 -0
- data_designer/engine/validators/base.py +36 -0
- data_designer/engine/validators/local_callable.py +34 -0
- data_designer/engine/validators/python.py +245 -0
- data_designer/engine/validators/remote.py +83 -0
- data_designer/engine/validators/sql.py +60 -0
- data_designer/errors.py +5 -0
- data_designer/essentials/__init__.py +137 -0
- data_designer/interface/__init__.py +2 -0
- data_designer/interface/data_designer.py +351 -0
- data_designer/interface/errors.py +16 -0
- data_designer/interface/results.py +55 -0
- data_designer/logging.py +161 -0
- data_designer/plugin_manager.py +83 -0
- data_designer/plugins/__init__.py +6 -0
- data_designer/plugins/errors.py +10 -0
- data_designer/plugins/plugin.py +69 -0
- data_designer/plugins/registry.py +86 -0
- data_designer-0.1.0.dist-info/METADATA +173 -0
- data_designer-0.1.0.dist-info/RECORD +177 -0
- data_designer-0.1.0.dist-info/WHEEL +4 -0
- data_designer-0.1.0.dist-info/entry_points.txt +2 -0
- data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
data_designer/engine/validators/base.py
ADDED
@@ -0,0 +1,36 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from abc import ABC, abstractmethod
from typing import Iterator, Optional

from pydantic import BaseModel, ConfigDict
from typing_extensions import Self


class ValidationOutput(BaseModel):
    is_valid: Optional[bool]
    model_config = ConfigDict(extra="allow")


class ValidationResult(BaseModel):
    data: list[ValidationOutput]

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index: int) -> ValidationOutput:
        return self.data[index]

    def __iter__(self) -> Iterator[ValidationOutput]:
        return iter(self.data)

    @classmethod
    def empty(cls, size: int) -> Self:
        return cls(data=[ValidationOutput(is_valid=None) for _ in range(size)])


class BaseValidator(ABC):
    @abstractmethod
    def run_validation(self, data: list[dict]) -> ValidationResult:
        pass
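The base interface is compact: a validator receives records as a list of dicts and returns a ValidationResult whose ValidationOutput entries accept arbitrary extra metadata (extra="allow"). As a rough, hypothetical sketch (not part of this package), a custom validator built on that interface could look like the following; the import path simply mirrors the file listing above.

from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult


class NonEmptyValidator(BaseValidator):
    """Illustrative validator: a record is valid when none of its values are blank."""

    def run_validation(self, data: list[dict]) -> ValidationResult:
        outputs = []
        for record in data:
            all_filled = all(str(value).strip() for value in record.values())
            # ValidationOutput allows extra fields, so metadata like `reason`
            # can ride along with is_valid.
            outputs.append(
                ValidationOutput(is_valid=all_filled, reason="" if all_filled else "found an empty value")
            )
        return ValidationResult(data=outputs)


print([o.is_valid for o in NonEmptyValidator().run_validation([{"a": "x"}, {"a": ""}])])  # [True, False]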
data_designer/engine/validators/local_callable.py
ADDED
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import pandas as pd

from data_designer.config.validator_params import LocalCallableValidatorParams
from data_designer.engine.errors import LocalCallableValidationError
from data_designer.engine.processing.gsonschema.validators import validate
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult

logger = logging.getLogger(__name__)


class LocalCallableValidator(BaseValidator):
    def __init__(self, config: LocalCallableValidatorParams):
        self.validation_function = config.validation_function
        self.output_schema = config.output_schema

    def run_validation(self, data: list[dict]) -> ValidationResult:
        df = pd.DataFrame(data)

        try:
            result_as_df = self.validation_function(df)
        except Exception as e:
            logger.error(f"Callback validator failed: {e}")
            raise LocalCallableValidationError(str(e))

        records = result_as_df.to_dict(orient="records")
        result = ValidationResult(data=[ValidationOutput.model_validate(record) for record in records])
        if self.output_schema:
            validate(result.model_dump(mode="json"), self.output_schema, no_extra_properties=True)
        return result
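A usage sketch, with the caveat that only the validation_function and output_schema attributes of LocalCallableValidatorParams are visible in this diff; the constructor keywords below (and any other required fields) are assumptions, while the callable contract shown (DataFrame in, DataFrame of ValidationOutput-shaped rows out) follows from run_validation above.

import pandas as pd

from data_designer.config.validator_params import LocalCallableValidatorParams
from data_designer.engine.validators.local_callable import LocalCallableValidator


def check_positive(df: pd.DataFrame) -> pd.DataFrame:
    # Return one row per input record; is_valid becomes ValidationOutput.is_valid,
    # and any other columns become extra metadata on the output.
    return pd.DataFrame({"is_valid": [bool(v > 0) for v in df["value"]], "value": df["value"]})


params = LocalCallableValidatorParams(validation_function=check_positive, output_schema=None)  # assumed kwargs
validator = LocalCallableValidator(params)
result = validator.run_validation([{"value": 3}, {"value": -1}])
print([output.is_valid for output in result])  # expected: [True, False]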
data_designer/engine/validators/python.py
ADDED
@@ -0,0 +1,245 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import ast
from collections import defaultdict
import logging
from pathlib import Path
import re
import subprocess
import tempfile
from uuid import uuid4

import pandas as pd
from pydantic import BaseModel
from ruff.__main__ import find_ruff_bin

from data_designer.config.validator_params import CodeValidatorParams
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult

logger = logging.getLogger(__name__)

PYLINT_ERROR_CATEGORIES_ORDERED = [
    "fatal",
    "error",
    "warning",
    "convention",
    "refactor",
]
PYLINT_VALID_LEVELS = {"none", "warning", "convention", "refactor"}

TYPE_FROM_SYMBOL = {
    "E": "refactor",
    "F": "error",
    "SIM": "refactor",
    "PLC": "convention",
    "PLE": "error",
    "PLR": "refactor",
    "PLW": "warning",
    "SyntaxError": "fatal",
}

PYTHON_MESSAGES_FIELD = "python_linter_messages"
RECORD_ID_COLUMN_NAME = "internal_code_record_id"


class PythonValidationStat(BaseModel):
    fatal: int = 0
    error: int = 0
    warning: int = 0
    refactor: int = 0
    convention: int = 0
    statement: int = 0

    @property
    def score(self) -> float:
        # https://pylint.pycqa.org/en/latest/user_guide/configuration/all-options.html#evaluation
        if self.statement == 0:  # prevent division by zero down below
            self.statement = max(1, self.statement)
        return max(
            0,
            (
                0
                if self.fatal
                else 10.0
                - ((float(5 * self.error + self.warning + self.refactor + self.convention) / self.statement) * 10)
            ),
        )


class PythonLinterMessage(BaseModel):
    type: str
    symbol: str
    line: int
    column: int
    message: str

    @property
    def type_sort_order(self) -> int:
        return PYLINT_ERROR_CATEGORIES_ORDERED.index(self.type)


class PythonLinterMessages(BaseModel):
    _messages: list[PythonLinterMessage] = []

    @property
    def messages(self) -> list[PythonLinterMessage]:
        # Ordered by severity first then by line number
        return sorted(self._messages, key=lambda msg: (msg.type_sort_order, msg.line))

    def add(self, message: PythonLinterMessage) -> None:
        self._messages.append(message)

    def get_count_by_type(self) -> dict[str, int]:
        count_by_type = defaultdict(int)
        for message in self.messages:
            count_by_type[message.type] += 1
        return dict(count_by_type)

    @property
    def is_empty(self) -> bool:
        return len(self.messages) == 0

    @property
    def severity(self) -> str:
        if self.is_empty:
            return "none"
        return self.messages[0].type

    @property
    def is_valid(self) -> bool:
        return self.is_empty or self.messages[0].type in PYLINT_VALID_LEVELS


class PythonValidator(BaseValidator):
    def __init__(self, config: CodeValidatorParams):
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        df = pd.DataFrame(data)

        if len(df.columns) > 1:
            raise ValueError("Python validator assumes single column input")
        target_column = df.columns[0]

        df.loc[:, RECORD_ID_COLUMN_NAME] = [uuid4() for _ in range(df.shape[0])]
        with tempfile.TemporaryDirectory() as temp_dir:
            _ = df.apply(
                self._write_code_to_file,
                args=(target_column, temp_dir),
                axis=1,
            )
            results = self._validate_files_in_path(path=temp_dir)

        records = df.to_dict(orient="records")

        ordered_results = []
        for record in records:
            module_id = self._get_module_name(record[RECORD_ID_COLUMN_NAME], target_column)
            result = results.get(module_id)
            if result is not None:
                ordered_results.append(result)

        return ValidationResult(data=ordered_results)

    def _validate_files_in_path(self, path: str) -> dict[str, ValidationOutput]:
        lint_results = self._run_linter(path)

        scores_by_module = self._get_scores(
            {
                module: messages.get_count_by_type()
                | {"statement": self._count_python_statements(f"{path}/{module}.py")}
                for module, messages in lint_results.items()
            }
        )

        validation_result = {}
        for module, score in scores_by_module.items():
            messages = lint_results.get(module, PythonLinterMessages())
            metadata = {
                "python_linter_score": score,
                "python_linter_severity": messages.severity,
                PYTHON_MESSAGES_FIELD: [m.model_dump() for m in messages.messages],
            }
            validation_result[module] = ValidationOutput(is_valid=messages.is_valid, **metadata)
        return validation_result

    def _write_code_to_file(self, row: pd.Series, code_column: str, path: str) -> None:
        with open(f"{path}/{self._get_module_name(row[RECORD_ID_COLUMN_NAME], code_column)}.py", "w") as file:
            file.write(row[code_column])

    @staticmethod
    def _get_module_name(record_id: str, column_name: str) -> str:
        return f"{record_id}_{column_name}"

    @staticmethod
    def _run_linter(codebase_path: str) -> dict[str, PythonLinterMessages]:
        # Create empty dict for output
        processed = {}
        for file in Path(codebase_path).glob("*.py"):
            processed[file.stem] = PythonLinterMessages()

        # Run ruff linter
        ruff_bin = find_ruff_bin()
        env = {"NO_COLOR": "1"}

        ruff_exec = subprocess.run(
            [
                ruff_bin,
                "check",
                "--select",
                "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
                codebase_path,
            ],
            env=env,
            text=True,
            capture_output=True,
            check=False,
            cwd=Path.cwd(),
        )
        ruff_output = ruff_exec.stdout

        # Parse ruff output
        if "All checks passed!" in ruff_output:
            return processed  # no errors or warnings

        pattern = r"(.*):([0-9]*):([0-9]*): ([A-Za-z0-9]*):? (?:\[\*\] )?(.*)\n"
        errors = re.findall(pattern, ruff_output)

        if errors == []:  # output could not be parsed
            raise RuntimeError("ruff's output could not be parsed")

        try:
            for error in errors:
                filename, line, column, symbol, message = error
                processed[Path(filename).stem].add(
                    PythonLinterMessage(
                        type=TYPE_FROM_SYMBOL[re.sub(r"[^A-Za-z]+", "", symbol)],
                        symbol=symbol,
                        line=int(line),
                        column=int(column),
                        message=message,
                    )
                )
        except Exception:  # output not in expected format
            raise RuntimeError("ruff's output not in expected format")

        return processed

    @staticmethod
    def _get_scores(stats_by_module: dict[str, dict[str, int]]) -> dict[str, float]:
        scores = {}
        for key, item in stats_by_module.items():
            stat = PythonValidationStat(**item)
            scores[key] = stat.score
        return scores

    @staticmethod
    def _count_python_statements(file_path: str) -> int:
        """Count the number of statements in a Python file."""
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                tree = ast.parse(f.read())
            return sum(1 for node in ast.walk(tree) if isinstance(node, ast.stmt))
        except Exception:
            return 0
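A short worked example of the pylint-style evaluation formula encoded in PythonValidationStat.score above: one error and two warnings over ten statements gives 10.0 - ((5*1 + 2 + 0 + 0) / 10) * 10 = 3.0, and any fatal message pins the score to 0. The snippet exercises only the class shown above.

from data_designer.engine.validators.python import PythonValidationStat

print(PythonValidationStat(error=1, warning=2, statement=10).score)  # 3.0
print(PythonValidationStat(fatal=1, statement=10).score)             # 0 (fatal short-circuits the formula)
print(PythonValidationStat().score)                                  # 10.0 (clean module)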
data_designer/engine/validators/remote.py
ADDED
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import httpx
from httpx_retries import Retry, RetryTransport

from data_designer.config.validator_params import RemoteValidatorParams
from data_designer.engine.errors import RemoteValidationSchemaError
from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
from data_designer.engine.processing.gsonschema.validators import validate
from data_designer.engine.validators.base import BaseValidator, ValidationResult

logger = logging.getLogger(__name__)


class RemoteEndpointClient:
    """Client for making parallel HTTP requests to remote endpoints with retry, timeout, and auth support."""

    def __init__(
        self,
        config: RemoteValidatorParams,
    ):
        """
        Initialize the remote endpoint client.

        Args:
            config: Remote validator parameters
        """
        self.endpoint_url = config.endpoint_url
        self.output_schema = config.output_schema
        self.timeout = config.timeout
        self.max_retries = config.max_retries
        self.retry_backoff = config.retry_backoff

    def post_to_remote_endpoint(self, content: dict) -> dict:
        """
        Make a single HTTP request with retry logic.

        Args:
            content: The content to be posted to the remote endpoint

        Returns:
            The JSON response from the remote endpoint

        Raises:
            httpx.RequestError: If all retry attempts fail
            httpx.HTTPStatusError: If the server returns an error status
        """
        retry = Retry(
            total=self.max_retries,
            backoff_factor=self.retry_backoff,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        transport = RetryTransport(retry=retry)

        with httpx.Client(
            timeout=httpx.Timeout(self.timeout),
            transport=transport,
        ) as http_client:
            response = http_client.post(
                self.endpoint_url,
                json=content,
            )
            response.raise_for_status()

            response_json = response.json()
            if self.output_schema:
                try:
                    validate(response_json, self.output_schema, no_extra_properties=True)
                except JSONSchemaValidationError as exc:
                    raise RemoteValidationSchemaError(str(exc)) from exc
            return response_json


class RemoteValidator(BaseValidator):
    def __init__(self, config: RemoteValidatorParams):
        self.remote_endpoint_client = RemoteEndpointClient(config=config)

    def run_validation(self, data: list[dict]) -> ValidationResult:
        result = self.remote_endpoint_client.post_to_remote_endpoint(content={"data": data})
        return ValidationResult.model_validate(result)
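RemoteValidator POSTs {"data": [...]} and expects a JSON body that parses as ValidationResult, i.e. a "data" list whose items carry is_valid plus optional extra metadata. The sketch below only illustrates that contract with the classes shown earlier; it is not a prescribed server implementation, and the payload values are made up.

from data_designer.engine.validators.base import ValidationResult

# What the validator sends for two records:
request_body = {"data": [{"code": "print('hi')"}, {"code": "1/0"}]}

# One shape a conforming endpoint could return (extra keys are allowed):
response_body = {
    "data": [
        {"is_valid": True},
        {"is_valid": False, "error_messages": "raises ZeroDivisionError"},
    ]
}
parsed = ValidationResult.model_validate(response_body)
print(len(parsed), parsed[1].is_valid)  # 2 False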
data_designer/engine/validators/sql.py
ADDED
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
import re

import pandas as pd
import sqlfluff

from data_designer.config.utils.code_lang import CodeLang
from data_designer.config.validator_params import CodeValidatorParams
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult

sqlfluff_logger = logging.getLogger("sqlfluff")
sqlfluff_logger.setLevel(logging.WARNING)


class SQLValidator(BaseValidator):
    def __init__(self, config: CodeValidatorParams):
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        df = pd.DataFrame(data)

        if len(df.columns) > 1:
            raise ValueError("SQL validator assumes single column input")
        target_column = df.columns[0]

        records = df.to_dict(orient="records")

        results = []
        for record in records:
            result = self._validate_query(record[target_column])
            results.append(result)

        return ValidationResult(data=results)

    def _validate_query(self, content: str) -> ValidationOutput:
        try:
            result = sqlfluff.lint(
                content,
                dialect=CodeLang.parse_dialect(self.config.code_lang),
            )
            prs_errors = [res for res in result if res["code"].startswith("PRS")]
            error_messages = "\n".join([f"{error['code']}: {error['description']}" for error in prs_errors])
            decimal_pattern = re.compile(r"DECIMAL\(\d+\)")
            decimal_issues = decimal_pattern.findall(content)
            if decimal_issues:
                error_messages += "\nCustom Check: Found DECIMAL definitions without a scale, which may be incorrect."
            if error_messages:
                return ValidationOutput(
                    is_valid=False,
                    error_messages=error_messages,
                )
            return ValidationOutput(is_valid=True, error_messages="")
        except Exception as e:
            return ValidationOutput(
                is_valid=False,
                error_messages=f"Exception during SQL parsing: {e}",
            )
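Because CodeValidatorParams and CodeLang.parse_dialect are defined elsewhere in the package and not shown in this diff, a standalone sketch of the same parse-error check can call sqlfluff directly; the "ansi" dialect and the sample query are illustrative choices only.

import sqlfluff

lint_results = sqlfluff.lint("SELEC name FROM users", dialect="ansi")
# Keep only parsing violations, mirroring the PRS filter in SQLValidator above.
parse_errors = [res for res in lint_results if res["code"].startswith("PRS")]
for error in parse_errors:
    print(f"{error['code']}: {error['description']}")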
data_designer/essentials/__init__.py
ADDED
@@ -0,0 +1,137 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from ..logging import LoggingConfig, configure_logging

configure_logging(LoggingConfig.default())

from ..config.analysis.column_profilers import JudgeScoreProfilerConfig
from ..config.column_configs import (
    ExpressionColumnConfig,
    LLMCodeColumnConfig,
    LLMJudgeColumnConfig,
    LLMStructuredColumnConfig,
    LLMTextColumnConfig,
    SamplerColumnConfig,
    Score,
    SeedDatasetColumnConfig,
    ValidationColumnConfig,
)
from ..config.column_types import DataDesignerColumnType
from ..config.config_builder import DataDesignerConfigBuilder
from ..config.data_designer_config import DataDesignerConfig
from ..config.dataset_builders import BuildStage
from ..config.datastore import DatastoreSettings
from ..config.models import (
    ImageContext,
    ImageFormat,
    InferenceParameters,
    ManualDistribution,
    ManualDistributionParams,
    Modality,
    ModalityContext,
    ModalityDataType,
    ModelConfig,
    UniformDistribution,
    UniformDistributionParams,
)
from ..config.processors import DropColumnsProcessorConfig, ProcessorType
from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
from ..config.sampler_params import (
    BernoulliMixtureSamplerParams,
    BernoulliSamplerParams,
    BinomialSamplerParams,
    CategorySamplerParams,
    DatetimeSamplerParams,
    GaussianSamplerParams,
    PersonFromFakerSamplerParams,
    PersonSamplerParams,
    PoissonSamplerParams,
    SamplerType,
    ScipySamplerParams,
    SubcategorySamplerParams,
    TimeDeltaSamplerParams,
    UniformSamplerParams,
    UUIDSamplerParams,
)
from ..config.seed import DatastoreSeedDatasetReference, IndexRange, PartitionBlock, SamplingStrategy, SeedConfig
from ..config.utils.code_lang import CodeLang
from ..config.utils.info import InfoType
from ..config.utils.misc import can_run_data_designer_locally
from ..config.validator_params import (
    CodeValidatorParams,
    RemoteValidatorParams,
    ValidatorType,
)

local_library_imports = []
try:
    if can_run_data_designer_locally():
        from ..config.validator_params import LocalCallableValidatorParams  # noqa: F401
        from ..engine.model_provider import ModelProvider  # noqa: F401
        from ..interface.data_designer import DataDesigner  # noqa: F401

        local_library_imports = ["DataDesigner", "LocalCallableValidatorParams", "ModelProvider"]
except ModuleNotFoundError:
    pass

__all__ = [
    "BernoulliMixtureSamplerParams",
    "BernoulliSamplerParams",
    "BinomialSamplerParams",
    "CategorySamplerParams",
    "CodeLang",
    "CodeValidatorParams",
    "ColumnInequalityConstraint",
    "configure_logging",
    "DataDesignerColumnType",
    "DataDesignerConfig",
    "DataDesignerConfigBuilder",
    "BuildStage",
    "DatastoreSeedDatasetReference",
    "DatastoreSettings",
    "DatetimeSamplerParams",
    "DropColumnsProcessorConfig",
    "ExpressionColumnConfig",
    "GaussianSamplerParams",
    "IndexRange",
    "InfoType",
    "ImageContext",
    "ImageFormat",
    "InferenceParameters",
    "JudgeScoreProfilerConfig",
    "LLMCodeColumnConfig",
    "LLMJudgeColumnConfig",
    "LLMStructuredColumnConfig",
    "LLMTextColumnConfig",
    "LoggingConfig",
    "ManualDistribution",
    "ManualDistributionParams",
    "Modality",
    "ModalityContext",
    "ModalityDataType",
    "ModelConfig",
    "PartitionBlock",
    "PersonSamplerParams",
    "PersonFromFakerSamplerParams",
    "PoissonSamplerParams",
    "ProcessorType",
    "RemoteValidatorParams",
    "SamplerColumnConfig",
    "SamplerType",
    "SamplingStrategy",
    "ScalarInequalityConstraint",
    "ScipySamplerParams",
    "Score",
    "SeedConfig",
    "SeedDatasetColumnConfig",
    "SubcategorySamplerParams",
    "TimeDeltaSamplerParams",
    "UniformDistribution",
    "UniformDistributionParams",
    "UniformSamplerParams",
    "UUIDSamplerParams",
    "ValidationColumnConfig",
    "ValidatorType",
]

__all__.extend(local_library_imports)
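The module collects the public configuration surface into a single namespace and only re-exports DataDesigner, ModelProvider, and LocalCallableValidatorParams when can_run_data_designer_locally() succeeds and the optional local dependencies import cleanly. A small, assumption-light sketch of how a consumer might observe that behaviour:

from data_designer import essentials

# Always-available configuration exports:
builder_cls = essentials.DataDesignerConfigBuilder

# Conditionally exported names appear in __all__ only when the local engine
# dependencies are installed.
print("DataDesigner" in essentials.__all__)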