data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,36 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from abc import ABC, abstractmethod
5
+ from typing import Iterator, Optional
6
+
7
+ from pydantic import BaseModel, ConfigDict
8
+ from typing_extensions import Self
9
+
10
+
11
class ValidationOutput(BaseModel):
    """Validation outcome for a single record.

    Only ``is_valid`` is declared. Because the model is configured with
    ``extra="allow"``, any additional keyword fields supplied by a validator
    are kept on the instance instead of being rejected.
    """

    # Accept (and keep) fields beyond the ones declared below.
    model_config = ConfigDict(extra="allow")

    # True / False for a verdict; None is also accepted (see ValidationResult.empty).
    is_valid: Optional[bool]
14
+
15
+
16
class ValidationResult(BaseModel):
    """Ordered collection of ``ValidationOutput`` items.

    The sequence dunder methods below delegate to ``data`` so a result can be
    measured, indexed and iterated directly.
    """

    data: list[ValidationOutput]

    @classmethod
    def empty(cls, size: int) -> Self:
        """Build a result holding ``size`` outputs whose ``is_valid`` is ``None``."""
        placeholders = [ValidationOutput(is_valid=None) for _ in range(size)]
        return cls(data=placeholders)

    def __iter__(self) -> Iterator[ValidationOutput]:
        """Iterate over the stored outputs in order."""
        return iter(self.data)

    def __getitem__(self, index: int) -> ValidationOutput:
        """Return the output stored at ``index``."""
        return self.data[index]

    def __len__(self) -> int:
        """Return the number of stored outputs."""
        return len(self.data)
31
+
32
+
33
class BaseValidator(ABC):
    """Abstract interface implemented by every validator."""

    @abstractmethod
    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Validate a batch of records.

        Args:
            data: The records to validate, one dict per record.

        Returns:
            A ``ValidationResult`` describing the batch.
        """
@@ -0,0 +1,34 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+
6
+ import pandas as pd
7
+
8
+ from data_designer.config.validator_params import LocalCallableValidatorParams
9
+ from data_designer.engine.errors import LocalCallableValidationError
10
+ from data_designer.engine.processing.gsonschema.validators import validate
11
+ from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class LocalCallableValidator(BaseValidator):
    """Validator that delegates validation to a user-supplied callable.

    The callable is invoked with the batch as a ``pandas.DataFrame``. Its
    return value must expose ``to_dict(orient="records")``; every resulting
    record is parsed into a ``ValidationOutput``.
    """

    def __init__(self, config: LocalCallableValidatorParams):
        """Keep the callable and the optional output schema found on ``config``."""
        self.validation_function = config.validation_function
        self.output_schema = config.output_schema

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Run the configured callable over ``data`` and wrap its output.

        Args:
            data: The records to validate, one dict per record.

        Returns:
            A ``ValidationResult`` with one ``ValidationOutput`` per record
            returned by the callable.

        Raises:
            LocalCallableValidationError: If the callable raises. The original
                exception is attached as ``__cause__``.
        """
        df = pd.DataFrame(data)

        try:
            result_as_df = self.validation_function(df)
        except Exception as e:
            logger.error(f"Callback validator failed: {e}")
            # Chain explicitly so the callable's own traceback is reported as the
            # direct cause (same convention as the remote validator).
            raise LocalCallableValidationError(str(e)) from e

        records = result_as_df.to_dict(orient="records")
        result = ValidationResult(data=[ValidationOutput.model_validate(record) for record in records])
        if self.output_schema:
            # NOTE(review): schema violations raised by validate() propagate unchanged
            # here - presumably JSONSchemaValidationError; confirm against callers.
            validate(result.model_dump(mode="json"), self.output_schema, no_extra_properties=True)
        return result
@@ -0,0 +1,245 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import ast
5
+ from collections import defaultdict
6
+ import logging
7
+ from pathlib import Path
8
+ import re
9
+ import subprocess
10
+ import tempfile
11
+ from uuid import uuid4
12
+
13
+ import pandas as pd
14
+ from pydantic import BaseModel
15
+ from ruff.__main__ import find_ruff_bin
16
+
17
+ from data_designer.config.validator_params import CodeValidatorParams
18
+ from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
# Message categories in sort order. PythonLinterMessage.type_sort_order uses
# the index in this list, so earlier entries sort first.
PYLINT_ERROR_CATEGORIES_ORDERED = [
    "fatal",
    "error",
    "warning",
    "convention",
    "refactor",
]
# Severity values for which PythonLinterMessages.is_valid still reports True.
PYLINT_VALID_LEVELS = {"none", "warning", "convention", "refactor"}

# Maps the alphabetic part of a linter symbol (digits stripped in
# PythonValidator._run_linter) to one of the categories above.
TYPE_FROM_SYMBOL = {
    "E": "refactor",
    "F": "error",
    "SIM": "refactor",
    "PLC": "convention",
    "PLE": "error",
    "PLR": "refactor",
    "PLW": "warning",
    "SyntaxError": "fatal",
}

# Key under which the list of linter messages is attached to a ValidationOutput.
PYTHON_MESSAGES_FIELD = "python_linter_messages"
# Temporary DataFrame column holding the per-record id used to name snippet files.
RECORD_ID_COLUMN_NAME = "internal_code_record_id"
44
+
45
+
46
class PythonValidationStat(BaseModel):
    """Per-module message counts plus the statement count used for scoring."""

    fatal: int = 0
    error: int = 0
    warning: int = 0
    refactor: int = 0
    convention: int = 0
    statement: int = 0

    @property
    def score(self) -> float:
        """Return a pylint-style score in the range 0.0 to 10.0.

        Any fatal message yields 0.0. Otherwise the score is
        ``10 - 10 * (5 * error + warning + refactor + convention) / statement``,
        floored at 0.0.

        Reading the score does not modify the model: a statement count below 1
        is treated as 1 locally to avoid a division by zero.
        """
        # https://pylint.pycqa.org/en/latest/user_guide/configuration/all-options.html#evaluation
        if self.fatal:
            return 0.0
        statements = max(1, self.statement)  # prevent division by zero down below
        weighted_messages = 5 * self.error + self.warning + self.refactor + self.convention
        return max(0.0, 10.0 - (weighted_messages / statements) * 10)
68
+
69
+
70
class PythonLinterMessage(BaseModel):
    """One linter finding: its category, symbol, source position and text."""

    type: str
    symbol: str
    line: int
    column: int
    message: str

    @property
    def type_sort_order(self) -> int:
        """Position of ``type`` in ``PYLINT_ERROR_CATEGORIES_ORDERED``.

        Raises:
            ValueError: If ``type`` is not one of the known categories.
        """
        category_rank = PYLINT_ERROR_CATEGORIES_ORDERED.index(self.type)
        return category_rank
80
+
81
+
82
class PythonLinterMessages(BaseModel):
    """Accumulates linter messages and exposes summary views over them."""

    _messages: list[PythonLinterMessage] = []

    def add(self, message: PythonLinterMessage) -> None:
        """Record one more message."""
        self._messages.append(message)

    @property
    def messages(self) -> list[PythonLinterMessage]:
        """Messages ordered by category rank first, then by line number."""

        def sort_key(entry: PythonLinterMessage) -> tuple[int, int]:
            return entry.type_sort_order, entry.line

        return sorted(self._messages, key=sort_key)

    def get_count_by_type(self) -> dict[str, int]:
        """Return how many messages were recorded for each category."""
        tally: dict[str, int] = {}
        for entry in self.messages:
            tally[entry.type] = tally.get(entry.type, 0) + 1
        return tally

    @property
    def is_empty(self) -> bool:
        """True when no message has been recorded."""
        return not self.messages

    @property
    def severity(self) -> str:
        """Category of the first message in sorted order, or ``"none"`` when empty."""
        ordered = self.messages
        if not ordered:
            return "none"
        return ordered[0].type

    @property
    def is_valid(self) -> bool:
        """True when there are no messages or the first sorted one is in ``PYLINT_VALID_LEVELS``."""
        if self.is_empty:
            return True
        return self.messages[0].type in PYLINT_VALID_LEVELS
112
+
113
+
114
class PythonValidator(BaseValidator):
    """Validate Python code snippets by linting them with ruff.

    Each record's code is written to its own file in a temporary directory,
    ruff is run once over that directory, and the findings are turned into one
    ``ValidationOutput`` per record (validity flag, score, severity, messages).
    """

    def __init__(self, config: CodeValidatorParams):
        """Keep the validator configuration."""
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Lint every record of a single-column batch.

        Args:
            data: Records with exactly one key each, whose value is the code.

        Returns:
            A ``ValidationResult`` in input order. An empty batch yields an
            empty result.

        Raises:
            ValueError: If the batch does not have exactly one column.
            RuntimeError: If ruff's output cannot be interpreted.
        """
        df = pd.DataFrame(data)

        # An empty batch has no column to read; there is nothing to validate.
        if df.shape == (0, 0):
            return ValidationResult(data=[])

        if len(df.columns) != 1:
            raise ValueError("Python validator assumes single column input")
        target_column = df.columns[0]

        # A unique id per record gives every snippet its own file / module name.
        df.loc[:, RECORD_ID_COLUMN_NAME] = [uuid4() for _ in range(df.shape[0])]
        with tempfile.TemporaryDirectory() as temp_dir:
            _ = df.apply(
                self._write_code_to_file,
                args=(target_column, temp_dir),
                axis=1,
            )
            results = self._validate_files_in_path(path=temp_dir)

        records = df.to_dict(orient="records")

        # Re-order the per-module results so they line up with the input records.
        ordered_results = []
        for record in records:
            module_id = self._get_module_name(record[RECORD_ID_COLUMN_NAME], target_column)
            result = results.get(module_id)
            if result is not None:
                ordered_results.append(result)

        return ValidationResult(data=ordered_results)

    def _validate_files_in_path(self, path: str) -> dict[str, ValidationOutput]:
        """Lint all ``*.py`` files under ``path`` and build one output per module.

        Returns:
            Mapping from module name (file stem) to its ``ValidationOutput``.
        """
        lint_results = self._run_linter(path)

        # Message counts per category plus the statement count feed the score.
        scores_by_module = self._get_scores(
            {
                module: messages.get_count_by_type()
                | {"statement": self._count_python_statements(f"{path}/{module}.py")}
                for module, messages in lint_results.items()
            }
        )

        validation_result = {}
        for module, score in scores_by_module.items():
            messages = lint_results.get(module, PythonLinterMessages())
            metadata = {
                "python_linter_score": score,
                "python_linter_severity": messages.severity,
                PYTHON_MESSAGES_FIELD: [m.model_dump() for m in messages.messages],
            }
            validation_result[module] = ValidationOutput(is_valid=messages.is_valid, **metadata)
        return validation_result

    def _write_code_to_file(self, row: pd.Series, code_column: str, path: str) -> None:
        """Write the code held in ``row[code_column]`` to ``<path>/<module>.py``."""
        module_name = self._get_module_name(row[RECORD_ID_COLUMN_NAME], code_column)
        # Written as UTF-8 explicitly because _count_python_statements reads the
        # file back as UTF-8; relying on the platform default could mismatch.
        with open(f"{path}/{module_name}.py", "w", encoding="utf-8") as file:
            file.write(row[code_column])

    @staticmethod
    def _get_module_name(record_id: str, column_name: str) -> str:
        """Return the module (file stem) name used for a record."""
        return f"{record_id}_{column_name}"

    @staticmethod
    def _run_linter(codebase_path: str) -> dict[str, PythonLinterMessages]:
        """Run ruff over ``codebase_path`` and collect its messages per module.

        Returns:
            Mapping from file stem to its messages; files without findings map
            to an empty ``PythonLinterMessages``.

        Raises:
            RuntimeError: If ruff's output cannot be parsed or is not in the
                expected format.
        """
        # Create empty dict for output
        processed = {}
        for file in Path(codebase_path).glob("*.py"):
            processed[file.stem] = PythonLinterMessages()

        # Run ruff linter
        ruff_bin = find_ruff_bin()
        # The subprocess gets only this variable, not the parent environment.
        env = {"NO_COLOR": "1"}

        # NOTE(review): parsing below expects "path:line:col: CODE message" lines;
        # consider pinning ruff's output format explicitly - confirm against the
        # ruff version in use.
        ruff_exec = subprocess.run(
            [
                ruff_bin,
                "check",
                "--select",
                "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
                codebase_path,
            ],
            env=env,
            text=True,
            capture_output=True,
            check=False,
            cwd=Path.cwd(),
        )
        ruff_output = ruff_exec.stdout

        # Parse ruff output
        if "All checks passed!" in ruff_output:
            return processed  # no errors or warnings

        pattern = r"(.*):([0-9]*):([0-9]*): ([A-Za-z0-9]*):? (?:\[\*\] )?(.*)\n"
        errors = re.findall(pattern, ruff_output)

        if not errors:  # output could not be parsed
            raise RuntimeError("ruff's output could not be parsed")

        try:
            for error in errors:
                filename, line, column, symbol, message = error
                processed[Path(filename).stem].add(
                    PythonLinterMessage(
                        # Strip digits so e.g. "PLR2004" is looked up as "PLR".
                        type=TYPE_FROM_SYMBOL[re.sub(r"[^A-Za-z]+", "", symbol)],
                        symbol=symbol,
                        line=int(line),
                        column=int(column),
                        message=message,
                    )
                )
        except Exception as exc:  # output not in expected format
            raise RuntimeError("ruff's output not in expected format") from exc

        return processed

    @staticmethod
    def _get_scores(stats_by_module: dict[str, dict[str, int]]) -> dict[str, float]:
        """Convert per-module count dicts into per-module scores."""
        return {module: PythonValidationStat(**stats).score for module, stats in stats_by_module.items()}

    @staticmethod
    def _count_python_statements(file_path: str) -> int:
        """Count the number of statements in a Python file.

        Returns 0 when the file cannot be read or parsed.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                tree = ast.parse(f.read())
            return sum(1 for node in ast.walk(tree) if isinstance(node, ast.stmt))
        except Exception:
            # Deliberate best-effort: unparsable snippets count as zero statements.
            return 0
@@ -0,0 +1,83 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+
6
+ import httpx
7
+ from httpx_retries import Retry, RetryTransport
8
+
9
+ from data_designer.config.validator_params import RemoteValidatorParams
10
+ from data_designer.engine.errors import RemoteValidationSchemaError
11
+ from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
12
+ from data_designer.engine.processing.gsonschema.validators import validate
13
+ from data_designer.engine.validators.base import BaseValidator, ValidationResult
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class RemoteEndpointClient:
    """Posts JSON payloads to a configured endpoint with retry and timeout handling."""

    def __init__(
        self,
        config: RemoteValidatorParams,
    ):
        """
        Copy the connection settings off the validator parameters.

        Args:
            config: Remote validator parameters
        """
        self.endpoint_url = config.endpoint_url
        self.output_schema = config.output_schema
        self.timeout = config.timeout
        self.max_retries = config.max_retries
        self.retry_backoff = config.retry_backoff

    def post_to_remote_endpoint(self, content: dict) -> dict:
        """
        POST ``content`` as JSON and return the decoded JSON response.

        Args:
            content: The content to be posted to the remote endpoint

        Returns:
            The JSON response from the remote endpoint

        Raises:
            httpx.RequestError: If all retry attempts fail
            httpx.HTTPStatusError: If the server returns an error status
            RemoteValidationSchemaError: If an output schema is configured and
                the response does not conform to it
        """
        # Retries apply to the listed status codes, with the configured backoff.
        retrying_transport = RetryTransport(
            retry=Retry(
                total=self.max_retries,
                backoff_factor=self.retry_backoff,
                status_forcelist=[429, 500, 502, 503, 504],
            )
        )
        request_timeout = httpx.Timeout(self.timeout)

        with httpx.Client(timeout=request_timeout, transport=retrying_transport) as session:
            reply = session.post(self.endpoint_url, json=content)
            reply.raise_for_status()

        payload = reply.json()
        if not self.output_schema:
            return payload

        try:
            validate(payload, self.output_schema, no_extra_properties=True)
        except JSONSchemaValidationError as exc:
            raise RemoteValidationSchemaError(str(exc)) from exc
        return payload
75
+
76
+
77
class RemoteValidator(BaseValidator):
    """Validator that sends the batch to a remote endpoint and parses its reply."""

    def __init__(self, config: RemoteValidatorParams):
        """Create the endpoint client from ``config``."""
        self.remote_endpoint_client = RemoteEndpointClient(config=config)

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Post ``{"data": data}`` to the endpoint and validate the response into a ``ValidationResult``."""
        request_body = {"data": data}
        response_body = self.remote_endpoint_client.post_to_remote_endpoint(content=request_body)
        return ValidationResult.model_validate(response_body)
@@ -0,0 +1,60 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import logging
5
+ import re
6
+
7
+ import pandas as pd
8
+ import sqlfluff
9
+
10
+ from data_designer.config.utils.code_lang import CodeLang
11
+ from data_designer.config.validator_params import CodeValidatorParams
12
+ from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
13
+
14
# Raise the "sqlfluff" logger's threshold to WARNING at import time so lower-level
# records from that logger are not emitted.
sqlfluff_logger = logging.getLogger("sqlfluff")
sqlfluff_logger.setLevel(logging.WARNING)
16
+
17
+
18
class SQLValidator(BaseValidator):
    """Validate SQL snippets by linting them with sqlfluff.

    A query is invalid when sqlfluff reports any ``PRS``-prefixed finding, or
    when it contains a ``DECIMAL(<digits>)`` definition without a scale.
    """

    # Compiled once for the class instead of on every query.
    _DECIMAL_WITHOUT_SCALE = re.compile(r"DECIMAL\(\d+\)")

    def __init__(self, config: CodeValidatorParams):
        """Keep the validator configuration (``code_lang`` selects the dialect)."""
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Validate every record of a single-column batch.

        Args:
            data: Records with exactly one key each, whose value is the query.

        Returns:
            A ``ValidationResult`` with one output per record, in input order.
            An empty batch yields an empty result.

        Raises:
            ValueError: If the batch does not have exactly one column.
        """
        df = pd.DataFrame(data)

        # An empty batch has no column to read; there is nothing to validate.
        if df.shape == (0, 0):
            return ValidationResult(data=[])

        if len(df.columns) != 1:
            raise ValueError("SQL validator assumes single column input")
        target_column = df.columns[0]

        records = df.to_dict(orient="records")
        return ValidationResult(data=[self._validate_query(record[target_column]) for record in records])

    def _validate_query(self, content: str) -> ValidationOutput:
        """Lint one query and report whether it is valid.

        Args:
            content: The SQL text to check.

        Returns:
            A ``ValidationOutput`` whose ``error_messages`` holds one line per
            problem (empty string when valid). Any exception raised while
            linting is reported as an invalid output rather than propagated.
        """
        try:
            lint_findings = sqlfluff.lint(
                content,
                dialect=CodeLang.parse_dialect(self.config.code_lang),
            )
            problems = [
                f"{finding['code']}: {finding['description']}"
                for finding in lint_findings
                if finding["code"].startswith("PRS")
            ]
            if self._DECIMAL_WITHOUT_SCALE.search(content):
                problems.append(
                    "Custom Check: Found DECIMAL definitions without a scale, which may be incorrect."
                )
            # Joining the collected lines avoids a stray leading newline when the
            # DECIMAL check is the only problem found.
            return ValidationOutput(is_valid=not problems, error_messages="\n".join(problems))
        except Exception as e:
            # Deliberate best-effort: a failing lint marks the record invalid
            # instead of aborting the whole batch.
            return ValidationOutput(
                is_valid=False,
                error_messages=f"Exception during SQL parsing: {e}",
            )
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+
5
class DataDesignerError(Exception):
    """Exception type defined at the package root.

    NOTE(review): presumably the common base for package-specific errors;
    subclasses are not visible from this module.
    """
@@ -0,0 +1,137 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from ..logging import LoggingConfig, configure_logging
4
+
5
+ configure_logging(LoggingConfig.default())
6
+
7
+ from ..config.analysis.column_profilers import JudgeScoreProfilerConfig
8
+ from ..config.column_configs import (
9
+ ExpressionColumnConfig,
10
+ LLMCodeColumnConfig,
11
+ LLMJudgeColumnConfig,
12
+ LLMStructuredColumnConfig,
13
+ LLMTextColumnConfig,
14
+ SamplerColumnConfig,
15
+ Score,
16
+ SeedDatasetColumnConfig,
17
+ ValidationColumnConfig,
18
+ )
19
+ from ..config.column_types import DataDesignerColumnType
20
+ from ..config.config_builder import DataDesignerConfigBuilder
21
+ from ..config.data_designer_config import DataDesignerConfig
22
+ from ..config.dataset_builders import BuildStage
23
+ from ..config.datastore import DatastoreSettings
24
+ from ..config.models import (
25
+ ImageContext,
26
+ ImageFormat,
27
+ InferenceParameters,
28
+ ManualDistribution,
29
+ ManualDistributionParams,
30
+ Modality,
31
+ ModalityContext,
32
+ ModalityDataType,
33
+ ModelConfig,
34
+ UniformDistribution,
35
+ UniformDistributionParams,
36
+ )
37
+ from ..config.processors import DropColumnsProcessorConfig, ProcessorType
38
+ from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
39
+ from ..config.sampler_params import (
40
+ BernoulliMixtureSamplerParams,
41
+ BernoulliSamplerParams,
42
+ BinomialSamplerParams,
43
+ CategorySamplerParams,
44
+ DatetimeSamplerParams,
45
+ GaussianSamplerParams,
46
+ PersonFromFakerSamplerParams,
47
+ PersonSamplerParams,
48
+ PoissonSamplerParams,
49
+ SamplerType,
50
+ ScipySamplerParams,
51
+ SubcategorySamplerParams,
52
+ TimeDeltaSamplerParams,
53
+ UniformSamplerParams,
54
+ UUIDSamplerParams,
55
+ )
56
+ from ..config.seed import DatastoreSeedDatasetReference, IndexRange, PartitionBlock, SamplingStrategy, SeedConfig
57
+ from ..config.utils.code_lang import CodeLang
58
+ from ..config.utils.info import InfoType
59
+ from ..config.utils.misc import can_run_data_designer_locally
60
+ from ..config.validator_params import (
61
+ CodeValidatorParams,
62
+ RemoteValidatorParams,
63
+ ValidatorType,
64
+ )
65
+
66
# Names that are only exported when the local-execution imports below succeed.
local_library_imports = []
try:
    if can_run_data_designer_locally():
        from ..config.validator_params import LocalCallableValidatorParams  # noqa: F401
        from ..engine.model_provider import ModelProvider  # noqa: F401
        from ..interface.data_designer import DataDesigner  # noqa: F401

        local_library_imports = ["DataDesigner", "LocalCallableValidatorParams", "ModelProvider"]
except ModuleNotFoundError:
    # A missing module leaves local_library_imports empty, so none of the three
    # names above is added to __all__.
    pass

# Names exported by `from ... import *`; always available regardless of the
# optional imports above.
__all__ = [
    "BernoulliMixtureSamplerParams",
    "BernoulliSamplerParams",
    "BinomialSamplerParams",
    "CategorySamplerParams",
    "CodeLang",
    "CodeValidatorParams",
    "ColumnInequalityConstraint",
    "configure_logging",
    "DataDesignerColumnType",
    "DataDesignerConfig",
    "DataDesignerConfigBuilder",
    "BuildStage",
    "DatastoreSeedDatasetReference",
    "DatastoreSettings",
    "DatetimeSamplerParams",
    "DropColumnsProcessorConfig",
    "ExpressionColumnConfig",
    "GaussianSamplerParams",
    "IndexRange",
    "InfoType",
    "ImageContext",
    "ImageFormat",
    "InferenceParameters",
    "JudgeScoreProfilerConfig",
    "LLMCodeColumnConfig",
    "LLMJudgeColumnConfig",
    "LLMStructuredColumnConfig",
    "LLMTextColumnConfig",
    "LoggingConfig",
    "ManualDistribution",
    "ManualDistributionParams",
    "Modality",
    "ModalityContext",
    "ModalityDataType",
    "ModelConfig",
    "PartitionBlock",
    "PersonSamplerParams",
    "PersonFromFakerSamplerParams",
    "PoissonSamplerParams",
    "ProcessorType",
    "RemoteValidatorParams",
    "SamplerColumnConfig",
    "SamplerType",
    "SamplingStrategy",
    "ScalarInequalityConstraint",
    "ScipySamplerParams",
    "Score",
    "SeedConfig",
    "SeedDatasetColumnConfig",
    "SubcategorySamplerParams",
    "TimeDeltaSamplerParams",
    "UniformDistribution",
    "UniformDistributionParams",
    "UniformSamplerParams",
    "UUIDSamplerParams",
    "ValidationColumnConfig",
    "ValidatorType",
]

# Append the optional names (empty list when local execution is unavailable).
__all__.extend(local_library_imports)
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0