data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,254 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import ast
7
- import json
8
- import logging
9
- import subprocess
10
- import tempfile
11
- from collections import defaultdict
12
- from pathlib import Path
13
- from typing import TYPE_CHECKING
14
- from uuid import uuid4
15
-
16
- from pydantic import BaseModel
17
- from ruff.__main__ import find_ruff_bin
18
-
19
- from data_designer.config.validator_params import CodeValidatorParams
20
- from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
21
- from data_designer.lazy_heavy_imports import pd
22
-
23
- if TYPE_CHECKING:
24
- import pandas as pd
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
- PYLINT_ERROR_CATEGORIES_ORDERED = [
29
- "fatal",
30
- "error",
31
- "warning",
32
- "convention",
33
- "refactor",
34
- ]
35
- PYLINT_VALID_LEVELS = {"none", "warning", "convention", "refactor"}
36
-
37
- TYPE_FROM_SYMBOL = {
38
- "E": "refactor",
39
- "F": "error",
40
- "SIM": "refactor",
41
- "PLC": "convention",
42
- "PLE": "error",
43
- "PLR": "refactor",
44
- "PLW": "warning",
45
- "SyntaxError": "fatal",
46
- }
47
-
48
- PYTHON_MESSAGES_FIELD = "python_linter_messages"
49
- RECORD_ID_COLUMN_NAME = "internal_code_record_id"
50
-
51
-
52
class PythonValidationStat(BaseModel):
    """Per-module linter message counts used to derive a pylint-style score.

    Each of the first five fields counts the messages of the matching
    category; ``statement`` is the statement count of the module and acts as
    the denominator of the score.
    """

    fatal: int = 0
    error: int = 0
    warning: int = 0
    refactor: int = 0
    convention: int = 0
    statement: int = 0

    @property
    def score(self) -> float:
        """Return a score in the range 0-10 following pylint's evaluation formula.

        Any fatal message forces the score to 0. Otherwise errors weigh five
        times as much as the other categories and the weighted total is
        normalised by the statement count.
        """
        # https://pylint.pycqa.org/en/latest/user_guide/configuration/all-options.html#evaluation
        if self.fatal:
            return 0.0
        # Use a local value: reading the score must not rewrite the stored
        # statement count. Clamping to 1 prevents division by zero.
        statements = max(1, self.statement)
        weighted_issues = 5 * self.error + self.warning + self.refactor + self.convention
        return max(0.0, 10.0 - (float(weighted_issues) / statements) * 10)
74
-
75
-
76
class PythonLinterMessage(BaseModel):
    """A single linter diagnostic together with its position in a module."""

    type: str  # severity category, e.g. "error" or "warning"
    symbol: str  # linter rule code
    line: int
    column: int
    message: str

    @property
    def type_sort_order(self) -> int:
        """Return the rank of ``type`` in ``PYLINT_ERROR_CATEGORIES_ORDERED``.

        Lower ranks come earlier in that list.

        Raises:
            ValueError: If ``type`` is not listed in the ordered categories.
        """
        severity_rank = PYLINT_ERROR_CATEGORIES_ORDERED.index(self.type)
        return severity_rank
86
-
87
-
88
class PythonLinterMessages(BaseModel):
    """Collection of the linter messages reported for one module."""

    # Messages in insertion order; sorting happens on read in ``messages``.
    _messages: list[PythonLinterMessage] = []

    @property
    def messages(self) -> list[PythonLinterMessage]:
        """Return a new list of the messages ordered by severity, then by line number."""
        return sorted(self._messages, key=lambda msg: (msg.type_sort_order, msg.line))

    def add(self, message: PythonLinterMessage) -> None:
        """Append ``message`` to the collection."""
        self._messages.append(message)

    def get_count_by_type(self) -> dict[str, int]:
        """Return how many messages were recorded for each message type."""
        count_by_type: defaultdict[str, int] = defaultdict(int)
        for message in self.messages:
            count_by_type[message.type] += 1
        return dict(count_by_type)

    @property
    def is_empty(self) -> bool:
        """Whether no message has been added."""
        # Emptiness does not depend on ordering, so skip the sort entirely.
        return not self._messages

    @property
    def severity(self) -> str:
        """Return the type of the most severe message, or ``"none"`` when empty."""
        ordered = self.messages  # sort once and reuse
        return ordered[0].type if ordered else "none"

    @property
    def is_valid(self) -> bool:
        """Whether the most severe message type is one of ``PYLINT_VALID_LEVELS``."""
        # "none" is itself a valid level, so the empty case needs no special branch.
        return self.severity in PYLINT_VALID_LEVELS
118
-
119
-
120
class PythonValidator(BaseValidator):
    """Validate Python code snippets by linting them with ruff.

    Each record's code is written to its own module in a temporary directory,
    the directory is linted with a single ruff invocation, and every module
    receives a score, a severity and its list of linter messages.
    """

    def __init__(self, config: CodeValidatorParams):
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Lint the single code column found in ``data``.

        Args:
            data: Records that each hold exactly one column of Python source.

        Returns:
            A ``ValidationResult`` whose entries follow the order of ``data``.

        Raises:
            ValueError: If the records contain more than one column.
        """
        if not data:
            # Nothing to lint; also avoids indexing the columns of an empty frame.
            return ValidationResult(data=[])

        df = pd.DataFrame(data)

        if len(df.columns) > 1:
            raise ValueError("Python validator assumes single column input")
        target_column = df.columns[0]

        # A random id per record gives every snippet a unique module name.
        df.loc[:, RECORD_ID_COLUMN_NAME] = [uuid4() for _ in range(df.shape[0])]
        with tempfile.TemporaryDirectory() as temp_dir:
            df.apply(
                self._write_code_to_file,
                args=(target_column, temp_dir),
                axis=1,
            )
            results = self._validate_files_in_path(path=temp_dir)

        # Re-associate the per-module results with the records, in input order.
        ordered_results = []
        for record in df.to_dict(orient="records"):
            module_id = self._get_module_name(record[RECORD_ID_COLUMN_NAME], target_column)
            result = results.get(module_id)
            if result is not None:
                ordered_results.append(result)

        return ValidationResult(data=ordered_results)

    def _validate_files_in_path(self, path: str) -> dict[str, ValidationOutput]:
        """Lint every module under ``path`` and build one ``ValidationOutput`` per module."""
        lint_results = self._run_linter(path)

        stats_by_module = {
            module: messages.get_count_by_type()
            | {"statement": self._count_python_statements(f"{path}/{module}.py")}
            for module, messages in lint_results.items()
        }
        scores_by_module = self._get_scores(stats_by_module)

        validation_result = {}
        for module, score in scores_by_module.items():
            messages = lint_results.get(module, PythonLinterMessages())
            metadata = {
                "python_linter_score": score,
                "python_linter_severity": messages.severity,
                PYTHON_MESSAGES_FIELD: [m.model_dump() for m in messages.messages],
            }
            validation_result[module] = ValidationOutput(is_valid=messages.is_valid, **metadata)
        return validation_result

    def _write_code_to_file(self, row: pd.Series, code_column: str, path: str) -> None:
        """Write the code held in ``row[code_column]`` to that row's module file under ``path``."""
        module_name = self._get_module_name(row[RECORD_ID_COLUMN_NAME], code_column)
        # Write UTF-8 explicitly: _count_python_statements reads the file back
        # as UTF-8, so the platform default encoding must not be used here.
        with open(f"{path}/{module_name}.py", "w", encoding="utf-8") as file:
            file.write(row[code_column])

    @staticmethod
    def _get_module_name(record_id: str, column_name: str) -> str:
        """Return the module (file stem) name used for a record's code."""
        return f"{record_id}_{column_name}"

    @staticmethod
    def _run_linter(codebase_path: str) -> dict[str, PythonLinterMessages]:
        """Run ruff over ``codebase_path`` and group its diagnostics by module name.

        Returns:
            A mapping from file stem to the messages reported for that file.
            Every ``*.py`` file in the directory has an entry, even when ruff
            reports nothing for it.

        Raises:
            RuntimeError: If ruff's output cannot be parsed as JSON.
        """
        # Start with an empty message collection for every module on disk.
        processed = {file.stem: PythonLinterMessages() for file in Path(codebase_path).glob("*.py")}

        # Run ruff linter with JSON output
        ruff_bin = find_ruff_bin()
        ruff_exec = subprocess.run(
            [
                ruff_bin,
                "check",
                "--select",
                "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
                "--output-format=json",
                codebase_path,
            ],
            text=True,
            capture_output=True,
            # ruff exits non-zero when it finds violations, so do not raise on that.
            check=False,
            cwd=Path.cwd(),
        )

        # Parse JSON output
        try:
            diagnostics = json.loads(ruff_exec.stdout)
        except json.JSONDecodeError as e:
            raise RuntimeError(f"Failed to parse ruff JSON output: {e}") from e

        if not diagnostics:
            return processed  # no errors or warnings

        for diagnostic in diagnostics:
            raw_code = diagnostic.get("code")
            # Diagnostics without a rule code would otherwise crash the prefix
            # extraction below; route them to the "SyntaxError" (fatal) mapping.
            # NOTE(review): assumes ruff reports syntax errors with a null code,
            # or with "invalid-syntax" in some releases - confirm against the
            # pinned ruff version.
            if raw_code is None or raw_code == "invalid-syntax":
                symbol = raw_code or "SyntaxError"
                prefix = "SyntaxError"
            else:
                symbol = raw_code
                # Extract alphabetic prefix from code for type mapping
                prefix = "".join(c for c in raw_code if c.isalpha())
            error_type = TYPE_FROM_SYMBOL.get(prefix, "warning")

            location = diagnostic["location"]
            module_messages = processed.setdefault(Path(diagnostic["filename"]).stem, PythonLinterMessages())
            module_messages.add(
                PythonLinterMessage(
                    type=error_type,
                    symbol=symbol,
                    line=location["row"],
                    column=location["column"],
                    message=diagnostic["message"],
                )
            )

        return processed

    @staticmethod
    def _get_scores(stats_by_module: dict[str, dict[str, int]]) -> dict[str, float]:
        """Turn per-module message counts into per-module scores."""
        return {module: PythonValidationStat(**stats).score for module, stats in stats_by_module.items()}

    @staticmethod
    def _count_python_statements(file_path: str) -> int:
        """Count the number of statements in a Python file.

        Returns 0 when the file cannot be read or parsed.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                tree = ast.parse(f.read())
            return sum(1 for node in ast.walk(tree) if isinstance(node, ast.stmt))
        except Exception:
            # Deliberate best-effort: unparsable code is reported by the linter
            # itself, so the statement count simply falls back to 0.
            return 0
@@ -1,89 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- from typing import TYPE_CHECKING
8
-
9
- from httpx_retries import Retry, RetryTransport
10
-
11
- from data_designer.config.validator_params import RemoteValidatorParams
12
- from data_designer.engine.errors import RemoteValidationSchemaError
13
- from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
14
- from data_designer.engine.processing.gsonschema.validators import validate
15
- from data_designer.engine.validators.base import BaseValidator, ValidationResult
16
- from data_designer.lazy_heavy_imports import httpx
17
-
18
- if TYPE_CHECKING:
19
- import httpx
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
-
24
class RemoteEndpointClient:
    """HTTP client that posts JSON payloads to a remote endpoint with retries and a timeout."""

    def __init__(self, config: RemoteValidatorParams):
        """Copy the connection settings out of ``config``.

        Args:
            config: Remote validator parameters
        """
        self.endpoint_url = config.endpoint_url
        self.output_schema = config.output_schema
        self.timeout = config.timeout
        self.max_retries = config.max_retries
        self.retry_backoff = config.retry_backoff

    def post_to_remote_endpoint(self, content: dict) -> dict:
        """POST ``content`` as JSON and return the decoded JSON response.

        Responses with status 429, 500, 502, 503 or 504 are retried according
        to the configured retry count and backoff factor.

        Args:
            content: The content to be posted to the remote endpoint

        Returns:
            The JSON response from the remote endpoint

        Raises:
            httpx.RequestError: If all retry attempts fail
            httpx.HTTPStatusError: If the server returns an error status
            RemoteValidationSchemaError: If an output schema is configured and
                the response does not satisfy it
        """
        retry_policy = Retry(
            total=self.max_retries,
            backoff_factor=self.retry_backoff,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        client_options = {
            "timeout": httpx.Timeout(self.timeout),
            "transport": RetryTransport(retry=retry_policy),
        }

        with httpx.Client(**client_options) as http_client:
            response = http_client.post(self.endpoint_url, json=content)
            response.raise_for_status()

        payload = response.json()
        if not self.output_schema:
            return payload

        # Only check the payload when a schema was configured.
        try:
            validate(payload, self.output_schema, no_extra_properties=True)
        except JSONSchemaValidationError as exc:
            raise RemoteValidationSchemaError(str(exc)) from exc
        return payload
81
-
82
-
83
class RemoteValidator(BaseValidator):
    """Validator that hands the records to a remote endpoint for validation."""

    def __init__(self, config: RemoteValidatorParams):
        self.remote_endpoint_client = RemoteEndpointClient(config=config)

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Post ``data`` wrapped as ``{"data": data}`` and parse the reply into a ``ValidationResult``."""
        request_body = {"data": data}
        response_body = self.remote_endpoint_client.post_to_remote_endpoint(content=request_body)
        return ValidationResult.model_validate(response_body)
@@ -1,65 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- import re
8
- from typing import TYPE_CHECKING
9
-
10
- from data_designer.config.utils.code_lang import CodeLang
11
- from data_designer.config.validator_params import CodeValidatorParams
12
- from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
13
- from data_designer.lazy_heavy_imports import pd, sqlfluff
14
-
15
- if TYPE_CHECKING:
16
- import pandas as pd
17
- import sqlfluff
18
-
19
- sqlfluff_logger = logging.getLogger("sqlfluff")
20
- sqlfluff_logger.setLevel(logging.WARNING)
21
-
22
-
23
class SQLValidator(BaseValidator):
    """Validate SQL snippets by linting them with sqlfluff in the configured dialect."""

    def __init__(self, config: CodeValidatorParams):
        self.config = config

    def run_validation(self, data: list[dict]) -> ValidationResult:
        """Validate the single SQL column found in ``data``.

        Args:
            data: Records that each hold exactly one column containing a SQL query.

        Returns:
            A ``ValidationResult`` with one entry per record, in input order.

        Raises:
            ValueError: If the records contain more than one column.
        """
        if not data:
            # Nothing to validate; also avoids indexing the columns of an empty frame.
            return ValidationResult(data=[])

        df = pd.DataFrame(data)

        if len(df.columns) > 1:
            raise ValueError("SQL validator assumes single column input")
        target_column = df.columns[0]

        results = [self._validate_query(record[target_column]) for record in df.to_dict(orient="records")]
        return ValidationResult(data=results)

    def _validate_query(self, content: str) -> ValidationOutput:
        """Lint one query and report parse errors plus a scale-less DECIMAL check.

        Only sqlfluff findings whose code starts with ``PRS`` count as errors.
        Any exception raised while linting is reported as an invalid result
        rather than propagated.
        """
        try:
            findings = sqlfluff.lint(
                content,
                dialect=CodeLang.parse_dialect(self.config.code_lang),
            )
            problems = [
                f"{finding['code']}: {finding['description']}"
                for finding in findings
                if finding["code"].startswith("PRS")
            ]
            if re.search(r"DECIMAL\(\d+\)", content):
                problems.append("Custom Check: Found DECIMAL definitions without a scale, which may be incorrect.")
            # Joining the collected lines avoids a stray leading newline when the
            # DECIMAL check is the only finding.
            return ValidationOutput(is_valid=not problems, error_messages="\n".join(problems))
        except Exception as e:
            # Deliberate catch-all: a linter failure marks the query invalid
            # instead of aborting the whole validation run.
            return ValidationOutput(
                is_valid=False,
                error_messages=f"Exception during SQL parsing: {e}",
            )
data_designer/errors.py DELETED
@@ -1,7 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
-
7
class DataDesignerError(Exception):
    """Exception type named after the Data Designer package; adds nothing to ``Exception``."""
@@ -1,33 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from data_designer.config.default_model_settings import resolve_seed_default_model_settings
7
- from data_designer.config.exports import * # noqa: F403
8
- from data_designer.config.run_config import RunConfig
9
- from data_designer.config.validator_params import LocalCallableValidatorParams
10
- from data_designer.interface.data_designer import DataDesigner
11
- from data_designer.logging import LoggingConfig, configure_logging
12
-
13
- configure_logging(LoggingConfig.default())
14
-
15
- # Resolve default model settings on import to ensure they are available when the library is used.
16
- resolve_seed_default_model_settings()
17
-
18
-
19
def get_essentials_exports() -> list[str]:
    """Return the names exported by this module.

    The list holds the logging helpers first, then the locally imported
    classes, then whatever ``get_config_exports()`` returns.
    """
    logging_names = [configure_logging.__name__, LoggingConfig.__name__]
    local_names = [
        DataDesigner.__name__,
        LocalCallableValidatorParams.__name__,
        RunConfig.__name__,
    ]
    config_names = get_config_exports()  # noqa: F405
    return logging_names + local_names + config_names
31
-
32
-
33
- __all__ = get_essentials_exports()
@@ -1,54 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- """
5
- Lazy imports facade for heavy third-party dependencies.
6
-
7
- This module provides a centralized facade that lazily imports heavy dependencies
8
- only when accessed, significantly improving import performance.
9
-
10
- Usage:
11
- from data_designer.lazy_heavy_imports import pd, np, faker, litellm
12
-
13
- df = pd.DataFrame(...)
14
- arr = np.array([1, 2, 3])
15
- fake = faker.Faker()
16
- """
17
-
18
- from __future__ import annotations
19
-
20
- import importlib
21
-
22
- # Mapping of lazy import names to their actual module paths
23
- _LAZY_IMPORTS = {
24
- "pd": "pandas",
25
- "np": "numpy",
26
- "pq": "pyarrow.parquet",
27
- "pa": "pyarrow",
28
- "faker": "faker",
29
- "litellm": "litellm",
30
- "sqlfluff": "sqlfluff",
31
- "httpx": "httpx",
32
- "duckdb": "duckdb",
33
- "nx": "networkx",
34
- "scipy": "scipy",
35
- "jsonschema": "jsonschema",
36
- }
37
-
38
-
39
def __getattr__(name: str) -> object:
    """Import and return the module registered under the alias ``name``.

    The import is deferred until the alias is first looked up on this module.

    Raises:
        AttributeError: If ``name`` is not a key of ``_LAZY_IMPORTS``.
    """
    module_path = _LAZY_IMPORTS.get(name)
    if module_path is None:
        raise AttributeError(f"module 'data_designer.lazy_heavy_imports' has no attribute {name!r}")
    return importlib.import_module(module_path)
50
-
51
-
52
def __dir__() -> list[str]:
    """List the alias names that can be lazily imported from this module."""
    return list(_LAZY_IMPORTS)
data_designer/logging.py DELETED
@@ -1,163 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- import random
8
- import sys
9
- from dataclasses import dataclass, field
10
- from pathlib import Path
11
- from typing import TextIO
12
-
13
- from pythonjsonlogger import jsonlogger
14
-
15
-
16
@dataclass
class LoggerConfig:
    """Name of a logger paired with the level it should be set to."""

    name: str  # passed to logging.getLogger
    level: str  # passed to Logger.setLevel
20
-
21
-
22
- @dataclass
23
- class OutputConfig:
24
- destination: TextIO | Path
25
- structured: bool
26
-
27
-
28
@dataclass
class LoggingConfig:
    """Complete logging setup: per-logger levels, outputs, root level and loggers to quiet."""

    logger_configs: list[LoggerConfig]
    output_configs: list[OutputConfig]
    root_level: str = "INFO"
    # Copy the module-level default so every config owns its own list;
    # returning the shared list would let one instance's edits leak into the
    # defaults of all later instances.
    to_silence: list[str] = field(default_factory=lambda: list(_DEFAULT_NOISY_LOGGERS))

    @classmethod
    def _stderr_config(cls, level: str) -> LoggingConfig:
        """Build a config that logs ``data_designer`` at ``level`` as plain text to stderr."""
        return cls(
            logger_configs=[LoggerConfig(name="data_designer", level=level)],
            output_configs=[OutputConfig(destination=sys.stderr, structured=False)],
        )

    @classmethod
    def default(cls) -> LoggingConfig:
        """Return the default configuration: ``data_designer`` at INFO, unstructured, on stderr."""
        return cls._stderr_config("INFO")

    @classmethod
    def debug(cls) -> LoggingConfig:
        """Return the debug configuration: ``data_designer`` at DEBUG, unstructured, on stderr."""
        return cls._stderr_config("DEBUG")
48
-
49
-
50
class RandomEmoji:
    """Namespace of static pickers that each return one random emoji from a themed set."""

    @staticmethod
    def cooking() -> str:
        """Pick a random cooking or food preparation emoji."""
        palette = ["👨‍🍳", "👩‍🍳", "🍳", "🥘", "🍲", "🔪", "🥄", "🍴", "⏲️", "🥗"]
        return random.choice(palette)

    @staticmethod
    def data() -> str:
        """Pick a random data or analytics emoji."""
        palette = ["📊", "📈", "📉", "💾", "💿", "📀", "🗄️", "📁", "📂", "🗃️"]
        return random.choice(palette)

    @staticmethod
    def generating() -> str:
        """Pick a random generating or creating emoji."""
        palette = ["🏭", "⚙️", "🔨", "🛠️", "🏗️", "🎨", "✍️", "📝", "🔧", "⚒️"]
        return random.choice(palette)

    @staticmethod
    def loading() -> str:
        """Pick a random loading or waiting emoji."""
        palette = ["⏳", "⌛", "🔄", "♻️", "🔃", "⏰", "⏱️", "⏲️", "📡", "🌀"]
        return random.choice(palette)

    @staticmethod
    def magic() -> str:
        """Pick a random magical or special effect emoji."""
        palette = ["✨", "⭐", "🌟", "💫", "🪄", "🔮", "🎩", "🌈", "💎", "🦄"]
        return random.choice(palette)

    @staticmethod
    def previewing() -> str:
        """Pick a random previewing or looking ahead emoji."""
        palette = ["👀", "📺", "🔁", "👁️", "🔭", "🕵️", "🧐", "📸", "🎥", "🖼️"]
        return random.choice(palette)

    @staticmethod
    def speed() -> str:
        """Pick a random speed or fast emoji."""
        palette = ["⚡", "💨", "🏃", "🏎️", "🚄", "✈️", "💥", "⏩", "🏃‍♂️", "🏃‍♀️"]
        return random.choice(palette)

    @staticmethod
    def start() -> str:
        """Pick a random emoji representing starting or launching something."""
        palette = ["🚀", "▶️", "🎬", "🌅", "🏁", "🎯", "🚦", "🔔", "📣", "🎺"]
        return random.choice(palette)

    @staticmethod
    def success() -> str:
        """Pick a random success or celebration emoji."""
        palette = ["🎉", "🎊", "👏", "🙌", "🎆", "🍾", "☀️", "🏆", "✅", "🥳"]
        return random.choice(palette)

    @staticmethod
    def thinking() -> str:
        """Pick a random thinking or processing emoji."""
        palette = ["🤔", "💭", "🧠", "💡", "🔍", "🔎", "🤨", "🧐", "📝", "🧮"]
        return random.choice(palette)

    @staticmethod
    def working() -> str:
        """Pick a random working or in-progress emoji."""
        palette = ["⚙️", "🔧", "🔨", "⚒️", "🛠️", "💼", "👷", "🏗️", "🪛", "👨‍💻"]
        return random.choice(palette)
107
-
108
-
109
def configure_logging(config: LoggingConfig) -> None:
    """Reset the root logger and apply ``config``.

    All handlers currently attached to the root logger are removed and
    closed, one handler per output config is attached, the root and
    per-logger levels are set, and every logger named in
    ``config.to_silence`` is quieted.
    """
    root_logger = logging.getLogger()

    # Remove all handlers. Close each one as well so that file handlers from a
    # previous configuration release their file instead of leaking it.
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        stale_handler.close()

    # Create and attach handler(s); build them all first so a failure while
    # creating one leaves no partially attached set behind.
    handlers = [_create_handler(output_config) for output_config in config.output_configs]
    for handler in handlers:
        root_logger.addHandler(handler)

    # Set levels
    root_logger.setLevel(config.root_level)
    for logger_config in config.logger_configs:
        logging.getLogger(logger_config.name).setLevel(logger_config.level)

    # Adjust noisy loggers
    for name in config.to_silence:
        quiet_noisy_logger(name)
129
-
130
-
131
def quiet_noisy_logger(name: str) -> None:
    """Detach every handler from the logger called ``name`` and set its level to WARNING."""
    noisy = logging.getLogger(name)
    del noisy.handlers[:]
    noisy.setLevel(logging.WARNING)
135
-
136
-
137
def _create_handler(output_config: OutputConfig) -> logging.Handler:
    """Build the logging handler described by ``output_config``.

    A ``Path`` destination yields a ``FileHandler``; any other destination is
    treated as a text stream and wrapped in a ``StreamHandler``. The
    formatter is the JSON one when ``structured`` is true and the plain-text
    one otherwise.
    """
    destination = output_config.destination
    if isinstance(destination, Path):
        # Explicit UTF-8 so non-ASCII log text (e.g. emoji) is written the same
        # way regardless of the platform's default encoding.
        handler: logging.Handler = logging.FileHandler(str(destination), encoding="utf-8")
    else:
        # Hand the configured stream to the handler; a bare StreamHandler()
        # would always write to sys.stderr and ignore the destination.
        handler = logging.StreamHandler(destination)

    formatter = _make_json_formatter() if output_config.structured else _make_stream_formatter()
    handler.setFormatter(formatter)
    return handler
150
-
151
-
152
def _make_json_formatter() -> logging.Formatter:
    """Return a ``jsonlogger.JsonFormatter`` built from the time, level, logger-name and message fields."""
    return jsonlogger.JsonFormatter("%(asctime)s %(levelname)s %(name)s %(message)s")
155
-
156
-
157
- def _make_stream_formatter() -> logging.Formatter:
158
- log_format = "[%(asctime)s] [%(levelname)s] %(message)s"
159
- time_format = "%H:%M:%S"
160
- return logging.Formatter(log_format, time_format)
161
-
162
-
163
- _DEFAULT_NOISY_LOGGERS = ["httpx", "matplotlib"]