data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,336 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import Enum
7
+ from string import Formatter
8
+ from typing import Optional
9
+
10
+ from jinja2 import meta
11
+ from jinja2.sandbox import ImmutableSandboxedEnvironment
12
+ from pydantic import BaseModel
13
+ from rich import box
14
+ from rich.console import Console, Group
15
+ from rich.padding import Padding
16
+ from rich.panel import Panel
17
+
18
+ from ..column_types import ColumnConfigT, DataDesignerColumnType, column_type_is_llm_generated
19
+ from ..processors import ProcessorConfig, ProcessorType
20
+ from ..validator_params import ValidatorType
21
+ from .constants import RICH_CONSOLE_THEME
22
+ from .misc import can_run_data_designer_locally
23
+
24
+
25
class ViolationType(str, Enum):
    """Categories of problems that config validation can report.

    Each member is also a ``str``; its value is the lower-case form of its
    name. ``rich_print_violations`` shows the upper-cased value in the
    rendered report.
    """

    # --- dataset-level ---
    # Every generated (non-seed) column is marked to be dropped.
    ALL_COLUMNS_DROPPED = "all_columns_dropped"

    # --- code validation columns ---
    # The validation target is not present in the column list.
    CODE_COLUMN_MISSING = "code_column_missing"
    # The validation target exists but is not an LLM code column.
    CODE_COLUMN_NOT_CODE = "code_column_not_code"
    # Validator language and target column language disagree.
    CODE_LANG_MISMATCH = "code_lang_mismatch"

    # --- expression columns ---
    # An expression requires a column that is not an allowed reference.
    EXPRESSION_REFERENCE_MISSING = "expression_reference_missing"

    # --- prompt templates / misc ---
    # A prompt references a column with ``{name}`` instead of ``{{ name }}``.
    F_STRING_SYNTAX = "f_string_syntax"
    # The column can only be used when running locally.
    LOCAL_ONLY_COLUMN = "local_only_column"
    # A processor refers to a column that is not defined.
    INVALID_COLUMN = "invalid_column"
    # NOTE(review): not emitted by any validator visible in this module.
    INVALID_MODEL_CONFIG = "invalid_model_config"
    # A prompt template references a name that is not an allowed reference.
    INVALID_REFERENCE = "invalid_reference"
    # A prompt template references no columns at all.
    PROMPT_WITHOUT_REFERENCES = "prompt_without_references"
37
+
38
+
39
class ViolationLevel(str, Enum):
    """Severity of a reported violation.

    Members are also ``str`` instances, so string methods such as
    ``.upper()`` operate on the member's value.
    """

    # Reported by the checks in this module for missing or inconsistent
    # definitions (e.g. unknown references, language mismatches).
    ERROR = "ERROR"
    # Reported for advisory findings (e.g. prompts without references).
    WARNING = "WARNING"
42
+
43
+
44
# A single finding produced by one of the config validators in this module.
# (Documented with comments rather than a class docstring so the pydantic
# model's generated schema description is left untouched.)
class Violation(BaseModel):
    # Name of the column the finding is about; None for findings that are
    # not tied to one column (e.g. "all columns dropped").
    column: Optional[str] = None
    # Category of the problem.
    type: ViolationType
    # Human-readable explanation shown to the user.
    message: str
    # Severity of the finding.
    level: ViolationLevel

    @property
    def has_column(self) -> bool:
        """Tell whether this violation is attached to a specific column."""
        if self.column is None:
            return False
        return True
53
+
54
+
55
def validate_data_designer_config(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfig],
    allowed_references: list[str],
) -> list[Violation]:
    """Run every config validator and return their combined findings.

    Args:
        columns: Column configurations to validate.
        processor_configs: Processor configurations to validate against
            the columns.
        allowed_references: Names that prompt templates and expression
            columns are permitted to reference.

    Returns:
        All violations, in the order the individual validators ran:
        prompt templates, code validation, expression references,
        all-columns-dropped, drop-columns processors and, only when Data
        Designer cannot run locally, local-only columns.
    """
    # A list display evaluates its items left to right, so the validators
    # run in exactly the order they are written here.
    collected: list[Violation] = [
        *validate_prompt_templates(columns=columns, allowed_references=allowed_references),
        *validate_code_validation(columns=columns),
        *validate_expression_references(columns=columns, allowed_references=allowed_references),
        *validate_columns_not_all_dropped(columns=columns),
        *validate_drop_columns_processor(columns=columns, processor_configs=processor_configs),
    ]

    if can_run_data_designer_locally():
        return collected

    # Local-only features are flagged only when local execution is unavailable.
    collected += validate_local_only_columns(columns=columns)
    return collected
69
+
70
+
71
def rich_print_violations(violations: list[Violation]) -> None:
    """Print the given violations to the console as a stack of rich panels.

    Nothing is printed when ``violations`` is empty. Otherwise a summary
    panel with the number of issues is followed by one panel per violation
    showing its level, type, message and (when present) its column name.

    Args:
        violations: The findings to display.
    """
    total = len(violations)
    if total == 0:
        return

    # Every block gets one blank line of padding underneath it.
    bottom_gap = (0, 0, 1, 0)

    plural_suffix = "" if total == 1 else "s"
    summary = Panel(
        f"🔎 Identified {total} validation issue{plural_suffix} in your Data Designer column definitions",
        box=box.SIMPLE,
        highlight=True,
    )
    renderables = [Padding(summary, bottom_gap)]

    for violation in violations:
        if violation.level == ViolationLevel.ERROR:
            icon = "🛑"
        else:
            icon = "⚠️"

        heading = f"{icon} {violation.level.upper()} | {violation.type.value.upper()}"
        panel_title = f"Column: {violation.column}" if violation.has_column else ""

        detail = Panel(
            f"{heading}\n\n{violation.message}",
            box=box.HORIZONTALS,
            title=panel_title,
            padding=(1, 0, 1, 1),
            highlight=True,
        )
        renderables.append(Padding(detail, bottom_gap))

    # markup=False: messages are printed literally, not parsed as rich markup.
    Console(theme=RICH_CONSOLE_THEME).print(Group(*renderables), markup=False)
110
+
111
+
112
def validate_prompt_templates(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Check the prompt templates of LLM-generated columns.

    For each LLM-generated column, the ``prompt`` and ``system_prompt``
    attributes (when present and not ``None``) are parsed as Jinja2
    templates and checked for:

    * references to names not in ``allowed_references`` (ERROR),
    * a ``prompt`` that references nothing, unless the column has a
      non-``None`` ``multi_modal_context`` (WARNING),
    * allowed references written with ``{name}`` instead of ``{{ name }}``
      (WARNING).

    Args:
        columns: Column configurations; only those for which
            ``column_type_is_llm_generated`` is true are inspected.
        allowed_references: Names a template may reference.

    Returns:
        The violations found, grouped by column and prompt type in
        iteration order.
    """
    env = ImmutableSandboxedEnvironment()
    # Invariant across all columns and prompts, so build the set once.
    allowed = set(allowed_references)

    violations = []
    for column in columns:
        if not column_type_is_llm_generated(column.column_type):
            continue

        for prompt_type in ("prompt", "system_prompt"):
            prompt = getattr(column, prompt_type, None)
            if prompt is None:
                continue

            # check for invalid references
            prompt_references = set(meta.find_undeclared_variables(env.parse(prompt)))
            # Sorted so the message text is identical from run to run; plain
            # set iteration order for strings depends on hash randomization.
            invalid_references = sorted(prompt_references - allowed)
            num_invalid = len(invalid_references)
            if num_invalid > 0:
                ref_msg = (
                    f"references {num_invalid} columns that do not exist"
                    if num_invalid > 1
                    else "references a column that does not exist"
                )
                quoted_invalid = ", ".join(f"'{r}'" for r in invalid_references)
                message = f"The {prompt_type} template for '{column.name}' {ref_msg}: {quoted_invalid}."
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.INVALID_REFERENCE,
                        message=message,
                        level=ViolationLevel.ERROR,
                    )
                )

            # check for prompts without references; a column carrying
            # multi-modal context is exempt from this warning
            if (
                prompt_type == "prompt"
                and len(prompt_references) == 0
                and getattr(column, "multi_modal_context", None) is None
            ):
                message = (
                    f"The {prompt_type} template for '{column.name}' does not reference any columns. "
                    "This means the same prompt will be used for every row in the dataset. To increase "
                    "the diversity of the generated data, consider adding references to other columns "
                    "in the prompt template."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.PROMPT_WITHOUT_REFERENCES,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )

            # check for f-string syntax
            f_string_references = _get_string_formatter_references(prompt, allowed_references)
            if len(f_string_references) > 0:
                quoted_f_string = ", ".join(f"'{r}'" for r in f_string_references)
                message = (
                    f"The {prompt_type} template for '{column.name}' references the "
                    f"following columns using f-string syntax: {quoted_f_string}. "
                    "Please use jinja2 syntax to reference columns: {reference} -> {{ reference }}."
                )
                violations.append(
                    Violation(
                        column=column.name,
                        type=ViolationType.F_STRING_SYNTAX,
                        message=message,
                        level=ViolationLevel.WARNING,
                    )
                )
    return violations
190
+
191
+
192
def validate_code_validation(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Check that code validation columns point at compatible code columns.

    For every validation column whose ``validator_type`` equals ``"code"``,
    each name in its ``target_columns`` is checked:

    * the target must exist in ``columns`` (ERROR otherwise),
    * the target should be an LLM code column (WARNING otherwise),
    * the target's ``code_lang`` must equal the validator's configured
      ``code_lang`` (ERROR otherwise).

    Args:
        columns: All column configurations.

    Returns:
        The violations found, attributed to the validation column.
    """
    by_name = {}
    for col in columns:
        by_name[col.name] = col

    found = []

    def report(owner, kind, severity, text):
        # Every finding here is attributed to the validation column.
        found.append(Violation(column=owner.name, type=kind, message=text, level=severity))

    for validator in columns:
        is_code_validator = (
            validator.column_type == DataDesignerColumnType.VALIDATION and validator.validator_type == "code"
        )
        if not is_code_validator:
            continue

        for target_name in validator.target_columns:
            # The target has to be a known column before anything else
            # about it can be checked.
            if target_name not in by_name:
                report(
                    validator,
                    ViolationType.CODE_COLUMN_MISSING,
                    ViolationLevel.ERROR,
                    f"Target code column '{target_name}' not found in column list.",
                )
                continue

            target = by_name[target_name]

            # A non-code target only warrants a warning; the language
            # comparison below is skipped for it.
            if target.column_type != DataDesignerColumnType.LLM_CODE:
                report(
                    validator,
                    ViolationType.CODE_COLUMN_NOT_CODE,
                    ViolationLevel.WARNING,
                    f"Code validation column '{validator.name}' is set to validate "
                    f"code, but the target column was generated as {target.column_type}.",
                )
                continue

            if target.code_lang != validator.validator_params.code_lang:
                report(
                    validator,
                    ViolationType.CODE_LANG_MISMATCH,
                    ViolationLevel.ERROR,
                    f"Code validation column '{validator.name}' is set to validate "
                    f"{validator.validator_params.code_lang}, but the target column was generated as "
                    f"{target.code_lang}.",
                )

    return found
247
+
248
+
249
def validate_columns_not_all_dropped(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Report an error when no generated column would survive dropping.

    Seed-dataset columns are ignored; among the rest, at least one must
    have a falsy ``drop`` attribute.

    Args:
        columns: All column configurations.

    Returns:
        A one-element list with an ``ALL_COLUMNS_DROPPED`` error, or an
        empty list when at least one non-seed column is kept.
    """
    kept_count = 0
    for col in columns:
        if col.column_type != DataDesignerColumnType.SEED_DATASET and not col.drop:
            kept_count += 1

    if kept_count > 0:
        return []

    # Not tied to one column, hence column=None.
    all_dropped = Violation(
        column=None,
        type=ViolationType.ALL_COLUMNS_DROPPED,
        message="All generated columns are configured to be dropped. Please mark at least one column with `drop=False`.",
        level=ViolationLevel.ERROR,
    )
    return [all_dropped]
268
+
269
+
270
def validate_drop_columns_processor(
    columns: list[ColumnConfigT],
    processor_configs: list[ProcessorConfig],
) -> list[Violation]:
    """Check that drop-columns processors only name columns that exist.

    All processors of type ``ProcessorType.DROP_COLUMNS`` are inspected,
    not just the first one with a problem. Each unknown column name is
    reported once, in the order it is first encountered, so the output is
    deterministic.

    Args:
        columns: All column configurations; their names form the set of
            defined columns.
        processor_configs: Processor configurations to check.

    Returns:
        One ``INVALID_COLUMN`` error per undefined column name, or an empty
        list when every named column is defined.
    """
    defined_names = {c.name for c in columns}

    # A dict is used as an insertion-ordered set: unknown names are kept in
    # first-seen order and duplicates across processors are reported once.
    unknown_names: dict[str, None] = {}
    for processor_config in processor_configs:
        if processor_config.processor_type != ProcessorType.DROP_COLUMNS:
            continue
        for column_name in processor_config.column_names:
            if column_name not in defined_names:
                unknown_names.setdefault(column_name)

    return [
        Violation(
            column=column_name,
            type=ViolationType.INVALID_COLUMN,
            # Plain interpolation: the literal quotes already delimit the
            # name, so `!r` would render it with doubled quotes.
            message=(
                f"Drop columns processor is configured to drop column '{column_name}', "
                "but the column is not defined."
            ),
            level=ViolationLevel.ERROR,
        )
        for column_name in unknown_names
    ]
289
+
290
+
291
def validate_expression_references(
    columns: list[ColumnConfigT],
    allowed_references: list[str],
) -> list[Violation]:
    """Check that expression columns only require allowed references.

    Args:
        columns: All column configurations; only expression columns are
            inspected.
        allowed_references: Names an expression may reference.

    Returns:
        One ``EXPRESSION_REFERENCE_MISSING`` error for each entry in an
        expression column's ``required_columns`` that is not in
        ``allowed_references``.
    """
    return [
        Violation(
            column=expr_col.name,
            type=ViolationType.EXPRESSION_REFERENCE_MISSING,
            message=f"Expression column '{expr_col.name}' references missing column '{needed}'.",
            level=ViolationLevel.ERROR,
        )
        for expr_col in columns
        if expr_col.column_type == DataDesignerColumnType.EXPRESSION
        for needed in expr_col.required_columns
        if needed not in allowed_references
    ]
309
+
310
+
311
def validate_local_only_columns(
    columns: list[ColumnConfigT],
) -> list[Violation]:
    """Flag columns that rely on features only available when running locally.

    Currently this covers validation columns whose ``validator_type`` is
    ``ValidatorType.LOCAL_CALLABLE``.

    Args:
        columns: All column configurations.

    Returns:
        One ``LOCAL_ONLY_COLUMN`` error per offending validation column.
    """
    return [
        Violation(
            column=col.name,
            type=ViolationType.LOCAL_ONLY_COLUMN,
            message="Validation using functions are only supported when running Data Designer locally",
            level=ViolationLevel.ERROR,
        )
        for col in columns
        # Local validation columns
        if col.column_type == DataDesignerColumnType.VALIDATION
        and col.validator_type == ValidatorType.LOCAL_CALLABLE
    ]
329
+
330
+
331
+ def _get_string_formatter_references(template: str, allowed_references: list[str]) -> list[str]:
332
+ return [
333
+ k[1].strip()
334
+ for k in Formatter().parse(template)
335
+ if len(k) > 1 and k[1] is not None and k[1].strip() in allowed_references
336
+ ]