data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0rc1.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
@@ -1,482 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import json
7
- import os
8
- from collections import OrderedDict
9
- from enum import Enum
10
- from functools import cached_property
11
- from typing import TYPE_CHECKING, Any
12
-
13
- from rich.console import Console, Group
14
- from rich.padding import Padding
15
- from rich.panel import Panel
16
- from rich.pretty import Pretty
17
- from rich.rule import Rule
18
- from rich.syntax import Syntax
19
- from rich.table import Table
20
- from rich.text import Text
21
-
22
- from data_designer.config.base import ConfigBase
23
- from data_designer.config.column_types import DataDesignerColumnType
24
- from data_designer.config.models import ModelConfig, ModelProvider
25
- from data_designer.config.sampler_params import SamplerType
26
- from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
27
- from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
28
- from data_designer.config.utils.errors import DatasetSampleDisplayError
29
- from data_designer.lazy_heavy_imports import np, pd
30
-
31
- if TYPE_CHECKING:
32
- import numpy as np
33
- import pandas as pd
34
-
35
- from data_designer.config.config_builder import DataDesignerConfigBuilder
36
- from data_designer.config.dataset_metadata import DatasetMetadata
37
-
38
-
39
- console = Console()
40
-
41
-
42
- def get_nvidia_api_key() -> str | None:
43
- return os.getenv(NVIDIA_API_KEY_ENV_VAR_NAME)
44
-
45
-
46
- def get_openai_api_key() -> str | None:
47
- return os.getenv(OPENAI_API_KEY_ENV_VAR_NAME)
48
-
49
-
50
- class ColorPalette(str, Enum):
51
- NVIDIA_GREEN = "#76b900"
52
- PURPLE = "#9525c6"
53
- YELLOW = "#f9c500"
54
- BLUE = "#0074df"
55
- RED = "#e52020"
56
- ORANGE = "#ef9100"
57
- MAGENTA = "#d2308e"
58
- TEAL = "#1dbba4"
59
-
60
-
61
- class WithRecordSamplerMixin:
62
- _display_cycle_index: int = 0
63
- dataset_metadata: DatasetMetadata | None
64
-
65
- @cached_property
66
- def _record_sampler_dataset(self) -> pd.DataFrame:
67
- if hasattr(self, "dataset") and self.dataset is not None and isinstance(self.dataset, pd.DataFrame):
68
- return self.dataset
69
- elif (
70
- hasattr(self, "load_dataset")
71
- and callable(self.load_dataset)
72
- and (dataset := self.load_dataset()) is not None
73
- and isinstance(dataset, pd.DataFrame)
74
- ):
75
- return dataset
76
- else:
77
- raise DatasetSampleDisplayError("No valid dataset found in results object.")
78
-
79
- def _has_processor_artifacts(self) -> bool:
80
- return hasattr(self, "processor_artifacts") and self.processor_artifacts is not None
81
-
82
- def display_sample_record(
83
- self,
84
- index: int | None = None,
85
- *,
86
- syntax_highlighting_theme: str = "dracula",
87
- background_color: str | None = None,
88
- processors_to_display: list[str] | None = None,
89
- hide_seed_columns: bool = False,
90
- ) -> None:
91
- """Display a sample record from the Data Designer dataset preview.
92
-
93
- Args:
94
- index: Index of the record to display. If None, the next record will be displayed.
95
- This is useful for running the cell in a notebook multiple times.
96
- syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
97
- documentation from `rich` for information about available themes.
98
- background_color: Background color to use for the record. See the `Syntax`
99
- documentation from `rich` for information about available background colors.
100
- processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
101
- hide_seed_columns: If True, seed columns will not be displayed separately.
102
- """
103
- i = index or self._display_cycle_index
104
-
105
- try:
106
- record = self._record_sampler_dataset.iloc[i]
107
- num_records = len(self._record_sampler_dataset)
108
- except IndexError:
109
- raise DatasetSampleDisplayError(f"Index {i} is out of bounds for dataset of length {num_records}.")
110
-
111
- processor_data_to_display = None
112
- if self._has_processor_artifacts() and len(self.processor_artifacts) > 0:
113
- if processors_to_display is None:
114
- processors_to_display = list(self.processor_artifacts.keys())
115
-
116
- if len(processors_to_display) > 0:
117
- processor_data_to_display = {}
118
- for processor in processors_to_display:
119
- if (
120
- isinstance(self.processor_artifacts[processor], list)
121
- and len(self.processor_artifacts[processor]) == num_records
122
- ):
123
- processor_data_to_display[processor] = self.processor_artifacts[processor][i]
124
- else:
125
- processor_data_to_display[processor] = self.processor_artifacts[processor]
126
-
127
- seed_column_names = (
128
- None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
129
- )
130
-
131
- display_sample_record(
132
- record=record,
133
- processor_data_to_display=processor_data_to_display,
134
- config_builder=self._config_builder,
135
- background_color=background_color,
136
- syntax_highlighting_theme=syntax_highlighting_theme,
137
- record_index=i,
138
- seed_column_names=seed_column_names,
139
- )
140
- if index is None:
141
- self._display_cycle_index = (self._display_cycle_index + 1) % num_records
142
-
143
-
144
- def create_rich_histogram_table(
145
- data: dict[str, int | float],
146
- column_names: tuple[int, int],
147
- name_style: str = ColorPalette.BLUE.value,
148
- value_style: str = ColorPalette.TEAL.value,
149
- title: str | None = None,
150
- **kwargs,
151
- ) -> Table:
152
- table = Table(title=title, **kwargs)
153
- table.add_column(column_names[0], justify="right", style=name_style)
154
- table.add_column(column_names[1], justify="left", style=value_style)
155
-
156
- max_count = max(data.values())
157
- for name, value in data.items():
158
- bar = "" if max_count <= 0 else "█" * int((value / max_count) * 20)
159
- table.add_row(str(name), f"{bar} {value:.1f}")
160
-
161
- return table
162
-
163
-
164
- def display_sample_record(
165
- record: dict | pd.Series | pd.DataFrame,
166
- config_builder: DataDesignerConfigBuilder,
167
- processor_data_to_display: dict[str, list[str] | str] | None = None,
168
- background_color: str | None = None,
169
- syntax_highlighting_theme: str = "dracula",
170
- record_index: int | None = None,
171
- seed_column_names: list[str] | None = None,
172
- ):
173
- if isinstance(record, (dict, pd.Series)):
174
- record = pd.DataFrame([record]).iloc[0]
175
- elif isinstance(record, pd.DataFrame):
176
- if record.shape[0] > 1:
177
- raise DatasetSampleDisplayError(
178
- f"The record must be a single record. You provided a DataFrame with {record.shape[0]} records."
179
- )
180
- record = record.iloc[0]
181
- else:
182
- raise DatasetSampleDisplayError(
183
- "The record must be a single record in a dictionary, pandas Series, "
184
- f"or pandas DataFrame. You provided: {type(record)}."
185
- )
186
-
187
- render_list = []
188
- table_kws = dict(show_lines=True, expand=True)
189
-
190
- # Display seed columns if seed_column_names is provided and not empty
191
- if seed_column_names:
192
- table = Table(title="Seed Columns", **table_kws)
193
- table.add_column("Name")
194
- table.add_column("Value")
195
- for col_name in seed_column_names:
196
- if col_name in record.index:
197
- table.add_row(col_name, convert_to_row_element(record[col_name]))
198
- render_list.append(pad_console_element(table))
199
-
200
- non_code_columns = (
201
- config_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)
202
- + config_builder.get_columns_of_type(DataDesignerColumnType.EXPRESSION)
203
- + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_TEXT)
204
- + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_STRUCTURED)
205
- + config_builder.get_columns_of_type(DataDesignerColumnType.EMBEDDING)
206
- )
207
- if len(non_code_columns) > 0:
208
- table = Table(title="Generated Columns", **table_kws)
209
- table.add_column("Name")
210
- table.add_column("Value")
211
- for col in non_code_columns:
212
- if not col.drop:
213
- if col.column_type == DataDesignerColumnType.EMBEDDING:
214
- record[col.name]["embeddings"] = [
215
- get_truncated_list_as_string(embd) for embd in record[col.name].get("embeddings")
216
- ]
217
- table.add_row(col.name, convert_to_row_element(record[col.name]))
218
- render_list.append(pad_console_element(table))
219
-
220
- for col in config_builder.get_columns_of_type(DataDesignerColumnType.LLM_CODE):
221
- panel = Panel(
222
- Syntax(
223
- record[col.name],
224
- lexer=code_lang_to_syntax_lexer(col.code_lang),
225
- theme=syntax_highlighting_theme,
226
- word_wrap=True,
227
- background_color=background_color,
228
- ),
229
- title=col.name,
230
- expand=True,
231
- )
232
- render_list.append(pad_console_element(panel))
233
-
234
- validation_columns = config_builder.get_columns_of_type(DataDesignerColumnType.VALIDATION)
235
- if len(validation_columns) > 0:
236
- table = Table(title="Validation", **table_kws)
237
- table.add_column("Name")
238
- table.add_column("Value", ratio=1)
239
- for col in validation_columns:
240
- if not col.drop:
241
- # Add is_valid before other fields
242
- if "is_valid" in record[col.name]:
243
- value_to_display = {"is_valid": record[col.name].get("is_valid")} | record[col.name]
244
- else: # if columns treated separately
245
- value_to_display = {}
246
- for col_name, validation_output in record[col.name].items():
247
- value_to_display[col_name] = {
248
- "is_valid": validation_output.get("is_valid", None)
249
- } | validation_output
250
-
251
- table.add_row(col.name, convert_to_row_element(value_to_display))
252
- render_list.append(pad_console_element(table, (1, 0, 1, 0)))
253
-
254
- llm_judge_columns = config_builder.get_columns_of_type(DataDesignerColumnType.LLM_JUDGE)
255
- if len(llm_judge_columns) > 0:
256
- for col in llm_judge_columns:
257
- if col.drop:
258
- continue
259
- table = Table(title=f"LLM-as-a-Judge: {col.name}", **table_kws)
260
- row = []
261
- judge = record[col.name]
262
-
263
- for measure, results in judge.items():
264
- table.add_column(measure)
265
- row.append(f"score: {results['score']}\nreasoning: {results['reasoning']}")
266
- table.add_row(*row)
267
- render_list.append(pad_console_element(table, (1, 0, 1, 0)))
268
-
269
- if processor_data_to_display and len(processor_data_to_display) > 0:
270
- for processor_name, processor_data in processor_data_to_display.items():
271
- table = Table(title=f"Processor Outputs: {processor_name}", **table_kws)
272
- table.add_column("Name")
273
- table.add_column("Value")
274
- for col, value in processor_data.items():
275
- table.add_row(col, convert_to_row_element(value))
276
- render_list.append(pad_console_element(table, (1, 0, 1, 0)))
277
-
278
- if record_index is not None:
279
- index_label = Text(f"[index: {record_index}]", justify="center")
280
- render_list.append(index_label)
281
-
282
- console.print(Group(*render_list), markup=False)
283
-
284
-
285
- def get_truncated_list_as_string(long_list: list[Any], max_items: int = 2) -> str:
286
- if max_items <= 0:
287
- raise ValueError("max_items must be greater than 0")
288
- if len(long_list) > max_items:
289
- truncated_part = long_list[:max_items]
290
- return f"[{', '.join(str(x) for x in truncated_part)}, ...]"
291
- else:
292
- return str(long_list)
293
-
294
-
295
- def display_sampler_table(
296
- sampler_params: dict[SamplerType, ConfigBase],
297
- title: str | None = None,
298
- ) -> None:
299
- table = Table(expand=True)
300
- table.add_column("Type")
301
- table.add_column("Parameter")
302
- table.add_column("Data Type")
303
- table.add_column("Required", justify="center")
304
- table.add_column("Constraints")
305
-
306
- for sampler_type, params in sampler_params.items():
307
- num = 0
308
- schema = params.model_json_schema()
309
- for param_name, field_info in schema["properties"].items():
310
- is_required = param_name in schema.get("required", [])
311
- table.add_row(
312
- sampler_type if num == 0 else "",
313
- param_name,
314
- _get_field_type(field_info),
315
- "✓" if is_required else "",
316
- _get_field_constraints(field_info, schema),
317
- )
318
- num += 1
319
- table.add_section()
320
-
321
- title = title or "NeMo Data Designer Samplers"
322
-
323
- group = Group(Rule(title, end="\n\n"), table)
324
- console.print(group)
325
-
326
-
327
- def display_model_configs_table(model_configs: list[ModelConfig]) -> None:
328
- table_model_configs = Table(expand=True)
329
- table_model_configs.add_column("Alias")
330
- table_model_configs.add_column("Model")
331
- table_model_configs.add_column("Provider")
332
- table_model_configs.add_column("Inference Parameters")
333
- for model_config in model_configs:
334
- params_display = model_config.inference_parameters.format_for_display()
335
-
336
- table_model_configs.add_row(
337
- model_config.alias,
338
- model_config.model,
339
- model_config.provider,
340
- params_display,
341
- )
342
- group_args: list = [Rule(title="Model Configs"), table_model_configs]
343
- if len(model_configs) == 0:
344
- subtitle = Text(
345
- "‼️ No model configs found. Please provide at least one model config to the config builder",
346
- style="dim",
347
- justify="center",
348
- )
349
- group_args.insert(1, subtitle)
350
- group = Group(*group_args)
351
- console.print(group)
352
-
353
-
354
- def display_model_providers_table(model_providers: list[ModelProvider]) -> None:
355
- table_model_providers = Table(expand=True)
356
- table_model_providers.add_column("Name")
357
- table_model_providers.add_column("Endpoint")
358
- table_model_providers.add_column("API Key")
359
- for model_provider in model_providers:
360
- api_key = model_provider.api_key
361
- if model_provider.api_key == OPENAI_API_KEY_ENV_VAR_NAME:
362
- if get_openai_api_key() is not None:
363
- api_key = mask_api_key(get_openai_api_key())
364
- else:
365
- api_key = f"* {OPENAI_API_KEY_ENV_VAR_NAME!r} not set in environment variables * "
366
- elif model_provider.api_key == NVIDIA_API_KEY_ENV_VAR_NAME:
367
- if get_nvidia_api_key() is not None:
368
- api_key = mask_api_key(get_nvidia_api_key())
369
- else:
370
- api_key = f"* {NVIDIA_API_KEY_ENV_VAR_NAME!r} not set in environment variables *"
371
- else:
372
- api_key = mask_api_key(model_provider.api_key)
373
- table_model_providers.add_row(model_provider.name, model_provider.endpoint, api_key)
374
- group = Group(Rule(title="Model Providers"), table_model_providers)
375
- console.print(group)
376
-
377
-
378
- def mask_api_key(api_key: str | None) -> str:
379
- """Mask API keys for display.
380
-
381
- Environment variable names (all uppercase) are kept visible.
382
- Actual API keys are masked to show only the last 4 characters.
383
-
384
- Args:
385
- api_key: The API key to mask.
386
-
387
- Returns:
388
- Masked API key string or "(not set)" if None.
389
- """
390
- if not api_key:
391
- return "(not set)"
392
-
393
- # Keep environment variable names visible
394
- if api_key.isupper():
395
- return api_key
396
-
397
- # Mask actual API keys
398
- return "***" + api_key[-4:] if len(api_key) > 4 else "***"
399
-
400
-
401
- def convert_to_row_element(elem):
402
- try:
403
- elem = Pretty(json.loads(elem))
404
- except (TypeError, json.JSONDecodeError):
405
- pass
406
- if isinstance(elem, (np.integer, np.floating, np.ndarray)):
407
- elem = str(elem)
408
- elif isinstance(elem, (list, dict)):
409
- elem = Pretty(elem)
410
- return elem
411
-
412
-
413
- def pad_console_element(elem, padding=(1, 0, 1, 0)):
414
- return Padding(elem, padding)
415
-
416
-
417
- def _get_field_type(field: dict) -> str:
418
- """Extract human-readable type information from a JSON Schema field."""
419
-
420
- # single type
421
- if "type" in field:
422
- if field["type"] == "array":
423
- return " | ".join([f"{f.strip()}[]" for f in _get_field_type(field["items"]).split("|")])
424
- if field["type"] == "object":
425
- return "dict"
426
- return field["type"]
427
-
428
- # union type
429
- elif "anyOf" in field:
430
- types = []
431
- for f in field["anyOf"]:
432
- if "$ref" in f:
433
- types.append("enum")
434
- elif f.get("type") == "array":
435
- if "items" in f and "$ref" in f["items"]:
436
- types.append("enum[]")
437
- else:
438
- types.append(f"{f['items']['type']}[]")
439
- else:
440
- types.append(f.get("type", ""))
441
- return " | ".join(t for t in types if t)
442
-
443
- return ""
444
-
445
-
446
- def _get_field_constraints(field: dict, schema: dict) -> str:
447
- """Extract human-readable constraints from a JSON Schema field."""
448
- constraints = []
449
-
450
- # numeric constraints
451
- if "minimum" in field:
452
- constraints.append(f">= {field['minimum']}")
453
- if "exclusiveMinimum" in field:
454
- constraints.append(f"> {field['exclusiveMinimum']}")
455
- if "maximum" in field:
456
- constraints.append(f"<= {field['maximum']}")
457
- if "exclusiveMaximum" in field:
458
- constraints.append(f"< {field['exclusiveMaximum']}")
459
-
460
- # string constraints
461
- if "minLength" in field:
462
- constraints.append(f"len > {field['minLength']}")
463
- if "maxLength" in field:
464
- constraints.append(f"len < {field['maxLength']}")
465
-
466
- # array constraints
467
- if "minItems" in field:
468
- constraints.append(f"len > {field['minItems']}")
469
- if "maxItems" in field:
470
- constraints.append(f"len < {field['maxItems']}")
471
-
472
- # enum constraints
473
- if "enum" in _get_field_type(field) and "$defs" in schema:
474
- enum_values = []
475
- for defs in schema["$defs"].values():
476
- if "enum" in defs:
477
- enum_values.extend(defs["enum"])
478
- if len(enum_values) > 0:
479
- enum_values = OrderedDict.fromkeys(enum_values)
480
- constraints.append(f"allowed: {', '.join(enum_values.keys())}")
481
-
482
- return ", ".join(constraints)
@@ -1,94 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from enum import Enum
7
- from typing import Any
8
-
9
- from pydantic import Field, field_serializer, model_validator
10
- from typing_extensions import Self, TypeAlias
11
-
12
- from data_designer.config.base import ConfigBase
13
- from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
14
-
15
- SUPPORTED_CODE_LANGUAGES = {CodeLang.PYTHON, *SQL_DIALECTS}
16
-
17
-
18
- class ValidatorType(str, Enum):
19
- CODE = "code"
20
- LOCAL_CALLABLE = "local_callable"
21
- REMOTE = "remote"
22
-
23
-
24
- class CodeValidatorParams(ConfigBase):
25
- """Configuration for code validation. Supports Python and SQL code validation.
26
-
27
- Attributes:
28
- code_lang: The language of the code to validate. Supported values include: `python`,
29
- `sql:sqlite`, `sql:postgres`, `sql:mysql`, `sql:tsql`, `sql:bigquery`, `sql:ansi`.
30
- """
31
-
32
- code_lang: CodeLang = Field(description="The language of the code to validate")
33
-
34
- @model_validator(mode="after")
35
- def validate_code_lang(self) -> Self:
36
- if self.code_lang not in SUPPORTED_CODE_LANGUAGES:
37
- raise ValueError(
38
- f"Unsupported code language, supported languages are: {[lang.value for lang in SUPPORTED_CODE_LANGUAGES]}"
39
- )
40
- return self
41
-
42
-
43
- class LocalCallableValidatorParams(ConfigBase):
44
- """Configuration for local callable validation. Expects a function to be passed that validates the data.
45
-
46
- Attributes:
47
- validation_function: Function (`Callable[[pd.DataFrame], pd.DataFrame]`) to validate the
48
- data. Output must contain a column `is_valid` of type `bool`.
49
- output_schema: The JSON schema for the local callable validator's output. If not provided,
50
- the output will not be validated.
51
- """
52
-
53
- validation_function: Any = Field(
54
- description="Function (Callable[[pd.DataFrame], pd.DataFrame]) to validate the data"
55
- )
56
- output_schema: dict[str, Any] | None = Field(
57
- default=None, description="Expected schema for local callable validator's output"
58
- )
59
-
60
- @field_serializer("validation_function")
61
- def serialize_validation_function(self, v: Any) -> Any:
62
- return v.__name__
63
-
64
- @model_validator(mode="after")
65
- def validate_validation_function(self) -> Self:
66
- if not callable(self.validation_function):
67
- raise ValueError("Validation function must be a callable")
68
- return self
69
-
70
-
71
- class RemoteValidatorParams(ConfigBase):
72
- """Configuration for remote validation. Sends data to a remote endpoint for validation.
73
-
74
- Attributes:
75
- endpoint_url: The URL of the remote endpoint.
76
- output_schema: The JSON schema for the remote validator's output. If not provided,
77
- the output will not be validated.
78
- timeout: The timeout for the HTTP request in seconds. Defaults to 30.0.
79
- max_retries: The maximum number of retry attempts. Defaults to 3.
80
- retry_backoff: The backoff factor for the retry delay in seconds. Defaults to 2.0.
81
- max_parallel_requests: The maximum number of parallel requests to make. Defaults to 4.
82
- """
83
-
84
- endpoint_url: str = Field(description="URL of the remote endpoint")
85
- output_schema: dict[str, Any] | None = Field(
86
- default=None, description="Expected schema for remote validator's output"
87
- )
88
- timeout: float = Field(default=30.0, gt=0, description="The timeout for the HTTP request")
89
- max_retries: int = Field(default=3, ge=0, description="The maximum number of retry attempts")
90
- retry_backoff: float = Field(default=2.0, gt=1, description="The backoff factor for the retry delay")
91
- max_parallel_requests: int = Field(default=4, ge=1, description="The maximum number of parallel requests to make")
92
-
93
-
94
- ValidatorParamsT: TypeAlias = CodeValidatorParams | LocalCallableValidatorParams | RemoteValidatorParams
@@ -1,2 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
@@ -1,49 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import logging
7
- from abc import ABC, abstractmethod
8
- from typing import TYPE_CHECKING
9
-
10
- from pydantic import BaseModel, model_validator
11
- from typing_extensions import Self
12
-
13
- from data_designer.config.base import ConfigBase
14
- from data_designer.config.column_configs import SingleColumnConfig
15
- from data_designer.config.column_types import DataDesignerColumnType
16
- from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
17
- from data_designer.lazy_heavy_imports import pd
18
-
19
- if TYPE_CHECKING:
20
- import pandas as pd
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
- class ColumnConfigWithDataFrame(ConfigBase):
26
- column_config: SingleColumnConfig
27
- df: pd.DataFrame
28
-
29
- @model_validator(mode="after")
30
- def validate_column_exists(self) -> Self:
31
- if self.column_config.name not in self.df.columns:
32
- raise ValueError(f"Column {self.column_config.name!r} not found in DataFrame")
33
- return self
34
-
35
- def as_tuple(self) -> tuple[SingleColumnConfig, pd.DataFrame]:
36
- return (self.column_config, self.df)
37
-
38
-
39
- class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
40
- @staticmethod
41
- @abstractmethod
42
- def get_applicable_column_types() -> list[DataDesignerColumnType]:
43
- """Returns a list of column types that this profiler can be applied to during dataset profiling."""
44
-
45
- @abstractmethod
46
- def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel: ...
47
-
48
- def _initialize(self) -> None:
49
- logger.info(f"💫 Initializing column profiler: '{self.name}'")