data-designer 0.1.0 (py3-none-any.whl)

This diff shows the contents of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in the public registry.
Files changed (177)
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
data_designer/cli/utils.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+def validate_url(url: str) -> bool:
+    """Validate that a string is a valid URL.
+
+    Args:
+        url: URL string to validate
+
+    Returns:
+        True if valid URL, False otherwise
+    """
+    if not url:
+        return False
+
+    # Basic validation - must start with http:// or https://
+    if not url.startswith(("http://", "https://")):
+        return False
+
+    # Must have at least a domain after the protocol
+    parts = url.split("://", 1)
+    if len(parts) != 2 or not parts[1]:
+        return False
+
+    return True
+
+
+def validate_numeric_range(value: str, min_value: float, max_value: float) -> tuple[bool, float | None]:
+    """Validate that a string is a valid number within a range.
+
+    Args:
+        value: String to validate and convert
+        min_value: Minimum allowed value (inclusive)
+        max_value: Maximum allowed value (inclusive)
+
+    Returns:
+        Tuple of (is_valid, parsed_value)
+        If invalid, parsed_value is None
+    """
+    try:
+        num = float(value)
+        if min_value <= num <= max_value:
+            return True, num
+        return False, None
+    except ValueError:
+        return False, None
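
A minimal usage sketch of the two helpers above; the data_designer.cli.utils import path is inferred from the file listing and is an assumption, not shown in the diff itself.

# Sketch only: exercises validate_url and validate_numeric_range as defined above.
from data_designer.cli.utils import validate_url, validate_numeric_range

print(validate_url("https://example.com"))   # True: has a scheme and a domain
print(validate_url("example.com"))           # False: missing http:// or https://

is_valid, parsed = validate_numeric_range("0.7", 0.0, 1.0)
print(is_valid, parsed)                      # True 0.7
is_valid, parsed = validate_numeric_range("5", 0.0, 1.0)
print(is_valid, parsed)                      # False None (outside the range)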
data_designer/config/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
data_designer/config/analysis/column_profilers.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC
+from enum import Enum
+from typing import Optional, Union
+
+from pydantic import BaseModel, Field
+from rich.panel import Panel
+from rich.table import Column, Table
+from typing_extensions import TypeAlias
+
+from ..base import ConfigBase
+from ..utils.visualization import ColorPalette
+from .column_statistics import (
+    CategoricalDistribution,
+    CategoricalHistogramData,
+    ColumnDistributionType,
+    MissingValue,
+    NumericalDistribution,
+)
+from .utils.reporting import TITLE_STYLE, create_judge_score_summary_table
+
+
+class ColumnProfilerType(str, Enum):
+    JUDGE_SCORE = "judge-score"
+
+
+class ColumnProfilerResults(BaseModel, ABC):
+    def create_report_section(self) -> Panel:
+        return Panel(
+            f"Report section generation not implemented for '{self.__class__.__name__}'.",
+            title="Not Implemented",
+            border_style=f"bold {ColorPalette.YELLOW.value}",
+            padding=(1, 2),
+        )
+
+
+class JudgeScoreProfilerConfig(ConfigBase):
+    model_alias: str
+    summary_score_sample_size: Optional[int] = Field(default=20, ge=1)
+
+
+class JudgeScoreSample(BaseModel):
+    score: Union[int, str]
+    reasoning: str
+
+
+class JudgeScoreDistributions(BaseModel):
+    scores: dict[str, list[Union[int, str]]]
+    reasoning: dict[str, list[str]]
+    distribution_types: dict[str, ColumnDistributionType]
+    distributions: dict[str, Union[CategoricalDistribution, NumericalDistribution, MissingValue]]
+    histograms: dict[str, Union[CategoricalHistogramData, MissingValue]]
+
+
+class JudgeScoreSummary(BaseModel):
+    score_name: str
+    summary: str
+    score_samples: list[JudgeScoreSample]
+
+
+class JudgeScoreProfilerResults(ColumnProfilerResults):
+    column_name: str
+    summaries: dict[str, JudgeScoreSummary]
+    score_distributions: Union[JudgeScoreDistributions, MissingValue]
+
+    def create_report_section(self) -> Panel:
+        layout = Table.grid(Column(), expand=True, padding=(2, 0))
+
+        for score_name in self.summaries.keys():
+            layout.add_row(
+                create_judge_score_summary_table(
+                    score_name=score_name,
+                    histogram=self.score_distributions.histograms[score_name],
+                    summary=self.summaries[score_name].summary,
+                )
+            )
+
+        return Panel(
+            layout,
+            title=f"[{TITLE_STYLE}]LLM-as-a-Judge Score Profile: '{self.column_name}'[/{TITLE_STYLE}]",
+            padding=(1, 2),
+        )
+
+
+ColumnProfilerConfigT: TypeAlias = JudgeScoreProfilerConfig
+
+ColumnProfilerResultsT: TypeAlias = JudgeScoreProfilerResults
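
A minimal sketch of how the judge-score profiler config above can be constructed, assuming the import path implied by the file listing and assuming ConfigBase is a Pydantic model (not shown in this excerpt).

# Sketch only: the ge=1 constraint on summary_score_sample_size is enforced by Pydantic.
from pydantic import ValidationError
from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig

config = JudgeScoreProfilerConfig(model_alias="judge-model")
print(config.summary_score_sample_size)  # 20 by default

try:
    JudgeScoreProfilerConfig(model_alias="judge-model", summary_score_sample_size=0)
except ValidationError:
    print("summary_score_sample_size must be >= 1")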
data_designer/config/analysis/column_statistics.py
@@ -0,0 +1,274 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Any, Literal, Optional, Union
+
+from pandas import Series
+from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator
+from typing_extensions import Self, TypeAlias
+
+from ...plugin_manager import PluginManager
+from ..column_types import DataDesignerColumnType
+from ..sampler_params import SamplerType
+from ..utils.constants import EPSILON
+from ..utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting
+
+
+class MissingValue(str, Enum):
+    CALCULATION_FAILED = "--"
+    OUTPUT_FORMAT_ERROR = "output_format_error"
+
+
+class ColumnDistributionType(str, Enum):
+    CATEGORICAL = "categorical"
+    NUMERICAL = "numerical"
+    TEXT = "text"
+    OTHER = "other"
+    UNKNOWN = "unknown"
+
+
+class BaseColumnStatistics(BaseModel, ABC):
+    model_config = ConfigDict(use_enum_values=True)
+
+    @abstractmethod
+    def create_report_row_data(self) -> dict[str, str]: ...
+
+
+class GeneralColumnStatistics(BaseColumnStatistics):
+    column_name: str
+    num_records: Union[int, MissingValue]
+    num_null: Union[int, MissingValue]
+    num_unique: Union[int, MissingValue]
+    pyarrow_dtype: str
+    simple_dtype: str
+    column_type: Literal["general"] = "general"
+
+    @field_validator("num_null", "num_unique", "num_records", mode="before")
+    def general_statistics_ensure_python_integers(cls, v: Union[int, MissingValue]) -> Union[int, MissingValue]:
+        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
+
+    @property
+    def percent_null(self) -> Union[float, MissingValue]:
+        return (
+            self.num_null
+            if self._is_missing_value(self.num_null)
+            else prepare_number_for_reporting(100 * self.num_null / (self.num_records + EPSILON), float)
+        )
+
+    @property
+    def percent_unique(self) -> Union[float, MissingValue]:
+        return (
+            self.num_unique
+            if self._is_missing_value(self.num_unique)
+            else prepare_number_for_reporting(100 * self.num_unique / (self.num_records + EPSILON), float)
+        )
+
+    @property
+    def _general_display_row(self) -> dict[str, str]:
+        pct_unique_str = f" ({self.percent_unique:.1f}%)" if not self._is_missing_value(self.percent_unique) else ""
+        return {
+            "column name": self.column_name,
+            "data type": self.simple_dtype,
+            "number unique values": f"{self.num_unique}{pct_unique_str}",
+        }
+
+    def create_report_row_data(self) -> dict[str, str]:
+        return self._general_display_row
+
+    def _is_missing_value(self, v: Union[float, int, MissingValue]) -> bool:
+        return v in set(MissingValue)
+
+
+class LLMTextColumnStatistics(GeneralColumnStatistics):
+    completion_tokens_mean: Union[float, MissingValue]
+    completion_tokens_median: Union[float, MissingValue]
+    completion_tokens_stddev: Union[float, MissingValue]
+    prompt_tokens_mean: Union[float, MissingValue]
+    prompt_tokens_median: Union[float, MissingValue]
+    prompt_tokens_stddev: Union[float, MissingValue]
+    column_type: Literal[DataDesignerColumnType.LLM_TEXT.value] = DataDesignerColumnType.LLM_TEXT.value
+
+    @field_validator(
+        "completion_tokens_mean",
+        "completion_tokens_median",
+        "completion_tokens_stddev",
+        "prompt_tokens_mean",
+        "prompt_tokens_median",
+        "prompt_tokens_stddev",
+        mode="before",
+    )
+    def llm_column_ensure_python_floats(cls, v: Union[float, int, MissingValue]) -> Union[float, int, MissingValue]:
+        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, float)
+
+    def create_report_row_data(self) -> dict[str, Any]:
+        prompt_tokens_str = (
+            f"{self.prompt_tokens_median:.1f} +/- {self.prompt_tokens_stddev:.1f}"
+            if not self._is_missing_value(self.prompt_tokens_median)
+            else "--"
+        )
+        completion_tokens_str = (
+            f"{self.completion_tokens_median:.1f} +/- {self.completion_tokens_stddev:.1f}"
+            if not self._is_missing_value(self.completion_tokens_median)
+            else "--"
+        )
+        return {
+            **self._general_display_row,
+            "prompt tokens\nper record": prompt_tokens_str,
+            "completion tokens\nper record": completion_tokens_str,
+        }
+
+
+class LLMCodeColumnStatistics(LLMTextColumnStatistics):
+    column_type: Literal[DataDesignerColumnType.LLM_CODE.value] = DataDesignerColumnType.LLM_CODE.value
+
+
+class LLMStructuredColumnStatistics(LLMTextColumnStatistics):
+    column_type: Literal[DataDesignerColumnType.LLM_STRUCTURED.value] = DataDesignerColumnType.LLM_STRUCTURED.value
+
+
+class LLMJudgedColumnStatistics(LLMTextColumnStatistics):
+    column_type: Literal[DataDesignerColumnType.LLM_JUDGE.value] = DataDesignerColumnType.LLM_JUDGE.value
+
+
+class SamplerColumnStatistics(GeneralColumnStatistics):
+    sampler_type: SamplerType
+    distribution_type: ColumnDistributionType
+    distribution: Optional[Union[CategoricalDistribution, NumericalDistribution, MissingValue]]
+    column_type: Literal[DataDesignerColumnType.SAMPLER.value] = DataDesignerColumnType.SAMPLER.value
+
+    def create_report_row_data(self) -> dict[str, str]:
+        return {
+            **self._general_display_row,
+            "sampler type": self.sampler_type,
+        }
+
+
+class SeedDatasetColumnStatistics(GeneralColumnStatistics):
+    column_type: Literal[DataDesignerColumnType.SEED_DATASET.value] = DataDesignerColumnType.SEED_DATASET.value
+
+
+class ExpressionColumnStatistics(GeneralColumnStatistics):
+    column_type: Literal[DataDesignerColumnType.EXPRESSION.value] = DataDesignerColumnType.EXPRESSION.value
+
+
+class ValidationColumnStatistics(GeneralColumnStatistics):
+    num_valid_records: Union[int, MissingValue]
+    column_type: Literal[DataDesignerColumnType.VALIDATION.value] = DataDesignerColumnType.VALIDATION.value
+
+    @field_validator("num_valid_records", mode="before")
+    def code_validation_column_ensure_python_integers(cls, v: Union[int, MissingValue]) -> Union[int, MissingValue]:
+        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
+
+    @property
+    def percent_valid(self) -> Union[float, MissingValue]:
+        return (
+            self.num_valid_records
+            if self._is_missing_value(self.num_valid_records)
+            else prepare_number_for_reporting(100 * self.num_valid_records / (self.num_records + EPSILON), float)
+        )
+
+    def create_report_row_data(self) -> dict[str, str]:
+        percent_valid_str = f"{self.percent_valid:.1f}%" if not self._is_missing_value(self.percent_valid) else "--"
+        return {**self._general_display_row, "percent valid": percent_valid_str}
+
+
+class CategoricalHistogramData(BaseModel):
+    categories: list[Union[float, int, str]]
+    counts: list[int]
+
+    @model_validator(mode="after")
+    def ensure_python_types(self) -> Self:
+        """Ensure numerical values are Python objects rather than Numpy types."""
+        self.categories = [(float(x) if is_float(x) else (int(x) if is_int(x) else str(x))) for x in self.categories]
+        self.counts = [int(i) for i in self.counts]
+        return self
+
+    @classmethod
+    def from_series(cls, series: Series) -> Self:
+        counts = series.value_counts()
+        return cls(categories=counts.index.tolist(), counts=counts.tolist())
+
+
+class CategoricalDistribution(BaseModel):
+    most_common_value: Union[str, int]
+    least_common_value: Union[str, int]
+    histogram: CategoricalHistogramData
+
+    @field_validator("most_common_value", "least_common_value", mode="before")
+    def ensure_python_types(cls, v: Union[str, int]) -> Union[str, int]:
+        return str(v) if not is_int(v) else prepare_number_for_reporting(v, int)
+
+    @classmethod
+    def from_series(cls, series: Series) -> Self:
+        counts = series.value_counts()
+        return cls(
+            most_common_value=counts.index[0],
+            least_common_value=counts.index[-1],
+            histogram=CategoricalHistogramData.from_series(series),
+        )
+
+
+class NumericalDistribution(BaseModel):
+    min: Union[float, int]
+    max: Union[float, int]
+    mean: float
+    stddev: float
+    median: float
+
+    @field_validator("min", "max", "mean", "stddev", "median", mode="before")
+    def ensure_python_types(cls, v: Union[float, int]) -> Union[float, int]:
+        return prepare_number_for_reporting(v, int if is_int(v) else float)
+
+    @classmethod
+    def from_series(cls, series: Series) -> Self:
+        return cls(
+            min=series.min(skipna=True),
+            max=series.max(skipna=True),
+            mean=series.mean(skipna=True),
+            stddev=series.std(skipna=True),
+            median=series.median(skipna=True),
+        )
+
+
+ColumnStatisticsT: TypeAlias = Union[
+    GeneralColumnStatistics,
+    LLMTextColumnStatistics,
+    LLMCodeColumnStatistics,
+    LLMStructuredColumnStatistics,
+    LLMJudgedColumnStatistics,
+    SamplerColumnStatistics,
+    SeedDatasetColumnStatistics,
+    ValidationColumnStatistics,
+    ExpressionColumnStatistics,
+]
+
+
+DEFAULT_COLUMN_STATISTICS_MAP = {
+    DataDesignerColumnType.EXPRESSION: ExpressionColumnStatistics,
+    DataDesignerColumnType.LLM_CODE: LLMCodeColumnStatistics,
+    DataDesignerColumnType.LLM_JUDGE: LLMJudgedColumnStatistics,
+    DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnStatistics,
+    DataDesignerColumnType.LLM_TEXT: LLMTextColumnStatistics,
+    DataDesignerColumnType.SAMPLER: SamplerColumnStatistics,
+    DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnStatistics,
+    DataDesignerColumnType.VALIDATION: ValidationColumnStatistics,
+}
+
+for plugin in PluginManager().get_column_generator_plugins():
+    # Dynamically create a statistics class for this plugin using Pydantic's create_model
+    plugin_stats_cls_name = f"{plugin.config_type_as_class_name}ColumnStatistics"
+
+    # Create the class with proper Pydantic field
+    plugin_stats_cls = create_model(
+        plugin_stats_cls_name,
+        __base__=GeneralColumnStatistics,
+        column_type=(Literal[plugin.name], plugin.name),
+    )
+
+    # Add the plugin statistics class to the union
+    ColumnStatisticsT |= plugin_stats_cls
+    DEFAULT_COLUMN_STATISTICS_MAP[DataDesignerColumnType(plugin.name)] = plugin_stats_cls
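
A minimal sketch of the distribution helpers defined above, built from plain pandas Series; the import path is assumed from the file listing.

# Sketch only: from_series builds the distribution models shown in this hunk.
import pandas as pd
from data_designer.config.analysis.column_statistics import (
    CategoricalDistribution,
    NumericalDistribution,
)

categorical = CategoricalDistribution.from_series(pd.Series(["a", "b", "a", "a", "c"]))
print(categorical.most_common_value)                                    # "a"
print(categorical.histogram.categories, categorical.histogram.counts)   # category/count pairs

numerical = NumericalDistribution.from_series(pd.Series([1.0, 2.0, 3.0, 4.0]))
print(numerical.mean, numerical.median, numerical.stddev)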
data_designer/config/analysis/dataset_profiler.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import cached_property
+from pathlib import Path
+from typing import Annotated, Optional, Union
+
+from pydantic import BaseModel, Field, field_validator
+
+from ..column_types import DataDesignerColumnType, get_column_display_order
+from ..utils.constants import EPSILON
+from ..utils.numerical_helpers import prepare_number_for_reporting
+from .column_profilers import ColumnProfilerResultsT
+from .column_statistics import ColumnStatisticsT
+from .utils.reporting import ReportSection, generate_analysis_report
+
+
+class DatasetProfilerResults(BaseModel):
+    num_records: int
+    target_num_records: int
+    column_statistics: list[Annotated[ColumnStatisticsT, Field(discriminator="column_type")]] = Field(..., min_length=1)
+    side_effect_column_names: Optional[list[str]] = None
+    column_profiles: Optional[list[ColumnProfilerResultsT]] = None
+
+    @field_validator("num_records", "target_num_records", mode="before")
+    def ensure_python_integers(cls, v: int) -> int:
+        return prepare_number_for_reporting(v, int)
+
+    @property
+    def percent_complete(self) -> float:
+        return 100 * self.num_records / (self.target_num_records + EPSILON)
+
+    @cached_property
+    def column_types(self) -> list[str]:
+        display_order = get_column_display_order()
+        return sorted(
+            list(set([c.column_type for c in self.column_statistics])),
+            key=lambda x: display_order.index(x) if x in display_order else len(display_order),
+        )
+
+    def get_column_statistics_by_type(self, column_type: DataDesignerColumnType) -> list[ColumnStatisticsT]:
+        return [c for c in self.column_statistics if c.column_type == column_type]
+
+    def to_report(
+        self,
+        save_path: Optional[Union[str, Path]] = None,
+        include_sections: Optional[list[Union[ReportSection, DataDesignerColumnType]]] = None,
+    ) -> None:
+        """Generate and print an analysis report based on the dataset profiling results.
+
+        Args:
+            save_path: Optional path to save the report. If provided, the report will be saved
+                as either HTML (.html) or SVG (.svg) format. If None, the report will
+                only be displayed in the console.
+            include_sections: Optional list of sections to include in the report. Choices are
+                any DataDesignerColumnType, "overview" (the dataset overview section),
+                and "column_profilers" (all column profilers in one section). If None,
+                all sections will be included.
+        """
+        generate_analysis_report(self, save_path, include_sections=include_sections)
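
A minimal sketch of assembling a DatasetProfilerResults from the statistics models above and rendering a report; import paths are assumed from the file listing, and the field values are illustrative.

# Sketch only: one general column plus overall record counts, then to_report().
from data_designer.config.analysis.column_statistics import GeneralColumnStatistics
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults

stats = GeneralColumnStatistics(
    column_name="age",
    num_records=1000,
    num_null=10,
    num_unique=42,
    pyarrow_dtype="int64",
    simple_dtype="int",
)
results = DatasetProfilerResults(
    num_records=1000,
    target_num_records=1000,
    column_statistics=[stats],
)
print(f"{results.percent_complete:.1f}% of the target records were generated")
results.to_report()  # prints the rich report; pass save_path="profile.html" to also save it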
data_designer/config/analysis/utils/errors.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from ...errors import DataDesignerError
+
+
+class AnalysisReportError(DataDesignerError):
+    """Base exception for analysis report errors."""
data_designer/config/analysis/utils/reporting.py
@@ -0,0 +1,188 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional, Union
+
+from rich.align import Align
+from rich.console import Console, Group
+from rich.panel import Panel
+from rich.rule import Rule
+from rich.table import Column, Table
+from rich.text import Text
+
+from ...analysis.column_statistics import CategoricalHistogramData
+from ...column_types import COLUMN_TYPE_EMOJI_MAP, DataDesignerColumnType, get_column_display_order
+from ...utils.visualization import (
+    ColorPalette,
+    convert_to_row_element,
+    create_rich_histogram_table,
+    pad_console_element,
+)
+from .errors import AnalysisReportError
+
+if TYPE_CHECKING:
+    from ...analysis.dataset_profiler import DatasetProfilerResults
+
+HEADER_STYLE = "dim"
+RULE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"
+ACCENT_STYLE = f"bold {ColorPalette.BLUE.value}"
+TITLE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"
+HIST_NAME_STYLE = f"bold {ColorPalette.BLUE.value}"
+HIST_VALUE_STYLE = f"dim {ColorPalette.BLUE.value}"
+
+
+class ReportSection(str, Enum):
+    OVERVIEW = "overview"
+    COLUMN_PROFILERS = "column_profilers"
+
+
+DEFAULT_INCLUDE_SECTIONS = [
+    ReportSection.OVERVIEW,
+    ReportSection.COLUMN_PROFILERS,
+] + get_column_display_order()
+
+
+def generate_analysis_report(
+    analysis: DatasetProfilerResults,
+    save_path: Optional[Union[str, Path]] = None,
+    include_sections: Optional[list[Union[ReportSection, DataDesignerColumnType]]] = None,
+) -> None:
+    """Generate an analysis report for dataset profiling results.
+
+    This function creates a rich-formatted report that displays dataset overview statistics
+    and detailed column statistics organized by column type. The report includes visual
+    elements like tables, rules, and panels to present the analysis results in an
+    easy-to-read format.
+
+    Args:
+        analysis: The DatasetProfilerResults object containing the analysis data to report on.
+        save_path: Optional path to save the report. If provided, the report will be saved
+            as either HTML (.html) or SVG (.svg) format. If None, the report will
+            only be displayed in the console.
+        include_sections: Optional list of sections to include in the report. Choices are
+            any Data Designer column type, "overview" (the dataset overview section),
+            and "column_profilers" (all column profilers in one section). If None,
+            all sections will be included.
+
+    Raises:
+        AnalysisReportError: If save_path is provided but doesn't have a .html or .svg extension.
+    """
+    render_list = []
+    table_kws = dict(show_lines=True, expand=True, title_style=TITLE_STYLE)
+    include_sections = include_sections or DEFAULT_INCLUDE_SECTIONS
+
+    title = Rule(title="🎨 Data Designer Dataset Profile", style=RULE_STYLE, end="\n\n")
+
+    render_list.append(title)
+
+    if ReportSection.OVERVIEW in include_sections:
+        table = Table(title="Dataset Overview", **table_kws)
+        table.add_column("number of records", header_style=HEADER_STYLE)
+        table.add_column("number of columns", header_style=HEADER_STYLE)
+        table.add_column("percent complete records", header_style=HEADER_STYLE)
+
+        table.add_row(
+            f"{analysis.num_records:,}",
+            f"{len(analysis.column_statistics):,}",
+            f"{analysis.percent_complete:.1f}%",
+        )
+
+        render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+
+    displayed_column_types = set()
+    for column_type in analysis.column_types:
+        if column_type not in include_sections:
+            continue
+
+        displayed_column_types.add(column_type)
+        column_label = column_type.replace("_", " ").title().replace("Llm", "LLM")
+        table = Table(
+            title=f"{COLUMN_TYPE_EMOJI_MAP[column_type]} {column_label} Columns",
+            **table_kws,
+        )
+
+        column_stats_list = analysis.get_column_statistics_by_type(column_type)
+        for col in list(column_stats_list[0].create_report_row_data()):
+            if col == "column name":
+                table.add_column(col, header_style=HEADER_STYLE)
+            else:
+                table.add_column(col, justify="right", header_style=HEADER_STYLE)
+
+        for stats in column_stats_list:
+            table.add_row(*[convert_to_row_element(elem) for elem in stats.create_report_row_data().values()])
+
+        render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+
+    if ReportSection.COLUMN_PROFILERS in include_sections:
+        for profile in analysis.column_profiles or []:
+            render_list.append(pad_console_element(profile.create_report_section()))
+
+    if any("llm" in col_type for col_type in displayed_column_types):
+        footnotes_text = (
+            "1. All token statistics are based on a sample of max(1000, len(dataset)) records.\n"
+            "2. Tokens are calculated using tiktoken's cl100k_base tokenizer."
+        )
+
+        render_list.append(
+            pad_console_element(
+                Panel(
+                    Text.from_markup(footnotes_text.strip()),
+                    title="Table Notes",
+                    border_style="dim",
+                    padding=(1, 2),
+                )
+            )
+        )
+
+    render_list.append(Rule(style=RULE_STYLE))
+
+    console = Console(record=save_path is not None)
+    console.print(Group(*render_list), markup=False)
+
+    if save_path is not None:
+        save_path = str(save_path)
+        if save_path.endswith(".html"):
+            console.save_html(save_path)
+        elif save_path.endswith(".svg"):
+            console.save_svg(save_path, title="")
+        else:
+            raise AnalysisReportError(
+                f"🛑 The extension of the save path must be either .html or .svg. You provided {save_path}."
+            )
+
+
+def create_judge_score_summary_table(
+    score_name: str,
+    histogram: CategoricalHistogramData,
+    summary: str,
+    accent_style: str = ACCENT_STYLE,
+    summary_border_style: str = "dim",
+) -> Table:
+    layout = Table.grid(Column(), Column(), expand=True, padding=(0, 2))
+
+    histogram_table = create_rich_histogram_table(
+        {str(s): c for s, c in zip(histogram.categories, histogram.counts)},
+        ("score", "count"),
+        name_style=HIST_NAME_STYLE,
+        value_style=HIST_VALUE_STYLE,
+    )
+
+    summary_panel = Panel(
+        Text(summary, justify="left"),
+        title=(
+            f"Score Summary: [not {summary_border_style}][{accent_style}]"
+            f"{score_name.upper()}[/{accent_style}][/not {summary_border_style}]"
+        ),
+        border_style=summary_border_style,
+    )
+
+    layout.add_row(
+        Align(summary_panel, vertical="top"),
+        Align(histogram_table, vertical="top"),
+    )
+
+    return layout
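
A minimal sketch of rendering the judge-score summary table defined above with rich; import paths are assumed from the file listing, and the histogram values are illustrative.

# Sketch only: builds a CategoricalHistogramData and prints the two-panel summary.
from rich.console import Console
from data_designer.config.analysis.column_statistics import CategoricalHistogramData
from data_designer.config.analysis.utils.reporting import create_judge_score_summary_table

histogram = CategoricalHistogramData(categories=[1, 2, 3, 4, 5], counts=[2, 5, 14, 30, 9])
summary_table = create_judge_score_summary_table(
    score_name="relevance",
    histogram=histogram,
    summary="Most responses score 4 or higher; low scores are rare.",
)
Console().print(summary_table)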