data-designer 0.3.8rc2-py3-none-any.whl → 0.4.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0rc1.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
data_designer/config/analysis/column_statistics.py +0 -421
@@ -1,421 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from abc import ABC, abstractmethod
- from enum import Enum
- from typing import TYPE_CHECKING, Any, Literal
-
- from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator
- from typing_extensions import Self, TypeAlias
-
- from data_designer.config.column_types import DataDesignerColumnType
- from data_designer.config.sampler_params import SamplerType
- from data_designer.config.utils.constants import EPSILON
- from data_designer.config.utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting
- from data_designer.lazy_heavy_imports import pd
- from data_designer.plugin_manager import PluginManager
-
- if TYPE_CHECKING:
-     import pandas as pd
-
-
- class MissingValue(str, Enum):
-     CALCULATION_FAILED = "--"
-     OUTPUT_FORMAT_ERROR = "output_format_error"
-
-
- class ColumnDistributionType(str, Enum):
-     CATEGORICAL = "categorical"
-     NUMERICAL = "numerical"
-     TEXT = "text"
-     OTHER = "other"
-     UNKNOWN = "unknown"
-
-
- class BaseColumnStatistics(BaseModel, ABC):
-     """Abstract base class for all column statistics types.
-
-     Serves as a container for computed statistics across different column types in
-     Data-Designer-generated datasets. Subclasses hold column-specific statistical results
-     and provide methods for formatting these results for display in reports.
-     """
-
-     model_config = ConfigDict(use_enum_values=True)
-
-     @abstractmethod
-     def create_report_row_data(self) -> dict[str, str]:
-         """Creates a formatted dictionary of statistics for display in reports.
-
-         Returns:
-             Dictionary mapping display labels to formatted statistic values.
-         """
-         ...
-
-
- class GeneralColumnStatistics(BaseColumnStatistics):
-     """Container for general statistics applicable to all column types.
-
-     Holds core statistical measures that apply universally across all column types,
-     including null counts, unique values, and data type information. Serves as the base
-     for more specialized column statistics classes that store additional column-specific metrics.
-
-     Attributes:
-         column_name: Name of the column being analyzed.
-         num_records: Total number of records in the column.
-         num_null: Number of null/missing values in the column.
-         num_unique: Number of distinct values in the column. If a value is not hashable, it is converted to a string.
-         pyarrow_dtype: PyArrow data type of the column as a string.
-         simple_dtype: Simplified human-readable data type label.
-         column_type: Discriminator field, always "general" for this statistics type.
-     """
-
-     column_name: str
-     num_records: int | MissingValue
-     num_null: int | MissingValue
-     num_unique: int | MissingValue
-     pyarrow_dtype: str
-     simple_dtype: str
-     column_type: Literal["general"] = "general"
-
-     @field_validator("num_null", "num_unique", "num_records", mode="before")
-     def general_statistics_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
-         return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
-
-     @property
-     def percent_null(self) -> float | MissingValue:
-         return (
-             self.num_null
-             if self._is_missing_value(self.num_null)
-             else prepare_number_for_reporting(100 * self.num_null / (self.num_records + EPSILON), float)
-         )
-
-     @property
-     def percent_unique(self) -> float | MissingValue:
-         return (
-             self.num_unique
-             if self._is_missing_value(self.num_unique)
-             else prepare_number_for_reporting(100 * self.num_unique / (self.num_records + EPSILON), float)
-         )
-
-     @property
-     def _general_display_row(self) -> dict[str, str]:
-         pct_unique_str = f" ({self.percent_unique:.1f}%)" if not self._is_missing_value(self.percent_unique) else ""
-         return {
-             "column name": self.column_name,
-             "data type": self.simple_dtype,
-             "number unique values": f"{self.num_unique}{pct_unique_str}",
-         }
-
-     def create_report_row_data(self) -> dict[str, str]:
-         return self._general_display_row
-
-     def _is_missing_value(self, v: float | int | MissingValue) -> bool:
-         return v in set(MissingValue)
-
-
- class LLMTextColumnStatistics(GeneralColumnStatistics):
-     """Container for statistics on LLM-generated text columns.
-
-     Inherits general statistics plus token usage metrics specific to LLM text generation.
-     Stores both prompt and completion token consumption data.
-
-     Attributes:
-         output_tokens_mean: Mean number of output tokens generated per record.
-         output_tokens_median: Median number of output tokens generated per record.
-         output_tokens_stddev: Standard deviation of output tokens per record.
-         input_tokens_mean: Mean number of input tokens used per record.
-         input_tokens_median: Median number of input tokens used per record.
-         input_tokens_stddev: Standard deviation of input tokens per record.
-         column_type: Discriminator field, always "llm-text" for this statistics type.
-     """
-
-     output_tokens_mean: float | MissingValue
-     output_tokens_median: float | MissingValue
-     output_tokens_stddev: float | MissingValue
-     input_tokens_mean: float | MissingValue
-     input_tokens_median: float | MissingValue
-     input_tokens_stddev: float | MissingValue
-     column_type: Literal[DataDesignerColumnType.LLM_TEXT.value] = DataDesignerColumnType.LLM_TEXT.value
-
-     @field_validator(
-         "output_tokens_mean",
-         "output_tokens_median",
-         "output_tokens_stddev",
-         "input_tokens_mean",
-         "input_tokens_median",
-         "input_tokens_stddev",
-         mode="before",
-     )
-     def llm_column_ensure_python_floats(cls, v: float | int | MissingValue) -> float | int | MissingValue:
-         return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, float)
-
-     def create_report_row_data(self) -> dict[str, Any]:
-         prompt_tokens_str = (
-             f"{self.input_tokens_median:.1f} +/- {self.input_tokens_stddev:.1f}"
-             if not self._is_missing_value(self.input_tokens_median)
-             else "--"
-         )
-         completion_tokens_str = (
-             f"{self.output_tokens_median:.1f} +/- {self.output_tokens_stddev:.1f}"
-             if not self._is_missing_value(self.output_tokens_median)
-             else "--"
-         )
-         return {
-             **self._general_display_row,
-             "prompt tokens\nper record": prompt_tokens_str,
-             "completion tokens\nper record": completion_tokens_str,
-         }
-
-
- class LLMCodeColumnStatistics(LLMTextColumnStatistics):
-     """Container for statistics on LLM-generated code columns.
-
-     Inherits all token usage metrics from LLMTextColumnStatistics. Stores
-     statistics from columns that generate code snippets in specific programming languages.
-
-     Attributes:
-         column_type: Discriminator field, always "llm-code" for this statistics type.
-     """
-
-     column_type: Literal[DataDesignerColumnType.LLM_CODE.value] = DataDesignerColumnType.LLM_CODE.value
-
-
- class LLMStructuredColumnStatistics(LLMTextColumnStatistics):
-     """Container for statistics on LLM-generated structured JSON columns.
-
-     Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from
-     columns that generate structured data conforming to JSON schemas or Pydantic models.
-
-     Attributes:
-         column_type: Discriminator field, always "llm-structured" for this statistics type.
-     """
-
-     column_type: Literal[DataDesignerColumnType.LLM_STRUCTURED.value] = DataDesignerColumnType.LLM_STRUCTURED.value
-
-
- class LLMJudgedColumnStatistics(LLMTextColumnStatistics):
-     """Container for statistics on LLM-as-a-judge quality assessment columns.
-
-     Inherits all token usage metrics from LLMTextColumnStatistics. Stores statistics from
-     columns that evaluate and score other generated content based on defined criteria.
-
-     Attributes:
-         column_type: Discriminator field, always "llm-judge" for this statistics type.
-     """
-
-     column_type: Literal[DataDesignerColumnType.LLM_JUDGE.value] = DataDesignerColumnType.LLM_JUDGE.value
-
-
- class SamplerColumnStatistics(GeneralColumnStatistics):
-     """Container for statistics on sampler-generated columns.
-
-     Inherits general statistics plus sampler-specific information including the sampler type
-     used and the empirical distribution of generated values. Stores both categorical and
-     numerical distribution results.
-
-     Attributes:
-         sampler_type: Type of sampler used to generate this column (e.g., "uniform", "category",
-             "gaussian", "person").
-         distribution_type: Classification of the column's distribution (categorical, numerical,
-             text, other, or unknown).
-         distribution: Empirical distribution statistics for the generated values. Can be
-             CategoricalDistribution (for discrete values), NumericalDistribution (for continuous
-             values), or MissingValue if distribution could not be computed.
-         column_type: Discriminator field, always "sampler" for this statistics type.
-     """
-
-     sampler_type: SamplerType
-     distribution_type: ColumnDistributionType
-     distribution: CategoricalDistribution | NumericalDistribution | MissingValue | None
-     column_type: Literal[DataDesignerColumnType.SAMPLER.value] = DataDesignerColumnType.SAMPLER.value
-
-     def create_report_row_data(self) -> dict[str, str]:
-         return {
-             **self._general_display_row,
-             "sampler type": self.sampler_type,
-         }
-
-
- class SeedDatasetColumnStatistics(GeneralColumnStatistics):
-     """Container for statistics on columns sourced from seed datasets.
-
-     Inherits general statistics and stores statistics computed from columns that originate
-     from existing data provided via the seed dataset functionality.
-
-     Attributes:
-         column_type: Discriminator field, always "seed-dataset" for this statistics type.
-     """
-
-     column_type: Literal[DataDesignerColumnType.SEED_DATASET.value] = DataDesignerColumnType.SEED_DATASET.value
-
-
- class ExpressionColumnStatistics(GeneralColumnStatistics):
-     """Container for statistics on expression-based derived columns.
-
-     Inherits general statistics and stores statistics computed from columns that are derived
-     from Jinja2 expressions referencing other column values.
-
-     Attributes:
-         column_type: Discriminator field, always "expression" for this statistics type.
-     """
-
-     column_type: Literal[DataDesignerColumnType.EXPRESSION.value] = DataDesignerColumnType.EXPRESSION.value
-
-
- class ValidationColumnStatistics(GeneralColumnStatistics):
-     """Container for statistics on validation result columns.
-
-     Inherits general statistics plus validation-specific metrics including the count and
-     percentage of records that passed validation. Stores results from validation logic
-     (Python, SQL, or remote) executed against target columns.
-
-     Attributes:
-         num_valid_records: Number of records that passed validation.
-         column_type: Discriminator field, always "validation" for this statistics type.
-     """
-
-     num_valid_records: int | MissingValue
-     column_type: Literal[DataDesignerColumnType.VALIDATION.value] = DataDesignerColumnType.VALIDATION.value
-
-     @field_validator("num_valid_records", mode="before")
-     def code_validation_column_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
-         return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)
-
-     @property
-     def percent_valid(self) -> float | MissingValue:
-         return (
-             self.num_valid_records
-             if self._is_missing_value(self.num_valid_records)
-             else prepare_number_for_reporting(100 * self.num_valid_records / (self.num_records + EPSILON), float)
-         )
-
-     def create_report_row_data(self) -> dict[str, str]:
-         percent_valid_str = f"{self.percent_valid:.1f}%" if not self._is_missing_value(self.percent_valid) else "--"
-         return {**self._general_display_row, "percent valid": percent_valid_str}
-
-
- class CategoricalHistogramData(BaseModel):
-     """Container for categorical distribution histogram data.
-
-     Stores the computed frequency distribution of categorical values.
-
-     Attributes:
-         categories: List of unique category values that appear in the data.
-         counts: List of occurrence counts for each category.
-     """
-
-     categories: list[float | int | str]
-     counts: list[int]
-
-     @model_validator(mode="after")
-     def ensure_python_types(self) -> Self:
-         """Ensure numerical values are Python objects rather than Numpy types."""
-         self.categories = [(float(x) if is_float(x) else (int(x) if is_int(x) else str(x))) for x in self.categories]
-         self.counts = [int(i) for i in self.counts]
-         return self
-
-     @classmethod
-     def from_series(cls, series: pd.Series) -> Self:
-         counts = series.value_counts()
-         return cls(categories=counts.index.tolist(), counts=counts.tolist())
-
-
- class CategoricalDistribution(BaseModel):
-     """Container for computed categorical distribution statistics.
-
-     Attributes:
-         most_common_value: The category value that appears most frequently in the data.
-         least_common_value: The category value that appears least frequently in the data.
-         histogram: Complete frequency distribution showing all categories and their counts.
-     """
-
-     most_common_value: str | int
-     least_common_value: str | int
-     histogram: CategoricalHistogramData
-
-     @field_validator("most_common_value", "least_common_value", mode="before")
-     def ensure_python_types(cls, v: str | int) -> str | int:
-         return str(v) if not is_int(v) else prepare_number_for_reporting(v, int)
-
-     @classmethod
-     def from_series(cls, series: pd.Series) -> Self:
-         counts = series.value_counts()
-         return cls(
-             most_common_value=counts.index[0],
-             least_common_value=counts.index[-1],
-             histogram=CategoricalHistogramData.from_series(series),
-         )
-
-
- class NumericalDistribution(BaseModel):
-     """Container for computed numerical distribution statistics.
-
-     Attributes:
-         min: Minimum value in the distribution.
-         max: Maximum value in the distribution.
-         mean: Arithmetic mean (average) of all values.
-         stddev: Standard deviation measuring the spread of values around the mean.
-         median: Median value of the distribution.
-     """
-
-     min: float | int
-     max: float | int
-     mean: float
-     stddev: float
-     median: float
-
-     @field_validator("min", "max", "mean", "stddev", "median", mode="before")
-     def ensure_python_types(cls, v: float | int) -> float | int:
-         return prepare_number_for_reporting(v, int if is_int(v) else float)
-
-     @classmethod
-     def from_series(cls, series: pd.Series) -> Self:
-         return cls(
-             min=series.min(skipna=True),
-             max=series.max(skipna=True),
-             mean=series.mean(skipna=True),
-             stddev=series.std(skipna=True),
-             median=series.median(skipna=True),
-         )
-
-
- ColumnStatisticsT: TypeAlias = (
-     GeneralColumnStatistics
-     | LLMTextColumnStatistics
-     | LLMCodeColumnStatistics
-     | LLMStructuredColumnStatistics
-     | LLMJudgedColumnStatistics
-     | SamplerColumnStatistics
-     | SeedDatasetColumnStatistics
-     | ValidationColumnStatistics
-     | ExpressionColumnStatistics
- )
-
-
- DEFAULT_COLUMN_STATISTICS_MAP = {
-     DataDesignerColumnType.EXPRESSION: ExpressionColumnStatistics,
-     DataDesignerColumnType.LLM_CODE: LLMCodeColumnStatistics,
-     DataDesignerColumnType.LLM_JUDGE: LLMJudgedColumnStatistics,
-     DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnStatistics,
-     DataDesignerColumnType.LLM_TEXT: LLMTextColumnStatistics,
-     DataDesignerColumnType.SAMPLER: SamplerColumnStatistics,
-     DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnStatistics,
-     DataDesignerColumnType.VALIDATION: ValidationColumnStatistics,
- }
-
- for plugin in PluginManager().get_column_generator_plugins():
-     # Dynamically create a statistics class for this plugin using Pydantic's create_model
-     plugin_stats_cls_name = f"{plugin.config_type_as_class_name}ColumnStatistics"
-
-     # Create the class with proper Pydantic field
-     plugin_stats_cls = create_model(
-         plugin_stats_cls_name,
-         __base__=GeneralColumnStatistics,
-         column_type=(Literal[plugin.name], plugin.name),
-     )
-
-     # Add the plugin statistics class to the union
-     ColumnStatisticsT |= plugin_stats_cls
-     DEFAULT_COLUMN_STATISTICS_MAP[DataDesignerColumnType(plugin.name)] = plugin_stats_cls
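
Because this module is deleted outright in 0.4.0rc1, the short sketch below shows what downstream code that imported these containers typically looked like under 0.3.8rc2. The values are hypothetical and the imports no longer resolve after the upgrade; this is an illustration of the removed API, not migration guidance.

from data_designer.config.analysis.column_statistics import GeneralColumnStatistics

# Hypothetical values: profiling code populated these fields for each column.
stats = GeneralColumnStatistics(
    column_name="age",
    num_records=1000,
    num_null=12,
    num_unique=57,
    pyarrow_dtype="int64",
    simple_dtype="integer",
)

print(stats.percent_null)              # roughly 1.2 (EPSILON guards the division)
print(stats.create_report_row_data())  # {"column name": "age", "data type": "integer", ...}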
data_designer/config/analysis/dataset_profiler.py +0 -84
@@ -1,84 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from functools import cached_property
- from pathlib import Path
- from typing import TYPE_CHECKING, Annotated
-
- from pydantic import BaseModel, Field, field_validator
-
- from data_designer.config.analysis.column_profilers import ColumnProfilerResultsT
- from data_designer.config.analysis.column_statistics import ColumnStatisticsT
- from data_designer.config.analysis.utils.reporting import generate_analysis_report
- from data_designer.config.column_types import get_column_display_order
- from data_designer.config.utils.constants import EPSILON
- from data_designer.config.utils.numerical_helpers import prepare_number_for_reporting
-
- if TYPE_CHECKING:
-     from data_designer.config.analysis.utils.reporting import ReportSection
-     from data_designer.config.column_types import DataDesignerColumnType
-
-
- class DatasetProfilerResults(BaseModel):
-     """Container for complete dataset profiling and analysis results.
-
-     Stores profiling results for a generated dataset, including statistics for all columns,
-     dataset-level metadata, and optional advanced profiler results. Provides methods for
-     computing derived metrics and generating formatted reports.
-
-     Attributes:
-         num_records: Actual number of records successfully generated in the dataset.
-         target_num_records: Target number of records that were requested to be generated.
-         column_statistics: List of statistics objects for all columns in the dataset. Each
-             column has statistics appropriate to its type. Must contain at least one column.
-         side_effect_column_names: Column names that were generated as side effects of other columns.
-         column_profiles: Column profiler results for specific columns when configured.
-     """
-
-     num_records: int
-     target_num_records: int
-     column_statistics: list[Annotated[ColumnStatisticsT, Field(discriminator="column_type")]] = Field(..., min_length=1)
-     side_effect_column_names: list[str] | None = None
-     column_profiles: list[ColumnProfilerResultsT] | None = None
-
-     @field_validator("num_records", "target_num_records", mode="before")
-     def ensure_python_integers(cls, v: int) -> int:
-         return prepare_number_for_reporting(v, int)
-
-     @property
-     def percent_complete(self) -> float:
-         """Returns the completion percentage of the dataset."""
-         return 100 * self.num_records / (self.target_num_records + EPSILON)
-
-     @cached_property
-     def column_types(self) -> list[str]:
-         """Returns a sorted list of unique column types present in the dataset."""
-         display_order = get_column_display_order()
-         return sorted(
-             list(set([c.column_type for c in self.column_statistics])),
-             key=lambda x: display_order.index(x) if x in display_order else len(display_order),
-         )
-
-     def get_column_statistics_by_type(self, column_type: DataDesignerColumnType) -> list[ColumnStatisticsT]:
-         """Filters column statistics to return only those of the specified type."""
-         return [c for c in self.column_statistics if c.column_type == column_type]
-
-     def to_report(
-         self,
-         save_path: str | Path | None = None,
-         include_sections: list[ReportSection | DataDesignerColumnType] | None = None,
-     ) -> None:
-         """Generate and print an analysis report based on the dataset profiling results.
-
-         Args:
-             save_path: Optional path to save the report. If provided, the report will be saved
-                 as either HTML (.html) or SVG (.svg) format. If None, the report will
-                 only be displayed in the console.
-             include_sections: Optional list of sections to include in the report. Choices are
-                 any DataDesignerColumnType, "overview" (the dataset overview section),
-                 and "column_profilers" (all column profilers in one section). If None,
-                 all sections will be included.
-         """
-         generate_analysis_report(self, save_path, include_sections=include_sections)
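
The deleted DatasetProfilerResults class was the top-level object that aggregated the per-column statistics above and rendered the analysis report. A minimal sketch under the same assumption (0.3.8rc2 installed, hypothetical values; in practice the dataset profiler constructed this object for you):

from data_designer.config.analysis.column_statistics import GeneralColumnStatistics
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults

# Hypothetical values for illustration only.
results = DatasetProfilerResults(
    num_records=950,
    target_num_records=1000,
    column_statistics=[
        GeneralColumnStatistics(
            column_name="age",
            num_records=950,
            num_null=3,
            num_unique=42,
            pyarrow_dtype="int64",
            simple_dtype="integer",
        ),
    ],
)

print(f"{results.percent_complete:.1f}% complete")   # 95.0% complete
results.to_report(save_path="analysis_report.html")  # prints the report and saves it as HTML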
data_designer/config/analysis/utils/errors.py +0 -10
@@ -1,10 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.errors import DataDesignerError
-
-
- class AnalysisReportError(DataDesignerError):
-     """Base exception for analysis report errors."""