data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +1 -7
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
- data_designer-0.4.0rc1.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -129
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -51
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc2.dist-info/RECORD +0 -196
- data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
|
@@ -1,421 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from abc import ABC, abstractmethod
|
|
7
|
-
from enum import Enum
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Literal
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator
|
|
11
|
-
from typing_extensions import Self, TypeAlias
|
|
12
|
-
|
|
13
|
-
from data_designer.config.column_types import DataDesignerColumnType
|
|
14
|
-
from data_designer.config.sampler_params import SamplerType
|
|
15
|
-
from data_designer.config.utils.constants import EPSILON
|
|
16
|
-
from data_designer.config.utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting
|
|
17
|
-
from data_designer.lazy_heavy_imports import pd
|
|
18
|
-
from data_designer.plugin_manager import PluginManager
|
|
19
|
-
|
|
20
|
-
if TYPE_CHECKING:
|
|
21
|
-
import pandas as pd
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class MissingValue(str, Enum):
    """Sentinel strings stored in place of a statistic that has no usable value.

    Members are ``str`` instances, so each one compares equal to its raw string value.
    """

    # Rendered verbatim in reports where a number would otherwise appear.
    CALCULATION_FAILED = "--"
    OUTPUT_FORMAT_ERROR = "output_format_error"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ColumnDistributionType(str, Enum):
    """Labels describing how a column's values are distributed.

    Members are ``str`` instances, so each one compares equal to its raw string value.
    """

    CATEGORICAL = "categorical"
    NUMERICAL = "numerical"
    TEXT = "text"
    OTHER = "other"
    UNKNOWN = "unknown"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class BaseColumnStatistics(BaseModel, ABC):
    """Common abstract parent of every column statistics model.

    Concrete subclasses carry the statistics computed for one column of a
    Data-Designer-generated dataset and know how to turn them into a report row.
    """

    # Enum-typed fields are stored as their raw values rather than as enum members.
    model_config = ConfigDict(use_enum_values=True)

    @abstractmethod
    def create_report_row_data(self) -> dict[str, str]:
        """Build the row shown for this column in a report.

        Returns:
            Mapping from display label to the formatted statistic value.
        """
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
class GeneralColumnStatistics(BaseColumnStatistics):
    """Container for general statistics applicable to all column types.

    Holds the measures that apply to every column regardless of how it was generated:
    record, null and unique counts plus data type information. More specialized
    statistics classes extend this one with column-specific metrics.

    Attributes:
        column_name: Name of the column being analyzed.
        num_records: Total number of records in the column.
        num_null: Number of null/missing values in the column.
        num_unique: Number of distinct values in the column. If a value is not hashable, it is converted to a string.
        pyarrow_dtype: PyArrow data type of the column as a string.
        simple_dtype: Simplified human-readable data type label.
        column_type: Discriminator field, always "general" for this statistics type.
    """

    column_name: str
    num_records: int | MissingValue
    num_null: int | MissingValue
    num_unique: int | MissingValue
    pyarrow_dtype: str
    simple_dtype: str
    column_type: Literal["general"] = "general"

    @field_validator("num_null", "num_unique", "num_records", mode="before")
    def general_statistics_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
        """Pass ``MissingValue`` members through; route anything else through the int reporting helper."""
        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)

    @property
    def percent_null(self) -> float | MissingValue:
        """Percentage of records that are null, or the missing marker if it cannot be computed."""
        return self._percent_of_records(self.num_null)

    @property
    def percent_unique(self) -> float | MissingValue:
        """Percentage of records that are unique, or the missing marker if it cannot be computed."""
        return self._percent_of_records(self.num_unique)

    @property
    def _general_display_row(self) -> dict[str, str]:
        """Report columns shared by every statistics type."""
        # Evaluate the property once; it is used for both the check and the formatting.
        pct_unique = self.percent_unique
        pct_unique_str = "" if self._is_missing_value(pct_unique) else f" ({pct_unique:.1f}%)"
        return {
            "column name": self.column_name,
            "data type": self.simple_dtype,
            "number unique values": f"{self.num_unique}{pct_unique_str}",
        }

    def create_report_row_data(self) -> dict[str, str]:
        """Return the general report row for this column."""
        return self._general_display_row

    def _is_missing_value(self, v: float | int | MissingValue) -> bool:
        """Return True when ``v`` equals one of the ``MissingValue`` members."""
        return v in set(MissingValue)

    def _percent_of_records(self, count: float | int | MissingValue) -> float | MissingValue:
        """Express ``count`` as a percentage of ``num_records``.

        Either operand may be a missing marker. The marker is returned unchanged in that
        case instead of being fed into the arithmetic, where a string denominator would
        raise ``TypeError``.
        """
        if self._is_missing_value(count):
            return count
        if self._is_missing_value(self.num_records):
            return self.num_records
        # EPSILON keeps the division defined when there are zero records.
        return prepare_number_for_reporting(100 * count / (self.num_records + EPSILON), float)
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
class LLMTextColumnStatistics(GeneralColumnStatistics):
    """Container for statistics on LLM-generated text columns.

    Inherits general statistics plus token usage metrics specific to LLM text generation.
    Stores both prompt and completion token consumption data.

    Attributes:
        output_tokens_mean: Mean number of output tokens generated per record.
        output_tokens_median: Median number of output tokens generated per record.
        output_tokens_stddev: Standard deviation of output tokens per record.
        input_tokens_mean: Mean number of input tokens used per record.
        input_tokens_median: Median number of input tokens used per record.
        input_tokens_stddev: Standard deviation of input tokens per record.
        column_type: Discriminator field, always "llm-text" for this statistics type.
    """

    output_tokens_mean: float | MissingValue
    output_tokens_median: float | MissingValue
    output_tokens_stddev: float | MissingValue
    input_tokens_mean: float | MissingValue
    input_tokens_median: float | MissingValue
    input_tokens_stddev: float | MissingValue
    column_type: Literal[DataDesignerColumnType.LLM_TEXT.value] = DataDesignerColumnType.LLM_TEXT.value

    @field_validator(
        "output_tokens_mean",
        "output_tokens_median",
        "output_tokens_stddev",
        "input_tokens_mean",
        "input_tokens_median",
        "input_tokens_stddev",
        mode="before",
    )
    def llm_column_ensure_python_floats(cls, v: float | int | MissingValue) -> float | int | MissingValue:
        """Pass ``MissingValue`` members through; route anything else through the float reporting helper."""
        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, float)

    def create_report_row_data(self) -> dict[str, Any]:
        """Return the general report row extended with prompt and completion token usage."""
        return {
            **self._general_display_row,
            "prompt tokens\nper record": self._format_token_usage(self.input_tokens_median, self.input_tokens_stddev),
            "completion tokens\nper record": self._format_token_usage(
                self.output_tokens_median, self.output_tokens_stddev
            ),
        }

    def _format_token_usage(self, median: float | MissingValue, stddev: float | MissingValue) -> str:
        """Render ``median +/- stddev`` with one decimal, or ``"--"`` when either part is missing.

        Both values are checked because a missing marker is a string, and formatting a
        string with ``:.1f`` raises ``ValueError``.
        """
        if self._is_missing_value(median) or self._is_missing_value(stddev):
            return "--"
        return f"{median:.1f} +/- {stddev:.1f}"
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
class LLMCodeColumnStatistics(LLMTextColumnStatistics):
    """Statistics for columns whose values are LLM-generated code.

    Adds nothing beyond the token usage metrics inherited from
    LLMTextColumnStatistics; only the discriminator differs.

    Attributes:
        column_type: Discriminator field, always "llm-code" for this statistics type.
    """

    column_type: Literal[DataDesignerColumnType.LLM_CODE.value] = DataDesignerColumnType.LLM_CODE.value
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
class LLMStructuredColumnStatistics(LLMTextColumnStatistics):
    """Statistics for columns whose values are LLM-generated structured JSON.

    Adds nothing beyond the token usage metrics inherited from
    LLMTextColumnStatistics; only the discriminator differs.

    Attributes:
        column_type: Discriminator field, always "llm-structured" for this statistics type.
    """

    column_type: Literal[DataDesignerColumnType.LLM_STRUCTURED.value] = DataDesignerColumnType.LLM_STRUCTURED.value
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
class LLMJudgedColumnStatistics(LLMTextColumnStatistics):
    """Statistics for LLM-as-a-judge quality assessment columns.

    Adds nothing beyond the token usage metrics inherited from
    LLMTextColumnStatistics; only the discriminator differs.

    Attributes:
        column_type: Discriminator field, always "llm-judge" for this statistics type.
    """

    column_type: Literal[DataDesignerColumnType.LLM_JUDGE.value] = DataDesignerColumnType.LLM_JUDGE.value
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
class SamplerColumnStatistics(GeneralColumnStatistics):
    """Statistics for columns produced by a sampler.

    Extends the general statistics with the sampler that produced the column and the
    empirical distribution of the generated values, which may be categorical or numerical.

    Attributes:
        sampler_type: Type of sampler used to generate this column (e.g., "uniform", "category",
            "gaussian", "person").
        distribution_type: Classification of the column's distribution (categorical, numerical,
            text, other, or unknown).
        distribution: Empirical distribution statistics for the generated values. Can be
            CategoricalDistribution (for discrete values), NumericalDistribution (for continuous
            values), or MissingValue if distribution could not be computed.
        column_type: Discriminator field, always "sampler" for this statistics type.
    """

    sampler_type: SamplerType
    distribution_type: ColumnDistributionType
    distribution: CategoricalDistribution | NumericalDistribution | MissingValue | None
    column_type: Literal[DataDesignerColumnType.SAMPLER.value] = DataDesignerColumnType.SAMPLER.value

    def create_report_row_data(self) -> dict[str, str]:
        """Return the general report row with the sampler type appended."""
        row = dict(self._general_display_row)
        row["sampler type"] = self.sampler_type
        return row
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
class SeedDatasetColumnStatistics(GeneralColumnStatistics):
    """Statistics for columns that come from a seed dataset.

    Carries only the inherited general statistics; the discriminator marks the column
    as originating from existing data supplied through the seed dataset functionality.

    Attributes:
        column_type: Discriminator field, always "seed-dataset" for this statistics type.
    """

    column_type: Literal[DataDesignerColumnType.SEED_DATASET.value] = DataDesignerColumnType.SEED_DATASET.value
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
class ExpressionColumnStatistics(GeneralColumnStatistics):
    """Statistics for expression-based derived columns.

    Carries only the inherited general statistics, computed for columns that are derived
    from Jinja2 expressions referencing other column values.

    Attributes:
        column_type: Discriminator field, always "expression" for this statistics type.
    """

    column_type: Literal[DataDesignerColumnType.EXPRESSION.value] = DataDesignerColumnType.EXPRESSION.value
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
class ValidationColumnStatistics(GeneralColumnStatistics):
    """Container for statistics on validation result columns.

    Inherits general statistics plus validation-specific metrics including the count and
    percentage of records that passed validation. Stores results from validation logic
    (Python, SQL, or remote) executed against target columns.

    Attributes:
        num_valid_records: Number of records that passed validation.
        column_type: Discriminator field, always "validation" for this statistics type.
    """

    num_valid_records: int | MissingValue
    column_type: Literal[DataDesignerColumnType.VALIDATION.value] = DataDesignerColumnType.VALIDATION.value

    @field_validator("num_valid_records", mode="before")
    def code_validation_column_ensure_python_integers(cls, v: int | MissingValue) -> int | MissingValue:
        """Pass ``MissingValue`` members through; route anything else through the int reporting helper."""
        return v if isinstance(v, MissingValue) else prepare_number_for_reporting(v, int)

    @property
    def percent_valid(self) -> float | MissingValue:
        """Percentage of records that passed validation, or the missing marker if it cannot be computed."""
        if self._is_missing_value(self.num_valid_records):
            return self.num_valid_records
        # num_records may itself be a missing marker (a string); adding EPSILON to it
        # would raise TypeError, so hand the marker back instead.
        if self._is_missing_value(self.num_records):
            return self.num_records
        # EPSILON keeps the division defined when there are zero records.
        return prepare_number_for_reporting(100 * self.num_valid_records / (self.num_records + EPSILON), float)

    def create_report_row_data(self) -> dict[str, str]:
        """Return the general report row with the percentage of valid records appended."""
        # Evaluate the property once; it is used for both the check and the formatting.
        percent_valid = self.percent_valid
        percent_valid_str = "--" if self._is_missing_value(percent_valid) else f"{percent_valid:.1f}%"
        return {**self._general_display_row, "percent valid": percent_valid_str}
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
class CategoricalHistogramData(BaseModel):
    """Frequency table for a categorical column.

    Attributes:
        categories: List of unique category values that appear in the data.
        counts: List of occurrence counts for each category.
    """

    categories: list[float | int | str]
    counts: list[int]

    @model_validator(mode="after")
    def ensure_python_types(self) -> Self:
        """Ensure numerical values are Python objects rather than Numpy types."""

        def to_builtin(value: float | int | str) -> float | int | str:
            # The float check runs first, then int; everything else becomes a string.
            if is_float(value):
                return float(value)
            if is_int(value):
                return int(value)
            return str(value)

        self.categories = [to_builtin(category) for category in self.categories]
        self.counts = [int(count) for count in self.counts]
        return self

    @classmethod
    def from_series(cls, series: pd.Series) -> Self:
        """Build the histogram from the ``value_counts()`` of ``series``."""
        frequencies = series.value_counts()
        return cls(categories=frequencies.index.tolist(), counts=frequencies.tolist())
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
class CategoricalDistribution(BaseModel):
    """Summary of a categorical column's empirical distribution.

    Attributes:
        most_common_value: The category value that appears most frequently in the data.
        least_common_value: The category value that appears least frequently in the data.
        histogram: Complete frequency distribution showing all categories and their counts.
    """

    most_common_value: str | int
    least_common_value: str | int
    histogram: CategoricalHistogramData

    @field_validator("most_common_value", "least_common_value", mode="before")
    def ensure_python_types(cls, v: str | int) -> str | int:
        """Keep integer-like values as integers via the reporting helper; stringify everything else."""
        if is_int(v):
            return prepare_number_for_reporting(v, int)
        return str(v)

    @classmethod
    def from_series(cls, series: pd.Series) -> Self:
        """Build the distribution from ``series``.

        The first and last index entries of ``series.value_counts()`` are taken as the
        most and least common values respectively.
        """
        frequencies = series.value_counts()
        top_value = frequencies.index[0]
        bottom_value = frequencies.index[-1]
        return cls(
            most_common_value=top_value,
            least_common_value=bottom_value,
            histogram=CategoricalHistogramData.from_series(series),
        )
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
class NumericalDistribution(BaseModel):
    """Summary of a numerical column's empirical distribution.

    Attributes:
        min: Minimum value in the distribution.
        max: Maximum value in the distribution.
        mean: Arithmetic mean (average) of all values.
        stddev: Standard deviation measuring the spread of values around the mean.
        median: Median value of the distribution.
    """

    min: float | int
    max: float | int
    mean: float
    stddev: float
    median: float

    @field_validator("min", "max", "mean", "stddev", "median", mode="before")
    def ensure_python_types(cls, v: float | int) -> float | int:
        """Route the value through the reporting helper as an int if it is integer-like, else as a float."""
        target_type = int if is_int(v) else float
        return prepare_number_for_reporting(v, target_type)

    @classmethod
    def from_series(cls, series: pd.Series) -> Self:
        """Compute every summary statistic from ``series``, skipping nulls."""
        lowest = series.min(skipna=True)
        highest = series.max(skipna=True)
        average = series.mean(skipna=True)
        spread = series.std(skipna=True)
        midpoint = series.median(skipna=True)
        return cls(min=lowest, max=highest, mean=average, stddev=spread, median=midpoint)
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
# Union of every built-in statistics model.
ColumnStatisticsT: TypeAlias = (
    GeneralColumnStatistics
    | LLMTextColumnStatistics
    | LLMCodeColumnStatistics
    | LLMStructuredColumnStatistics
    | LLMJudgedColumnStatistics
    | SamplerColumnStatistics
    | SeedDatasetColumnStatistics
    | ValidationColumnStatistics
    | ExpressionColumnStatistics
)
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
# Lookup from built-in column type to the statistics model that describes it.
DEFAULT_COLUMN_STATISTICS_MAP = {
    DataDesignerColumnType.EXPRESSION: ExpressionColumnStatistics,
    DataDesignerColumnType.LLM_CODE: LLMCodeColumnStatistics,
    DataDesignerColumnType.LLM_JUDGE: LLMJudgedColumnStatistics,
    DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnStatistics,
    DataDesignerColumnType.LLM_TEXT: LLMTextColumnStatistics,
    DataDesignerColumnType.SAMPLER: SamplerColumnStatistics,
    DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnStatistics,
    DataDesignerColumnType.VALIDATION: ValidationColumnStatistics,
}
|
|
407
|
-
|
|
408
|
-
# Runs at import time: each column generator plugin gets its own statistics model.
for plugin in PluginManager().get_column_generator_plugins():
    plugin_stats_cls_name = f"{plugin.config_type_as_class_name}ColumnStatistics"

    # The generated model subclasses GeneralColumnStatistics; only its discriminator,
    # pinned to the plugin name, differs.
    plugin_stats_cls = create_model(
        plugin_stats_cls_name,
        __base__=GeneralColumnStatistics,
        column_type=(Literal[plugin.name], plugin.name),
    )

    # Extend both the union alias and the type-to-model lookup with the new model.
    ColumnStatisticsT |= plugin_stats_cls
    DEFAULT_COLUMN_STATISTICS_MAP[DataDesignerColumnType(plugin.name)] = plugin_stats_cls
|
|
@@ -1,84 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from functools import cached_property
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Annotated
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel, Field, field_validator
|
|
11
|
-
|
|
12
|
-
from data_designer.config.analysis.column_profilers import ColumnProfilerResultsT
|
|
13
|
-
from data_designer.config.analysis.column_statistics import ColumnStatisticsT
|
|
14
|
-
from data_designer.config.analysis.utils.reporting import generate_analysis_report
|
|
15
|
-
from data_designer.config.column_types import get_column_display_order
|
|
16
|
-
from data_designer.config.utils.constants import EPSILON
|
|
17
|
-
from data_designer.config.utils.numerical_helpers import prepare_number_for_reporting
|
|
18
|
-
|
|
19
|
-
if TYPE_CHECKING:
|
|
20
|
-
from data_designer.config.analysis.utils.reporting import ReportSection
|
|
21
|
-
from data_designer.config.column_types import DataDesignerColumnType
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class DatasetProfilerResults(BaseModel):
    """Container for complete dataset profiling and analysis results.

    Stores profiling results for a generated dataset, including statistics for all columns,
    dataset-level metadata, and optional advanced profiler results. Provides methods for
    computing derived metrics and generating formatted reports.

    Attributes:
        num_records: Actual number of records successfully generated in the dataset.
        target_num_records: Target number of records that were requested to be generated.
        column_statistics: List of statistics objects for all columns in the dataset. Each
            column has statistics appropriate to its type. Must contain at least one column.
        side_effect_column_names: Column names that were generated as side effects of other columns.
        column_profiles: Column profiler results for specific columns when configured.
    """

    num_records: int
    target_num_records: int
    column_statistics: list[Annotated[ColumnStatisticsT, Field(discriminator="column_type")]] = Field(..., min_length=1)
    side_effect_column_names: list[str] | None = None
    column_profiles: list[ColumnProfilerResultsT] | None = None

    @field_validator("num_records", "target_num_records", mode="before")
    def ensure_python_integers(cls, v: int) -> int:
        """Route the incoming value through the int reporting helper before validation."""
        return prepare_number_for_reporting(v, int)

    @property
    def percent_complete(self) -> float:
        """Returns the completion percentage of the dataset."""
        # EPSILON keeps the division defined when the target is zero.
        return 100 * self.num_records / (self.target_num_records + EPSILON)

    @cached_property
    def column_types(self) -> list[str]:
        """Returns a sorted list of unique column types present in the dataset.

        Types follow the order given by ``get_column_display_order()``. Types absent from
        that order are placed last, sorted by name.
        """
        display_order = get_column_display_order()
        unranked = len(display_order)

        def sort_key(column_type: str) -> tuple[int, str]:
            rank = display_order.index(column_type) if column_type in display_order else unranked
            # Types outside the display order all share the same rank. Without the name
            # tie-break their order would follow set iteration order, which can change
            # between interpreter runs.
            return rank, str(column_type)

        return sorted({c.column_type for c in self.column_statistics}, key=sort_key)

    def get_column_statistics_by_type(self, column_type: DataDesignerColumnType) -> list[ColumnStatisticsT]:
        """Filters column statistics to return only those of the specified type."""
        return [c for c in self.column_statistics if c.column_type == column_type]

    def to_report(
        self,
        save_path: str | Path | None = None,
        include_sections: list[ReportSection | DataDesignerColumnType] | None = None,
    ) -> None:
        """Generate and print an analysis report based on the dataset profiling results.

        Args:
            save_path: Optional path to save the report. If provided, the report will be saved
                as either HTML (.html) or SVG (.svg) format. If None, the report will
                only be displayed in the console.
            include_sections: Optional list of sections to include in the report. Choices are
                any DataDesignerColumnType, "overview" (the dataset overview section),
                and "column_profilers" (all column profilers in one section). If None,
                all sections will be included.
        """
        generate_analysis_report(self, save_path, include_sections=include_sections)
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from data_designer.errors import DataDesignerError
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class AnalysisReportError(DataDesignerError):
    """Root of the exception hierarchy for analysis report failures."""
|