data-designer 0.3.8rc2-py3-none-any.whl → 0.4.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0rc1.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
data_designer/engine/analysis/column_profilers/judge_score_profiler.py
@@ -1,153 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import logging
- import random
- from typing import TYPE_CHECKING
-
- from data_designer.config.analysis.column_profilers import (
-     JudgeScoreProfilerConfig,
-     JudgeScoreProfilerResults,
-     JudgeScoreSummary,
- )
- from data_designer.config.analysis.column_statistics import (
-     ColumnDistributionType,
-     MissingValue,
- )
- from data_designer.config.column_types import DataDesignerColumnType
- from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
- from data_designer.engine.analysis.utils.judge_score_processing import (
-     extract_judge_score_distributions,
-     sample_scores_and_reasoning,
- )
- from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe
-
- if TYPE_CHECKING:
-     from data_designer.config.analysis.column_profilers import JudgeScoreSample
-     from data_designer.config.analysis.column_statistics import (
-         CategoricalDistribution,
-         CategoricalHistogramData,
-         NumericalDistribution,
-     )
-     from data_designer.engine.models.facade import ModelFacade
-
- logger = logging.getLogger(__name__)
-
-
- class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
-     @staticmethod
-     def get_applicable_column_types() -> list[DataDesignerColumnType]:
-         return [DataDesignerColumnType.LLM_JUDGE]
-
-     def get_model(self, model_alias: str) -> ModelFacade:
-         return self.resource_provider.model_registry.get_model(model_alias=model_alias)
-
-     def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> JudgeScoreProfilerResults:
-         column_config, df = column_config_with_df.as_tuple()
-
-         logger.info(
-             f"{column_config.get_column_emoji()} Analyzing LLM-as-judge scores for column: '{column_config.name}'"
-         )
-
-         score_summaries = {}
-         score_distributions = extract_judge_score_distributions(column_config, df)
-
-         if self.config.summary_score_sample_size is None or isinstance(score_distributions, MissingValue):
-             return JudgeScoreProfilerResults(
-                 summaries={},
-                 column_name=column_config.name,
-                 score_distributions=score_distributions,
-             )
-
-         for score in column_config.scores:
-             score_name = score.name
-             logger.info(f"{random.choice(['👩‍⚖️', '👨‍⚖️'])} Summarizing LLM-as-judge score: '{score_name}'")
-             score_sample = sample_scores_and_reasoning(
-                 scores=score_distributions.scores[score_name],
-                 reasoning=score_distributions.reasoning[score_name],
-                 num_samples=self.config.summary_score_sample_size,
-             )
-
-             score_summaries[score_name] = self._summarize_score_sample(
-                 name=score_name,
-                 sample=score_sample,
-                 histogram=score_distributions.histograms[score_name],
-                 distribution=score_distributions.distributions[score_name],
-                 distribution_type=score_distributions.distribution_types[score_name],
-             )
-
-         return JudgeScoreProfilerResults(
-             column_name=column_config.name,
-             summaries=score_summaries,
-             score_distributions=score_distributions,
-         )
-
-     def _summarize_score_sample(
-         self,
-         name: str,
-         sample: list[JudgeScoreSample],
-         histogram: CategoricalHistogramData,
-         distribution: CategoricalDistribution | NumericalDistribution | MissingValue,
-         distribution_type: ColumnDistributionType,
-     ) -> JudgeScoreSummary:
-         if isinstance(distribution, MissingValue) or not sample:
-             return JudgeScoreSummary(
-                 score_name=name,
-                 summary="No judge score information available to summarize.",
-                 score_samples=sample,
-             )
-
-         category_info = []
-         total_count = sum(histogram.counts)
-         for cat, count in zip(histogram.categories, histogram.counts):
-             percentage = (count / total_count) * 100
-             category_info.append(f"{cat}: {count} records ({percentage:.1f}%)")
-
-         distribution_context = f"Score distribution - {', '.join(category_info)}, "
-         if distribution_type == ColumnDistributionType.CATEGORICAL:
-             distribution_context += f"Most common value: {distribution.most_common_value}. "
-         if distribution_type == ColumnDistributionType.NUMERICAL:
-             distribution_context += f"Mean score: {distribution.mean:.2f}. "
-
-         logger.info(f"    |-- number of score samples: {len(sample)}")
-         logger.info(f"    |-- {distribution_context.lower()}")
-
-         combined_reasoning = "\n".join([r.reasoning for r in sample])
-         prompt = (
-             f"Based on the following evaluator reasoning for the '{name}' criterion, "
-             "provide a concise summary that captures both the strengths and areas for improvement mentioned. "
-             "Be specific about what worked well and what needs improvement.\n\n"
-             f"Overall distribution of scores: {distribution_context}"
-             f"\nA sample of reasoning:\n{combined_reasoning}\n\n"
-             "Do not include any titles like `Summary` or `Summary:`. "
-             "Do not wrap the summary in quotation marks. "
-             "YOU WILL PRODUCE LESS THAN 75 WORDS in a readable sentence format. "
-             "No need to use bullets or headers. Write naturally."
-         )
-
-         system_prompt = (
-             "You are an expert at distilling complex feedback into concise summaries. "
-             "Focus on specificity and balance, incorporating both the distribution context and individual reasoning examples."
-         )
-
-         try:
-             model = self.get_model(self.config.model_alias)
-             recipe = TextResponseRecipe()
-             summary, _ = model.generate(
-                 prompt=recipe.apply_recipe_to_user_prompt(prompt),
-                 system_prompt=recipe.apply_recipe_to_system_prompt(system_prompt),
-                 parser=recipe.parse,
-             )
-             return JudgeScoreSummary(
-                 score_name=name,
-                 summary=summary.strip(),
-                 score_samples=sample,
-             )
-         except Exception as e:
-             return JudgeScoreSummary(
-                 score_name=name,
-                 summary=f"Score summarization failed: {e}",
-                 score_samples=sample,
-             )
data_designer/engine/analysis/column_profilers/registry.py
@@ -1,22 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.config.analysis.column_profilers import ColumnProfilerType
- from data_designer.config.base import ConfigBase
- from data_designer.engine.analysis.column_profilers.base import ColumnProfiler
- from data_designer.engine.analysis.column_profilers.judge_score_profiler import (
-     JudgeScoreProfiler,
-     JudgeScoreProfilerConfig,
- )
- from data_designer.engine.registry.base import TaskRegistry
-
-
- class ColumnProfilerRegistry(TaskRegistry[ColumnProfilerType, ColumnProfiler, ConfigBase]): ...
-
-
- def create_default_column_profiler_registry() -> ColumnProfilerRegistry:
-     registry = ColumnProfilerRegistry()
-     registry.register(ColumnProfilerType.JUDGE_SCORE, JudgeScoreProfiler, JudgeScoreProfilerConfig, False)
-     return registry
data_designer/engine/analysis/column_statistics.py
@@ -1,145 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import logging
- from typing import TYPE_CHECKING, Any, TypeAlias
-
- from pydantic import BaseModel
- from typing_extensions import Self
-
- from data_designer.config.analysis.column_statistics import (
-     DEFAULT_COLUMN_STATISTICS_MAP,
-     ColumnStatisticsT,
-     GeneralColumnStatistics,
- )
- from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType
- from data_designer.config.sampler_params import SamplerType, is_numerical_sampler_type
- from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame
- from data_designer.engine.analysis.utils.column_statistics_calculations import (
-     ColumnDistributionType,
-     calculate_column_distribution,
-     calculate_general_column_info,
-     calculate_token_stats,
-     calculate_validation_column_info,
- )
- from data_designer.lazy_heavy_imports import pd
-
- if TYPE_CHECKING:
-     import pandas as pd
-
- logger = logging.getLogger(__name__)
-
-
- class GeneralColumnStatisticsCalculator(BaseModel):
-     column_config_with_df: ColumnConfigWithDataFrame
-
-     @property
-     def column_config(self) -> ColumnConfigT:
-         return self.column_config_with_df.column_config
-
-     @property
-     def df(self) -> pd.DataFrame:
-         return self.column_config_with_df.df
-
-     @property
-     def column_statistics_type(self) -> type[ColumnStatisticsT]:
-         return DEFAULT_COLUMN_STATISTICS_MAP.get(self.column_config.column_type, GeneralColumnStatistics)
-
-     def calculate(self) -> Self:
-         """Calculate all the column statistics fields for the given column configuration and dataset profiler.
-
-         This method dynamically collects all class methods prefixed with 'calculate_' and invokes them to
-         compute various column statistics, aggregating their results into a single statistics object.
-         """
-         calculate_methods = [
-             name for name in dir(self) if name.startswith("calculate_") and callable(getattr(self, name))
-         ]
-         return self.column_statistics_type(
-             column_name=self.column_config.name,
-             **{k: v for name in calculate_methods for k, v in getattr(self, name)().items()},
-         )
-
-     def calculate_general_column_info(self) -> dict[str, Any]:
-         return calculate_general_column_info(self.column_config.name, self.df)
-
-     def __repr__(self) -> str:
-         params = []
-         for field, value in self.model_dump(mode="json").items():
-             params.append(f"    {field}: {value}")
-         params_str = "\n".join(params)
-         return f"{self.__class__.__name__}(\n{params_str}\n)"
-
-
- class LLMTextColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
-     def calculate_token_stats(self) -> dict[str, Any]:
-         return calculate_token_stats(self.column_config, self.df)
-
-
- class LLMCodeColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator): ...
-
-
- class LLMStructuredColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator): ...
-
-
- class LLMJudgedColumnStatisticsCalculator(LLMTextColumnStatisticsCalculator): ...
-
-
- class SamplerColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
-     def calculate_sampler_distribution(self) -> dict[str, Any]:
-         make_dist, dist_type = False, ColumnDistributionType.OTHER
-         if self.column_config.sampler_type in [SamplerType.CATEGORY, SamplerType.SUBCATEGORY]:
-             make_dist, dist_type = True, ColumnDistributionType.CATEGORICAL
-         elif is_numerical_sampler_type(self.column_config.sampler_type):
-             make_dist, dist_type = True, ColumnDistributionType.NUMERICAL
-         return (
-             {
-                 "sampler_type": SamplerType(self.column_config.sampler_type),
-                 **calculate_column_distribution(self.column_config.name, self.df, dist_type),
-             }
-             if make_dist
-             else {
-                 "sampler_type": SamplerType(self.column_config.sampler_type),
-                 "distribution_type": dist_type,
-                 "distribution": None,
-             }
-         )
-
-
- class SeedDatasetColumnStatisticsCalculator(GeneralColumnStatisticsCalculator): ...
-
-
- class ValidationColumnStatisticsCalculator(GeneralColumnStatisticsCalculator):
-     def calculate_validation_column_info(self) -> dict[str, Any]:
-         return calculate_validation_column_info(self.column_config.name, self.df)
-
-
- class ExpressionColumnStatisticsCalculator(GeneralColumnStatisticsCalculator): ...
-
-
- ColumnStatisticsCalculatorT: TypeAlias = (
-     ExpressionColumnStatisticsCalculator
-     | ValidationColumnStatisticsCalculator
-     | GeneralColumnStatisticsCalculator
-     | LLMCodeColumnStatisticsCalculator
-     | LLMJudgedColumnStatisticsCalculator
-     | LLMStructuredColumnStatisticsCalculator
-     | LLMTextColumnStatisticsCalculator
-     | SamplerColumnStatisticsCalculator
-     | SeedDatasetColumnStatisticsCalculator
- )
- DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP = {
-     DataDesignerColumnType.EXPRESSION: ExpressionColumnStatisticsCalculator,
-     DataDesignerColumnType.VALIDATION: ValidationColumnStatisticsCalculator,
-     DataDesignerColumnType.LLM_CODE: LLMCodeColumnStatisticsCalculator,
-     DataDesignerColumnType.LLM_JUDGE: LLMJudgedColumnStatisticsCalculator,
-     DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnStatisticsCalculator,
-     DataDesignerColumnType.LLM_TEXT: LLMTextColumnStatisticsCalculator,
-     DataDesignerColumnType.SAMPLER: SamplerColumnStatisticsCalculator,
-     DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnStatisticsCalculator,
- }
-
-
- def get_column_statistics_calculator(column_type: DataDesignerColumnType) -> ColumnStatisticsCalculatorT:
-     return DEFAULT_COLUMN_STATISTICS_CALCULATOR_MAP.get(column_type, GeneralColumnStatisticsCalculator)
data_designer/engine/analysis/dataset_profiler.py
@@ -1,149 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import logging
- from collections.abc import Sequence
- from functools import cached_property
- from typing import TYPE_CHECKING
-
- from pydantic import Field, field_validator
-
- from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
- from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
- from data_designer.config.base import ConfigBase
- from data_designer.config.column_configs import SingleColumnConfig
- from data_designer.config.column_types import ColumnConfigT
- from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
- from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
- from data_designer.engine.analysis.errors import DatasetProfilerConfigurationError
- from data_designer.engine.analysis.utils.column_statistics_calculations import has_pyarrow_backend
- from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig
- from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
- from data_designer.engine.resources.resource_provider import ResourceProvider
- from data_designer.lazy_heavy_imports import pa, pd
-
- if TYPE_CHECKING:
-     import pandas as pd
-     import pyarrow as pa
-
- logger = logging.getLogger(__name__)
-
-
- class DatasetProfilerConfig(ConfigBase):
-     column_configs: Sequence[DatasetBuilderColumnConfigT] = Field(..., min_length=1)
-     column_profiler_configs: Sequence[ColumnProfilerConfigT] | None = None
-
-     @field_validator("column_configs")
-     def flatten_and_validate_column_configs(cls, v: list[DatasetBuilderColumnConfigT]) -> list[ColumnConfigT]:
-         column_configs = []
-         for config in v:
-             if isinstance(config, SingleColumnConfig) and not config.drop:
-                 column_configs.append(config)
-             elif isinstance(config, MultiColumnConfig):
-                 column_configs.extend([c for c in config.columns if not c.drop])
-         if len(column_configs) == 0:
-             raise DatasetProfilerConfigurationError("All columns were dropped!")
-         return column_configs
-
-
- class DataDesignerDatasetProfiler:
-     def __init__(self, config: DatasetProfilerConfig, resource_provider: ResourceProvider):
-         self.config = config
-         self.resource_provider = resource_provider
-         self._validate_column_profiler_configs()
-
-     @cached_property
-     def column_names_from_configs(self) -> list[str]:
-         return [c.name for c in self.config.column_configs]
-
-     @cached_property
-     def registry(self) -> DataDesignerRegistry:
-         return DataDesignerRegistry()
-
-     def profile_dataset(
-         self,
-         target_num_records: int,
-         dataset: pd.DataFrame,
-     ) -> DatasetProfilerResults:
-         logger.info("📐 Measuring dataset column statistics:")
-
-         self._validate_schema_consistency(list(dataset.columns))
-         dataset = self._convert_to_pyarrow_backend_if_needed(dataset)
-
-         column_statistics = []
-         for c in self.config.column_configs:
-             logger.info(f"  |-- {c.get_column_emoji()} column: '{c.name}'")
-             column_statistics.append(
-                 get_column_statistics_calculator(c.column_type)(
-                     column_config_with_df=ColumnConfigWithDataFrame(column_config=c, df=dataset)
-                 ).calculate()
-             )
-
-         column_profiles = []
-         for profiler_config in self.config.column_profiler_configs or []:
-             profiler = self._create_column_profiler(profiler_config)
-             applicable_column_types = profiler.get_applicable_column_types()
-             for c in self.config.column_configs:
-                 if c.column_type in applicable_column_types:
-                     params = ColumnConfigWithDataFrame(column_config=c, df=dataset)
-                     column_profiles.append(profiler.profile(params))
-             if len(column_profiles) == 0:
-                 logger.warning(
-                     f"⚠️ No applicable column types found for the '{profiler.name}' profiler. "
-                     f"This profiler is applicable to the following column types: {applicable_column_types}"
-                 )
-
-         return DatasetProfilerResults(
-             num_records=len(dataset),
-             target_num_records=target_num_records,
-             side_effect_column_names=list(set(dataset.columns) - set(self.column_names_from_configs)),
-             column_statistics=column_statistics,
-             column_profiles=column_profiles if column_profiles else None,
-         )
-
-     def _convert_to_pyarrow_backend_if_needed(self, dataset: pd.DataFrame) -> pd.DataFrame:
-         if not has_pyarrow_backend(dataset):
-             try:
-                 dataset = pa.Table.from_pandas(dataset).to_pandas(types_mapper=pd.ArrowDtype)
-             except Exception as e:
-                 # For ArrowTypeError, the second arg contains the more informative message
-                 if isinstance(e, pa.lib.ArrowTypeError) and len(e.args) > 1:
-                     error_msg = str(e.args[1])
-                 else:
-                     error_msg = str(e)
-                 for col in dataset.columns:
-                     # Make sure column names are clear in the error message
-                     error_msg = error_msg.replace(col, f"'{col}'")
-                 logger.warning("⚠️ Unable to convert the dataset to a PyArrow backend")
-                 logger.warning(f"  |-- Conversion Error Message: {error_msg}")
-                 logger.warning("  |-- This is often due to at least one column having mixed data types")
-                 logger.warning(
-                     "  |-- Note: Reported data types will be inferred from the first non-null value of each column"
-                 )
-         return dataset
-
-     def _create_column_profiler(self, profiler_config: ColumnProfilerConfigT) -> ColumnProfiler:
-         return self.registry.column_profilers.get_for_config_type(type(profiler_config))(
-             config=profiler_config, resource_provider=self.resource_provider
-         )
-
-     def _validate_column_profiler_configs(self) -> None:
-         if self.config.column_profiler_configs:
-             if self.resource_provider.model_registry is None:
-                 raise DatasetProfilerConfigurationError("Model registry is required for column profiler configs")
-             self._validate_model_configs()
-
-     def _validate_model_configs(self) -> None:
-         aliases = [alias for alias in self.resource_provider.model_registry.model_configs.keys()]
-         for column_config in self.config.column_configs:
-             if hasattr(column_config, "model_alias") and column_config.model_alias not in aliases:
-                 raise DatasetProfilerConfigurationError(
-                     f"Model config '{column_config.model_alias}' not found in model configs"
-                 )
-
-     def _validate_schema_consistency(self, dataset_column_names: list[str]) -> None:
-         for column_name in self.column_names_from_configs:
-             if column_name not in dataset_column_names:
-                 raise DatasetProfilerConfigurationError(f"Column '{column_name}' not found in dataset")
data_designer/engine/analysis/errors.py
@@ -1,9 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.errors import DataDesignerError
-
-
- class DatasetProfilerConfigurationError(DataDesignerError): ...