data-designer-config 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. data_designer/config/__init__.py +149 -0
  2. data_designer/config/_version.py +34 -0
  3. data_designer/config/analysis/__init__.py +2 -0
  4. data_designer/config/analysis/column_profilers.py +159 -0
  5. data_designer/config/analysis/column_statistics.py +421 -0
  6. data_designer/config/analysis/dataset_profiler.py +84 -0
  7. data_designer/config/analysis/utils/errors.py +10 -0
  8. data_designer/config/analysis/utils/reporting.py +192 -0
  9. data_designer/config/base.py +69 -0
  10. data_designer/config/column_configs.py +476 -0
  11. data_designer/config/column_types.py +141 -0
  12. data_designer/config/config_builder.py +595 -0
  13. data_designer/config/data_designer_config.py +40 -0
  14. data_designer/config/dataset_builders.py +13 -0
  15. data_designer/config/dataset_metadata.py +18 -0
  16. data_designer/config/default_model_settings.py +129 -0
  17. data_designer/config/errors.py +24 -0
  18. data_designer/config/interface.py +55 -0
  19. data_designer/config/models.py +486 -0
  20. data_designer/config/preview_results.py +41 -0
  21. data_designer/config/processors.py +148 -0
  22. data_designer/config/run_config.py +56 -0
  23. data_designer/config/sampler_constraints.py +52 -0
  24. data_designer/config/sampler_params.py +639 -0
  25. data_designer/config/seed.py +116 -0
  26. data_designer/config/seed_source.py +84 -0
  27. data_designer/config/seed_source_types.py +19 -0
  28. data_designer/config/testing/__init__.py +6 -0
  29. data_designer/config/testing/fixtures.py +308 -0
  30. data_designer/config/utils/code_lang.py +93 -0
  31. data_designer/config/utils/constants.py +365 -0
  32. data_designer/config/utils/errors.py +21 -0
  33. data_designer/config/utils/info.py +94 -0
  34. data_designer/config/utils/io_helpers.py +258 -0
  35. data_designer/config/utils/misc.py +78 -0
  36. data_designer/config/utils/numerical_helpers.py +30 -0
  37. data_designer/config/utils/type_helpers.py +106 -0
  38. data_designer/config/utils/visualization.py +482 -0
  39. data_designer/config/validator_params.py +94 -0
  40. data_designer/errors.py +7 -0
  41. data_designer/lazy_heavy_imports.py +56 -0
  42. data_designer/logging.py +180 -0
  43. data_designer/plugin_manager.py +78 -0
  44. data_designer/plugins/__init__.py +8 -0
  45. data_designer/plugins/errors.py +15 -0
  46. data_designer/plugins/plugin.py +141 -0
  47. data_designer/plugins/registry.py +88 -0
  48. data_designer_config-0.4.0.dist-info/METADATA +75 -0
  49. data_designer_config-0.4.0.dist-info/RECORD +50 -0
  50. data_designer_config-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,149 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
7
+ from data_designer.config.column_configs import (
8
+ EmbeddingColumnConfig,
9
+ ExpressionColumnConfig,
10
+ LLMCodeColumnConfig,
11
+ LLMJudgeColumnConfig,
12
+ LLMStructuredColumnConfig,
13
+ LLMTextColumnConfig,
14
+ SamplerColumnConfig,
15
+ Score,
16
+ SeedDatasetColumnConfig,
17
+ ValidationColumnConfig,
18
+ )
19
+ from data_designer.config.column_types import DataDesignerColumnType
20
+ from data_designer.config.config_builder import DataDesignerConfigBuilder
21
+ from data_designer.config.data_designer_config import DataDesignerConfig
22
+ from data_designer.config.dataset_builders import BuildStage
23
+ from data_designer.config.models import (
24
+ ChatCompletionInferenceParams,
25
+ EmbeddingInferenceParams,
26
+ GenerationType,
27
+ ImageContext,
28
+ ImageFormat,
29
+ ManualDistribution,
30
+ ManualDistributionParams,
31
+ Modality,
32
+ ModalityContext,
33
+ ModalityDataType,
34
+ ModelConfig,
35
+ ModelProvider,
36
+ UniformDistribution,
37
+ UniformDistributionParams,
38
+ )
39
+ from data_designer.config.processors import (
40
+ DropColumnsProcessorConfig,
41
+ ProcessorType,
42
+ SchemaTransformProcessorConfig,
43
+ )
44
+ from data_designer.config.run_config import RunConfig
45
+ from data_designer.config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
46
+ from data_designer.config.sampler_params import (
47
+ BernoulliMixtureSamplerParams,
48
+ BernoulliSamplerParams,
49
+ BinomialSamplerParams,
50
+ CategorySamplerParams,
51
+ DatetimeSamplerParams,
52
+ GaussianSamplerParams,
53
+ PersonFromFakerSamplerParams,
54
+ PersonSamplerParams,
55
+ PoissonSamplerParams,
56
+ SamplerType,
57
+ ScipySamplerParams,
58
+ SubcategorySamplerParams,
59
+ TimeDeltaSamplerParams,
60
+ UniformSamplerParams,
61
+ UUIDSamplerParams,
62
+ )
63
+ from data_designer.config.seed import (
64
+ IndexRange,
65
+ PartitionBlock,
66
+ SamplingStrategy,
67
+ SeedConfig,
68
+ )
69
+ from data_designer.config.seed_source import (
70
+ DataFrameSeedSource,
71
+ HuggingFaceSeedSource,
72
+ LocalFileSeedSource,
73
+ )
74
+ from data_designer.config.utils.code_lang import CodeLang
75
+ from data_designer.config.utils.info import InfoType
76
+ from data_designer.config.validator_params import (
77
+ CodeValidatorParams,
78
+ LocalCallableValidatorParams,
79
+ RemoteValidatorParams,
80
+ ValidatorType,
81
+ )
82
+
83
+
84
def get_config_exports() -> list[str]:
    """Return the names of the objects this module lists as config exports.

    Returns:
        The ``__name__`` of each listed object, in the fixed order below.
    """
    # Order is kept exactly as declared; it is not strictly alphabetical.
    exported_objects = (
        SchemaTransformProcessorConfig,
        BernoulliMixtureSamplerParams,
        BernoulliSamplerParams,
        BinomialSamplerParams,
        CategorySamplerParams,
        CodeLang,
        CodeValidatorParams,
        ColumnInequalityConstraint,
        ChatCompletionInferenceParams,
        DataDesignerColumnType,
        DataDesignerConfig,
        DataDesignerConfigBuilder,
        DataFrameSeedSource,
        BuildStage,
        DatetimeSamplerParams,
        DropColumnsProcessorConfig,
        EmbeddingColumnConfig,
        EmbeddingInferenceParams,
        ExpressionColumnConfig,
        GaussianSamplerParams,
        GenerationType,
        HuggingFaceSeedSource,
        IndexRange,
        InfoType,
        ImageContext,
        ImageFormat,
        JudgeScoreProfilerConfig,
        LLMCodeColumnConfig,
        LLMJudgeColumnConfig,
        LLMStructuredColumnConfig,
        LLMTextColumnConfig,
        LocalCallableValidatorParams,
        LocalFileSeedSource,
        ManualDistribution,
        ManualDistributionParams,
        Modality,
        ModalityContext,
        ModalityDataType,
        ModelConfig,
        ModelProvider,
        PartitionBlock,
        PersonSamplerParams,
        PersonFromFakerSamplerParams,
        PoissonSamplerParams,
        ProcessorType,
        RemoteValidatorParams,
        RunConfig,
        SamplerColumnConfig,
        SamplerType,
        SamplingStrategy,
        ScalarInequalityConstraint,
        ScipySamplerParams,
        Score,
        SeedConfig,
        SeedDatasetColumnConfig,
        SubcategorySamplerParams,
        TimeDeltaSamplerParams,
        UniformDistribution,
        UniformDistributionParams,
        UniformSamplerParams,
        UUIDSamplerParams,
        ValidationColumnConfig,
        ValidatorType,
    )
    return [exported.__name__ for exported in exported_objects]
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
# Names exposed by ``from <module> import *``.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Hard-coded to False, so the ``typing`` import below never runs at runtime;
# the real alias definitions are presumably there for static type checkers.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple, Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders so the annotations below still resolve.
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Each value is bound under both its plain and its dunder name.
version = "0.4.0"
__version__ = version

version_tuple = (0, 4, 0)
__version_tuple__ = version_tuple

commit_id = None
__commit_id__ = commit_id
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,159 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from abc import ABC
7
+ from enum import Enum
8
+
9
+ from pydantic import BaseModel, Field
10
+ from rich.panel import Panel
11
+ from rich.table import Column, Table
12
+ from typing_extensions import TypeAlias
13
+
14
+ from data_designer.config.analysis.column_statistics import (
15
+ CategoricalDistribution,
16
+ CategoricalHistogramData,
17
+ ColumnDistributionType,
18
+ MissingValue,
19
+ NumericalDistribution,
20
+ )
21
+ from data_designer.config.analysis.utils.reporting import TITLE_STYLE, create_judge_score_summary_table
22
+ from data_designer.config.base import ConfigBase
23
+ from data_designer.config.utils.visualization import ColorPalette
24
+
25
+
26
class ColumnProfilerType(str, Enum):
    """String-valued identifiers for the available column profilers."""

    JUDGE_SCORE = "judge-score"
28
+
29
+
30
class ColumnProfilerResults(BaseModel, ABC):
    """Base model for the output of a column profiler.

    Subclasses carry the profiler-specific analysis results and override
    ``create_report_section`` to render them for display.
    """

    def create_report_section(self) -> Panel:
        """Build the Rich panel that presents these results.

        Returns:
            A Rich ``Panel``. This base implementation only produces a
            "Not Implemented" placeholder naming the concrete class;
            subclasses are expected to override it with real formatting.
        """
        class_name = self.__class__.__name__
        placeholder_text = f"Report section generation not implemented for '{class_name}'."
        warning_border = f"bold {ColorPalette.YELLOW.value}"
        return Panel(
            placeholder_text,
            title="Not Implemented",
            border_style=warning_border,
            padding=(1, 2),
        )
51
+
52
+
53
class JudgeScoreProfilerConfig(ConfigBase):
    """Settings for the LLM-as-a-judge score profiler.

    Attributes:
        model_alias: Alias of the LLM used to write the score distribution
            summaries. It must match a model alias defined in the Data Designer
            configuration.
        summary_score_sample_size: How many score samples are included in the
            prompt when asking the LLM for a summary. More samples give the
            model more context at the cost of more tokens. Validated as
            ``>= 1``; defaults to 20. The annotation also permits ``None``.
            NOTE(review): the meaning of ``None`` is not visible in this
            module - confirm against the profiler implementation.
    """

    model_alias: str
    summary_score_sample_size: int | None = Field(ge=1, default=20)
66
+
67
+
68
class JudgeScoreSample(BaseModel):
    """One judge score paired with the reasoning given for it.

    Samples of this kind are taken from an LLM-as-a-judge column and shown to
    the summarizing LLM as examples of how scores were assigned.

    Attributes:
        score: Value the judge assigned; an ``int`` for numeric scores or a
            ``str`` for categorical ones.
        reasoning: The judge's explanation for that score.
    """

    score: int | str
    reasoning: str
81
+
82
+
83
class JudgeScoreDistributions(BaseModel):
    """Distribution analysis for every score dimension of a judge column.

    An LLM-as-a-judge column can score several dimensions (for example
    "relevance" or "fluency"). Every mapping below is keyed by the score
    dimension name and holds that dimension's data from the generated dataset.

    Attributes:
        scores: Score values recorded for each dimension.
        reasoning: Reasoning texts recorded for each dimension.
        distribution_types: How each dimension's distribution is classified.
        distributions: Computed distribution statistics for each dimension, or
            a ``MissingValue`` marker in their place.
        histograms: Histogram data for each dimension, or a ``MissingValue``
            marker in its place.
    """

    scores: dict[str, list[int | str]]
    reasoning: dict[str, list[str]]
    distribution_types: dict[str, ColumnDistributionType]
    distributions: dict[str, CategoricalDistribution | NumericalDistribution | MissingValue]
    histograms: dict[str, CategoricalHistogramData | MissingValue]
103
+
104
+
105
class JudgeScoreSummary(BaseModel):
    """LLM-written summary of a single judge score dimension.

    The judge score profiler asks an LLM to describe the distribution and
    patterns found in a dimension's score-reasoning pairs; this model keeps
    that text together with the samples the LLM was shown.

    Attributes:
        score_name: Score dimension being summarized (e.g., "relevance",
            "fluency").
        summary: Natural language description of the scoring patterns,
            distribution characteristics, and notable trends for the dimension.
        score_samples: The score-reasoning pairs used to generate the summary.
    """

    score_name: str
    summary: str
    score_samples: list[JudgeScoreSample]
123
+
124
+
125
class JudgeScoreProfilerResults(ColumnProfilerResults):
    """Container for complete judge score profiler analysis results.

    Attributes:
        column_name: Name of the judge column that was profiled.
        summaries: Mapping of each score dimension name to its LLM-generated summary.
        score_distributions: Complete distribution analysis across all score
            dimensions, or a ``MissingValue`` marker when it is unavailable.
    """

    column_name: str
    summaries: dict[str, JudgeScoreSummary]
    score_distributions: JudgeScoreDistributions | MissingValue

    def create_report_section(self) -> Panel:
        """Render one summary table per score dimension inside a titled panel.

        Returns:
            A Rich ``Panel`` whose body is a single-column grid holding one
            judge score summary table for every entry in ``summaries``.

        Raises:
            KeyError: If a summarized score dimension has no entry in
                ``score_distributions.histograms``.
        """
        layout = Table.grid(Column(), expand=True, padding=(2, 0))
        distributions = self.score_distributions

        for score_name, score_summary in self.summaries.items():
            if isinstance(distributions, MissingValue):
                # No distributions were computed, so there is no ``histograms``
                # mapping to index. Hand the marker itself to the table
                # builder, which already receives ``MissingValue`` entries
                # coming out of ``histograms``.
                histogram = distributions
            else:
                histogram = distributions.histograms[score_name]

            layout.add_row(
                create_judge_score_summary_table(
                    score_name=score_name,
                    histogram=histogram,
                    summary=score_summary.summary,
                )
            )

        return Panel(
            layout,
            title=f"[{TITLE_STYLE}]LLM-as-a-Judge Score Profile: '{self.column_name}'[/{TITLE_STYLE}]",
            padding=(1, 2),
        )
155
+
156
+
157
# Alias for the column profiler config types; the judge score config is the
# only one at present.
ColumnProfilerConfigT: TypeAlias = JudgeScoreProfilerConfig

# Alias for the matching column profiler result types.
ColumnProfilerResultsT: TypeAlias = JudgeScoreProfilerResults