data-designer-config 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. data_designer/config/__init__.py +149 -0
  2. data_designer/config/_version.py +34 -0
  3. data_designer/config/analysis/__init__.py +2 -0
  4. data_designer/config/analysis/column_profilers.py +159 -0
  5. data_designer/config/analysis/column_statistics.py +421 -0
  6. data_designer/config/analysis/dataset_profiler.py +84 -0
  7. data_designer/config/analysis/utils/errors.py +10 -0
  8. data_designer/config/analysis/utils/reporting.py +192 -0
  9. data_designer/config/base.py +69 -0
  10. data_designer/config/column_configs.py +476 -0
  11. data_designer/config/column_types.py +141 -0
  12. data_designer/config/config_builder.py +595 -0
  13. data_designer/config/data_designer_config.py +40 -0
  14. data_designer/config/dataset_builders.py +13 -0
  15. data_designer/config/dataset_metadata.py +18 -0
  16. data_designer/config/default_model_settings.py +129 -0
  17. data_designer/config/errors.py +24 -0
  18. data_designer/config/interface.py +55 -0
  19. data_designer/config/models.py +486 -0
  20. data_designer/config/preview_results.py +41 -0
  21. data_designer/config/processors.py +148 -0
  22. data_designer/config/run_config.py +56 -0
  23. data_designer/config/sampler_constraints.py +52 -0
  24. data_designer/config/sampler_params.py +639 -0
  25. data_designer/config/seed.py +116 -0
  26. data_designer/config/seed_source.py +84 -0
  27. data_designer/config/seed_source_types.py +19 -0
  28. data_designer/config/testing/__init__.py +6 -0
  29. data_designer/config/testing/fixtures.py +308 -0
  30. data_designer/config/utils/code_lang.py +93 -0
  31. data_designer/config/utils/constants.py +365 -0
  32. data_designer/config/utils/errors.py +21 -0
  33. data_designer/config/utils/info.py +94 -0
  34. data_designer/config/utils/io_helpers.py +258 -0
  35. data_designer/config/utils/misc.py +78 -0
  36. data_designer/config/utils/numerical_helpers.py +30 -0
  37. data_designer/config/utils/type_helpers.py +106 -0
  38. data_designer/config/utils/visualization.py +482 -0
  39. data_designer/config/validator_params.py +94 -0
  40. data_designer/errors.py +7 -0
  41. data_designer/lazy_heavy_imports.py +56 -0
  42. data_designer/logging.py +180 -0
  43. data_designer/plugin_manager.py +78 -0
  44. data_designer/plugins/__init__.py +8 -0
  45. data_designer/plugins/errors.py +15 -0
  46. data_designer/plugins/plugin.py +141 -0
  47. data_designer/plugins/registry.py +88 -0
  48. data_designer_config-0.4.0.dist-info/METADATA +75 -0
  49. data_designer_config-0.4.0.dist-info/RECORD +50 -0
  50. data_designer_config-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,192 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import Enum
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING
9
+
10
+ from rich.align import Align
11
+ from rich.console import Console, Group
12
+ from rich.panel import Panel
13
+ from rich.rule import Rule
14
+ from rich.table import Column, Table
15
+ from rich.text import Text
16
+
17
+ from data_designer.config.analysis.utils.errors import AnalysisReportError
18
+ from data_designer.config.column_types import (
19
+ DataDesignerColumnType,
20
+ get_column_display_order,
21
+ get_column_emoji_from_type,
22
+ )
23
+ from data_designer.config.utils.visualization import (
24
+ ColorPalette,
25
+ convert_to_row_element,
26
+ create_rich_histogram_table,
27
+ pad_console_element,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from data_designer.config.analysis.column_statistics import CategoricalHistogramData
32
+ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
33
+
34
# Rich style strings shared by the report renderers in this module.
HEADER_STYLE = "dim"

# Green accents: horizontal rules and table titles use the same style.
RULE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"
TITLE_STYLE = RULE_STYLE

# Blue accents: score names and histogram labels share the bold variant,
# histogram values use the dimmed one.
ACCENT_STYLE = f"bold {ColorPalette.BLUE.value}"
HIST_NAME_STYLE = ACCENT_STYLE
HIST_VALUE_STYLE = f"dim {ColorPalette.BLUE.value}"
40
+
41
+
42
class ReportSection(str, Enum):
    """Named, non-column-type sections that an analysis report can include."""

    # The "Dataset Overview" table (record count, column count, completeness).
    OVERVIEW = "overview"
    # The sections produced by the column profilers, rendered together.
    COLUMN_PROFILERS = "column_profilers"
45
+
46
+
47
# Sections rendered when the caller does not choose any: both named report
# sections, followed by everything get_column_display_order() returns.
DEFAULT_INCLUDE_SECTIONS = [
    ReportSection.OVERVIEW,
    ReportSection.COLUMN_PROFILERS,
    *get_column_display_order(),
]
51
+
52
+
53
def generate_analysis_report(
    analysis: DatasetProfilerResults,
    save_path: str | Path | None = None,
    include_sections: list[ReportSection | DataDesignerColumnType] | None = None,
) -> None:
    """Generate an analysis report for dataset profiling results.

    This function creates a rich-formatted report that displays dataset overview statistics
    and detailed column statistics organized by column type. The report includes visual
    elements like tables, rules, and panels to present the analysis results in an
    easy-to-read format.

    Args:
        analysis: The DatasetProfilerResults object containing the analysis data to report on.
        save_path: Optional path to save the report. If provided, the report will be saved
            as either HTML (.html) or SVG (.svg) format. If None, the report will
            only be displayed in the console.
        include_sections: Optional list of sections to include in the report. Choices are
            any Data Designer column type, "overview" (the dataset overview section),
            and "column_profilers" (all column profilers in one section). If None
            (or empty), all sections will be included.

    Raises:
        AnalysisReportError: If save_path is provided but doesn't have a .html or .svg extension.
            The check runs before anything is rendered, so an invalid path produces no
            console output.
    """
    # Fail fast: reject an unsupported destination before doing any rendering work
    # or printing to the console.
    if save_path is not None:
        save_path = str(save_path)
        if not save_path.endswith((".html", ".svg")):
            raise AnalysisReportError(
                f"🛑 The extension of the save path must be either .html or .svg. You provided {save_path}."
            )

    include_sections = include_sections or DEFAULT_INCLUDE_SECTIONS
    table_kws = dict(show_lines=True, expand=True, title_style=TITLE_STYLE)

    render_list = [Rule(title="🎨 Data Designer Dataset Profile", style=RULE_STYLE, end="\n\n")]

    if ReportSection.OVERVIEW in include_sections:
        table = Table(title="Dataset Overview", **table_kws)
        for header in ("number of records", "number of columns", "percent complete records"):
            table.add_column(header, header_style=HEADER_STYLE)

        table.add_row(
            f"{analysis.num_records:,}",
            f"{len(analysis.column_statistics):,}",
            f"{analysis.percent_complete:.1f}%",
        )

        render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    # One table per requested column type; remember which types were actually
    # shown so the token footnotes are only added when relevant.
    displayed_column_types = set()
    for column_type in analysis.column_types:
        if column_type not in include_sections:
            continue

        column_stats_list = analysis.get_column_statistics_by_type(column_type)
        if not column_stats_list:
            # Nothing to tabulate for this type; skip it instead of failing on [0].
            continue

        displayed_column_types.add(column_type)
        column_label = column_type.replace("_", " ").title().replace("Llm", "LLM")
        table = Table(
            title=f"{get_column_emoji_from_type(column_type)} {column_label} Columns",
            **table_kws,
        )

        # The first entry's row data supplies the header names for the whole table.
        for col in column_stats_list[0].create_report_row_data():
            if col == "column name":
                table.add_column(col, header_style=HEADER_STYLE)
            else:
                table.add_column(col, justify="right", header_style=HEADER_STYLE)

        for stats in column_stats_list:
            table.add_row(*[convert_to_row_element(elem) for elem in stats.create_report_row_data().values()])

        render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    if ReportSection.COLUMN_PROFILERS in include_sections:
        for profile in analysis.column_profiles or []:
            render_list.append(pad_console_element(profile.create_report_section()))

    if any("llm" in col_type for col_type in displayed_column_types):
        footnotes_text = (
            "1. All token statistics are based on a sample of max(1000, len(dataset)) records.\n"
            "2. Tokens are calculated using tiktoken's cl100k_base tokenizer."
        )

        render_list.append(
            pad_console_element(
                Panel(
                    Text.from_markup(footnotes_text.strip()),
                    title="Table Notes",
                    border_style="dim",
                    padding=(1, 2),
                )
            )
        )

    render_list.append(Rule(style=RULE_STYLE))

    # Recording is only switched on when the output has to be exported afterwards.
    console = Console(record=save_path is not None)
    console.print(Group(*render_list), markup=False)

    if save_path is None:
        return

    # The extension was validated up front, so it is one of the two supported ones.
    if save_path.endswith(".html"):
        console.save_html(save_path)
    else:
        console.save_svg(save_path, title="")
160
+
161
+
162
def create_judge_score_summary_table(
    score_name: str,
    histogram: CategoricalHistogramData,
    summary: str,
    accent_style: str = ACCENT_STYLE,
    summary_border_style: str = "dim",
) -> Table:
    """Build a two-column grid pairing a score summary panel with its histogram.

    Args:
        score_name: Name of the score; shown upper-cased in the panel title.
        histogram: Histogram data whose ``categories`` and ``counts`` are paired
            up to form the score/count table.
        summary: Text placed inside the summary panel.
        accent_style: Rich style applied to the score name in the panel title.
        summary_border_style: Rich style of the panel border; it is negated
            around the score name in the title.

    Returns:
        A grid table with the summary panel on the left and the histogram
        table on the right, both aligned to the top.
    """
    # Categories become string keys; counts are kept as-is.
    counts_by_score = dict(zip(map(str, histogram.categories), histogram.counts))
    score_histogram = create_rich_histogram_table(
        counts_by_score,
        ("score", "count"),
        name_style=HIST_NAME_STYLE,
        value_style=HIST_VALUE_STYLE,
    )

    panel_title = (
        f"Score Summary: [not {summary_border_style}][{accent_style}]"
        f"{score_name.upper()}[/{accent_style}][/not {summary_border_style}]"
    )
    panel = Panel(
        Text(summary, justify="left"),
        title=panel_title,
        border_style=summary_border_style,
    )

    grid = Table.grid(Column(), Column(), expand=True, padding=(0, 2))
    grid.add_row(
        Align(panel, vertical="top"),
        Align(score_histogram, vertical="top"),
    )
    return grid
@@ -0,0 +1,69 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+ from pydantic import BaseModel, ConfigDict
11
+
12
+ from data_designer.config.utils.io_helpers import serialize_data
13
+
14
+
15
class ConfigBase(BaseModel):
    """Base pydantic model that fixes the model settings used by its subclasses."""

    model_config = ConfigDict(
        # Reject fields that are not declared on the model.
        extra="forbid",
        # Store enum members as their underlying values.
        use_enum_values=True,
        # Permit field types pydantic has no built-in schema for.
        arbitrary_types_allowed=True,
        # No protected attribute-name prefixes.
        protected_namespaces=(),
        # Always generate JSON schemas in validation mode.
        json_schema_mode_override="validation",
    )
23
+
24
+
25
class ExportableConfigBase(ConfigBase):
    """Configuration base class that can export itself as a dict, YAML, or JSON."""

    def to_dict(self) -> dict[str, Any]:
        """Convert the configuration to a dictionary.

        Returns:
            A dictionary representation of the configuration using JSON-compatible
            serialization.
        """
        return self.model_dump(mode="json")

    def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs: Any) -> str | None:
        """Convert the configuration to a YAML string or file.

        Args:
            path: Optional file path to write the YAML to. If None, returns the
                YAML string instead of writing to file.
            indent: Number of spaces for YAML indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to yaml.dump().

        Returns:
            The YAML string if path is None, otherwise None (file is written).
        """
        yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return yaml_str
        # Write UTF-8 explicitly: the platform default encoding is locale-dependent
        # and may be unable to encode non-ASCII content.
        with open(path, "w", encoding="utf-8") as f:
            f.write(yaml_str)
        return None

    def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs: Any) -> str | None:
        """Convert the configuration to a JSON string or file.

        Args:
            path: Optional file path to write the JSON to. If None, returns the
                JSON string instead of writing to file.
            indent: Number of spaces for JSON indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to serialize_data().

        Returns:
            The JSON string if path is None, otherwise None (file is written).
        """
        json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return json_str
        # Write UTF-8 explicitly: the platform default encoding is locale-dependent
        # and may be unable to encode non-ASCII content.
        with open(path, "w", encoding="utf-8") as f:
            f.write(json_str)
        return None