data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0rc1.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0rc1.dist-info}/entry_points.txt +0 -0
@@ -1,192 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from enum import Enum
7
- from pathlib import Path
8
- from typing import TYPE_CHECKING
9
-
10
- from rich.align import Align
11
- from rich.console import Console, Group
12
- from rich.panel import Panel
13
- from rich.rule import Rule
14
- from rich.table import Column, Table
15
- from rich.text import Text
16
-
17
- from data_designer.config.analysis.utils.errors import AnalysisReportError
18
- from data_designer.config.column_types import (
19
- DataDesignerColumnType,
20
- get_column_display_order,
21
- get_column_emoji_from_type,
22
- )
23
- from data_designer.config.utils.visualization import (
24
- ColorPalette,
25
- convert_to_row_element,
26
- create_rich_histogram_table,
27
- pad_console_element,
28
- )
29
-
30
- if TYPE_CHECKING:
31
- from data_designer.config.analysis.column_statistics import CategoricalHistogramData
32
- from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
33
-
# Rich style strings shared by the report renderers in this module.
# HEADER_STYLE is applied to every table column header.
HEADER_STYLE = "dim"
# RULE_STYLE colors the opening and closing horizontal rules of the report.
RULE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"
# ACCENT_STYLE is the default accent for the judge-score summary panel title.
ACCENT_STYLE = f"bold {ColorPalette.BLUE.value}"
# TITLE_STYLE is passed to each Table as its title_style.
TITLE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"
# HIST_NAME_STYLE / HIST_VALUE_STYLE are forwarded to create_rich_histogram_table
# as name_style / value_style when rendering judge-score histograms.
HIST_NAME_STYLE = f"bold {ColorPalette.BLUE.value}"
HIST_VALUE_STYLE = f"dim {ColorPalette.BLUE.value}"
40
-
41
-
class ReportSection(str, Enum):
    """Named, non-column-type sections that can be toggled in the analysis report.

    Members are plain strings (the class mixes in ``str``), so they compare
    equal to their literal values, e.g. ``ReportSection.OVERVIEW == "overview"``.
    """

    # The "Dataset Overview" table at the top of the report.
    OVERVIEW = "overview"
    # The section that renders every column profiler result.
    COLUMN_PROFILERS = "column_profilers"
45
-
46
-
# Sections rendered when the caller does not pass ``include_sections``:
# the overview, the column profilers, and then every column type in the
# order returned by get_column_display_order().
DEFAULT_INCLUDE_SECTIONS = [
    ReportSection.OVERVIEW,
    ReportSection.COLUMN_PROFILERS,
    *get_column_display_order(),
]
51
-
52
-
def _build_overview_table(analysis: DatasetProfilerResults, table_kws: dict) -> Table:
    """Build the one-row "Dataset Overview" table (records, columns, completeness)."""
    table = Table(title="Dataset Overview", **table_kws)
    for header in ("number of records", "number of columns", "percent complete records"):
        table.add_column(header, header_style=HEADER_STYLE)

    table.add_row(
        f"{analysis.num_records:,}",
        f"{len(analysis.column_statistics):,}",
        f"{analysis.percent_complete:.1f}%",
    )
    return table


def _build_column_type_table(column_type: str, column_stats_list: list, table_kws: dict) -> Table:
    """Build the statistics table for one column type; ``column_stats_list`` must be non-empty."""
    column_label = column_type.replace("_", " ").title().replace("Llm", "LLM")
    table = Table(
        title=f"{get_column_emoji_from_type(column_type)} {column_label} Columns",
        **table_kws,
    )

    # The first statistics object defines the table headers; every row below
    # is expected to yield values in the same order.
    for header in column_stats_list[0].create_report_row_data():
        if header == "column name":
            table.add_column(header, header_style=HEADER_STYLE)
        else:
            table.add_column(header, justify="right", header_style=HEADER_STYLE)

    for stats in column_stats_list:
        table.add_row(*[convert_to_row_element(elem) for elem in stats.create_report_row_data().values()])

    return table


def generate_analysis_report(
    analysis: DatasetProfilerResults,
    save_path: str | Path | None = None,
    include_sections: list[ReportSection | DataDesignerColumnType] | None = None,
) -> None:
    """Generate an analysis report for dataset profiling results.

    This function creates a rich-formatted report that displays dataset overview statistics
    and detailed column statistics organized by column type. The report includes visual
    elements like tables, rules, and panels to present the analysis results in an
    easy-to-read format.

    Args:
        analysis: The DatasetProfilerResults object containing the analysis data to report on.
        save_path: Optional path to save the report. If provided, the report will be saved
            as either HTML (.html) or SVG (.svg) format; the extension is matched
            case-insensitively. If None, the report will only be displayed in the console.
        include_sections: Optional list of sections to include in the report. Choices are
            any Data Designer column type, "overview" (the dataset overview section),
            and "column_profilers" (all column profilers in one section). If None,
            all sections will be included.

    Raises:
        AnalysisReportError: If save_path is provided but doesn't have a .html or .svg extension.
            The check runs before anything is rendered, so nothing is printed in that case.
    """
    # Validate the destination up front so an unusable path fails before the
    # report is rendered and printed, rather than after all the work is done.
    save_path_str = None if save_path is None else str(save_path)
    if save_path_str is not None and not save_path_str.lower().endswith((".html", ".svg")):
        raise AnalysisReportError(
            f"🛑 The extension of the save path must be either .html or .svg. You provided {save_path_str}."
        )

    render_list = []
    table_kws = dict(show_lines=True, expand=True, title_style=TITLE_STYLE)
    include_sections = include_sections or DEFAULT_INCLUDE_SECTIONS

    render_list.append(Rule(title="🎨 Data Designer Dataset Profile", style=RULE_STYLE, end="\n\n"))

    if ReportSection.OVERVIEW in include_sections:
        render_list.append(pad_console_element(_build_overview_table(analysis, table_kws), (1, 0, 1, 0)))

    displayed_column_types = set()
    for column_type in analysis.column_types:
        if column_type not in include_sections:
            continue

        column_stats_list = analysis.get_column_statistics_by_type(column_type)
        if not column_stats_list:
            # Nothing to tabulate for this type; skip it instead of indexing
            # into an empty list.
            continue

        displayed_column_types.add(column_type)
        table = _build_column_type_table(column_type, column_stats_list, table_kws)
        render_list.append(pad_console_element(table, (1, 0, 1, 0)))

    if ReportSection.COLUMN_PROFILERS in include_sections:
        for profile in analysis.column_profiles or []:
            render_list.append(pad_console_element(profile.create_report_section()))

    # Token-statistics footnotes are only shown when an LLM column table was rendered.
    if any("llm" in col_type for col_type in displayed_column_types):
        footnotes_text = (
            "1. All token statistics are based on a sample of max(1000, len(dataset)) records.\n"
            "2. Tokens are calculated using tiktoken's cl100k_base tokenizer."
        )

        render_list.append(
            pad_console_element(
                Panel(
                    Text.from_markup(footnotes_text.strip()),
                    title="Table Notes",
                    border_style="dim",
                    padding=(1, 2),
                )
            )
        )

    render_list.append(Rule(style=RULE_STYLE))

    # Recording is only enabled when the output will be exported afterwards.
    console = Console(record=save_path_str is not None)
    console.print(Group(*render_list), markup=False)

    if save_path_str is not None:
        # The extension was validated above, so it is either .html or .svg here.
        if save_path_str.lower().endswith(".html"):
            console.save_html(save_path_str)
        else:
            console.save_svg(save_path_str, title="")
160
-
161
-
def create_judge_score_summary_table(
    score_name: str,
    histogram: CategoricalHistogramData,
    summary: str,
    accent_style: str = ACCENT_STYLE,
    summary_border_style: str = "dim",
) -> Table:
    """Lay out a judge-score summary panel next to its score histogram.

    Args:
        score_name: Name of the score; shown upper-cased in the panel title.
        histogram: Histogram whose ``categories`` and ``counts`` are paired
            position by position to form the score/count table.
        summary: Text shown inside the summary panel.
        accent_style: Rich style applied to the score name in the panel title.
        summary_border_style: Rich style of the panel border; it is negated
            around the score name in the title via ``[not ...]`` markup.

    Returns:
        A two-column grid with the summary panel on the left and the
        histogram table on the right, both aligned to the top.
    """
    # Category labels are stringified so they can serve as histogram keys.
    counts_by_score = dict(zip(map(str, histogram.categories), histogram.counts))
    score_histogram = create_rich_histogram_table(
        counts_by_score,
        ("score", "count"),
        name_style=HIST_NAME_STYLE,
        value_style=HIST_VALUE_STYLE,
    )

    panel_title = (
        f"Score Summary: [not {summary_border_style}][{accent_style}]"
        f"{score_name.upper()}[/{accent_style}][/not {summary_border_style}]"
    )
    score_panel = Panel(
        Text(summary, justify="left"),
        title=panel_title,
        border_style=summary_border_style,
    )

    grid = Table.grid(Column(), Column(), expand=True, padding=(0, 2))
    grid.add_row(
        Align(score_panel, vertical="top"),
        Align(score_histogram, vertical="top"),
    )
    return grid
@@ -1,69 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from pathlib import Path
7
- from typing import Any
8
-
9
- import yaml
10
- from pydantic import BaseModel, ConfigDict
11
-
12
- from data_designer.config.utils.io_helpers import serialize_data
13
-
14
-
class ConfigBase(BaseModel):
    """Pydantic base model carrying the shared ``model_config`` for subclasses."""

    model_config = ConfigDict(
        # Empty tuple: no field-name prefixes are treated as protected.
        protected_namespaces=(),
        # Populate enum-typed fields with the member's value rather than the member.
        use_enum_values=True,
        # Permit field types that pydantic has no built-in validator for.
        arbitrary_types_allowed=True,
        # Reject unknown fields instead of ignoring them.
        extra="forbid",
        # Always generate JSON schemas in "validation" mode.
        json_schema_mode_override="validation",
    )
23
-
24
-
class ExportableConfigBase(ConfigBase):
    """Configuration base class that can export itself as a dict, YAML, or JSON."""

    def to_dict(self) -> dict[str, Any]:
        """Convert the configuration to a dictionary.

        Returns:
            A dictionary representation of the configuration using JSON-compatible
            serialization.
        """
        return self.model_dump(mode="json")

    def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
        """Convert the configuration to a YAML string or file.

        Args:
            path: Optional file path to write the YAML to. If None, returns the
                YAML string instead of writing to file.
            indent: Number of spaces for YAML indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to yaml.dump().

        Returns:
            The YAML string if path is None, otherwise None (file is written).
        """
        yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return yaml_str
        # Write as UTF-8 explicitly so the output does not depend on the
        # platform's locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            f.write(yaml_str)
        return None

    def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
        """Convert the configuration to a JSON string or file.

        Args:
            path: Optional file path to write the JSON to. If None, returns the
                JSON string instead of writing to file.
            indent: Number of spaces for JSON indentation. Defaults to 2.
            **kwargs: Additional keyword arguments passed to serialize_data().

        Returns:
            The JSON string if path is None, otherwise None (file is written).
        """
        json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
        if path is None:
            return json_str
        # Write as UTF-8 explicitly so the output does not depend on the
        # platform's locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            f.write(json_str)
        return None