PyPI - data-designer-config - Versions diffs - 0.4.0__py3-none-any.whl - Mend

data-designer-config 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

data_designer/config/__init__.py +149 -0
data_designer/config/_version.py +34 -0
data_designer/config/analysis/__init__.py +2 -0
data_designer/config/analysis/column_profilers.py +159 -0
data_designer/config/analysis/column_statistics.py +421 -0
data_designer/config/analysis/dataset_profiler.py +84 -0
data_designer/config/analysis/utils/errors.py +10 -0
data_designer/config/analysis/utils/reporting.py +192 -0
data_designer/config/base.py +69 -0
data_designer/config/column_configs.py +476 -0
data_designer/config/column_types.py +141 -0
data_designer/config/config_builder.py +595 -0
data_designer/config/data_designer_config.py +40 -0
data_designer/config/dataset_builders.py +13 -0
data_designer/config/dataset_metadata.py +18 -0
data_designer/config/default_model_settings.py +129 -0
data_designer/config/errors.py +24 -0
data_designer/config/interface.py +55 -0
data_designer/config/models.py +486 -0
data_designer/config/preview_results.py +41 -0
data_designer/config/processors.py +148 -0
data_designer/config/run_config.py +56 -0
data_designer/config/sampler_constraints.py +52 -0
data_designer/config/sampler_params.py +639 -0
data_designer/config/seed.py +116 -0
data_designer/config/seed_source.py +84 -0
data_designer/config/seed_source_types.py +19 -0
data_designer/config/testing/__init__.py +6 -0
data_designer/config/testing/fixtures.py +308 -0
data_designer/config/utils/code_lang.py +93 -0
data_designer/config/utils/constants.py +365 -0
data_designer/config/utils/errors.py +21 -0
data_designer/config/utils/info.py +94 -0
data_designer/config/utils/io_helpers.py +258 -0
data_designer/config/utils/misc.py +78 -0
data_designer/config/utils/numerical_helpers.py +30 -0
data_designer/config/utils/type_helpers.py +106 -0
data_designer/config/utils/visualization.py +482 -0
data_designer/config/validator_params.py +94 -0
data_designer/errors.py +7 -0
data_designer/lazy_heavy_imports.py +56 -0
data_designer/logging.py +180 -0
data_designer/plugin_manager.py +78 -0
data_designer/plugins/__init__.py +8 -0
data_designer/plugins/errors.py +15 -0
data_designer/plugins/plugin.py +141 -0
data_designer/plugins/registry.py +88 -0
data_designer_config-0.4.0.dist-info/METADATA +75 -0
data_designer_config-0.4.0.dist-info/RECORD +50 -0
data_designer_config-0.4.0.dist-info/WHEEL +4 -0

data_designer/config/analysis/utils/reporting.py ADDED

@@ -0,0 +1,192 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING
+from rich.align import Align
+from rich.console import Console, Group
+from rich.panel import Panel
+from rich.rule import Rule
+from rich.table import Column, Table
+from rich.text import Text
+from data_designer.config.analysis.utils.errors import AnalysisReportError
+from data_designer.config.column_types import (
+    DataDesignerColumnType,
+    get_column_display_order,
+    get_column_emoji_from_type,
+)
+from data_designer.config.utils.visualization import (
+    ColorPalette,
+    convert_to_row_element,
+    create_rich_histogram_table,
+    pad_console_element,
+)
+if TYPE_CHECKING:
+    from data_designer.config.analysis.column_statistics import CategoricalHistogramData
+    from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
+HEADER_STYLE = "dim"  # header_style for the table columns built in generate_analysis_report
+RULE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"  # style of the opening and closing Rule of the report
+ACCENT_STYLE = f"bold {ColorPalette.BLUE.value}"  # default accent_style for the score name in judge summary titles
+TITLE_STYLE = f"bold {ColorPalette.NVIDIA_GREEN.value}"  # title_style passed to the report tables
+HIST_NAME_STYLE = f"bold {ColorPalette.BLUE.value}"  # name_style passed to create_rich_histogram_table
+HIST_VALUE_STYLE = f"dim {ColorPalette.BLUE.value}"  # value_style passed to create_rich_histogram_table
+class ReportSection(str, Enum):  # report sections selectable via include_sections besides column types
+    OVERVIEW = "overview"  # the "Dataset Overview" table
+    COLUMN_PROFILERS = "column_profilers"  # the sections rendered from analysis.column_profiles
+DEFAULT_INCLUDE_SECTIONS = [  # used by generate_analysis_report when include_sections is None or empty
+    ReportSection.OVERVIEW,
+    ReportSection.COLUMN_PROFILERS,
+] + get_column_display_order()  # followed by whatever get_column_display_order() returns
+def generate_analysis_report(
+    analysis: DatasetProfilerResults,
+    save_path: str | Path | None = None,
+    include_sections: list[ReportSection | DataDesignerColumnType] | None = None,
+) -> None:
+    """Generate an analysis report for dataset profiling results.
+    This function creates a rich-formatted report that displays dataset overview statistics
+    and detailed column statistics organized by column type. The report includes visual
+    elements like tables, rules, and panels to present the analysis results in an
+    easy-to-read format.
+    Args:
+        analysis: The DatasetProfilerResults object containing the analysis data to report on.
+        save_path: Optional path to save the report. If provided, the report will be saved
+                  as either HTML (.html) or SVG (.svg) format. If None, the report will
+                  only be displayed in the console.
+        include_sections: Optional list of sections to include in the report. Choices are
+                  any Data Designer column type, "overview" (the dataset overview section),
+                  and "column_profilers" (all column profilers in one section). If None,
+                  all sections will be included.
+    Raises:
+        AnalysisReportError: If save_path is provided but doesn't have a .html or .svg extension.
+    """
+    render_list = []
+    table_kws = dict(show_lines=True, expand=True, title_style=TITLE_STYLE)
+    include_sections = include_sections or DEFAULT_INCLUDE_SECTIONS
+    title = Rule(title="🎨 Data Designer Dataset Profile", style=RULE_STYLE, end="\n\n")
+    render_list.append(title)
+    if ReportSection.OVERVIEW in include_sections:
+        table = Table(title="Dataset Overview", **table_kws)
+        table.add_column("number of records", header_style=HEADER_STYLE)
+        table.add_column("number of columns", header_style=HEADER_STYLE)
+        table.add_column("percent complete records", header_style=HEADER_STYLE)
+        table.add_row(
+            f"{analysis.num_records:,}",
+            f"{len(analysis.column_statistics):,}",
+            f"{analysis.percent_complete:.1f}%",
+        )
+        render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+    displayed_column_types = set()
+    for column_type in analysis.column_types:
+        if column_type not in include_sections:
+            continue
+        displayed_column_types.add(column_type)
+        column_label = column_type.replace("_", " ").title().replace("Llm", "LLM")
+        table = Table(
+            title=f"{get_column_emoji_from_type(column_type)} {column_label} Columns",
+            **table_kws,
+        )
+        column_stats_list = analysis.get_column_statistics_by_type(column_type)
+        for col in list(column_stats_list[0].create_report_row_data()):
+            if col == "column name":
+                table.add_column(col, header_style=HEADER_STYLE)
+            else:
+                table.add_column(col, justify="right", header_style=HEADER_STYLE)
+        for stats in column_stats_list:
+            table.add_row(*[convert_to_row_element(elem) for elem in stats.create_report_row_data().values()])
+        render_list.append(pad_console_element(table, (1, 0, 1, 0)))
+    if ReportSection.COLUMN_PROFILERS in include_sections:
+        for profile in analysis.column_profiles or []:
+            render_list.append(pad_console_element(profile.create_report_section()))
+    if any("llm" in col_type for col_type in displayed_column_types):
+        footnotes_text = (
+            "1. All token statistics are based on a sample of max(1000, len(dataset)) records.\n"
+            "2. Tokens are calculated using tiktoken's cl100k_base tokenizer."
+        )
+        render_list.append(
+            pad_console_element(
+                Panel(
+                    Text.from_markup(footnotes_text.strip()),
+                    title="Table Notes",
+                    border_style="dim",
+                    padding=(1, 2),
+                )
+            )
+        )
+    render_list.append(Rule(style=RULE_STYLE))
+    console = Console(record=save_path is not None)
+    console.print(Group(*render_list), markup=False)
+    if save_path is not None:
+        save_path = str(save_path)
+        if save_path.endswith(".html"):
+            console.save_html(save_path)
+        elif save_path.endswith(".svg"):
+            console.save_svg(save_path, title="")
+        else:
+            raise AnalysisReportError(
+                f"🛑 The extension of the save path must be either .html or .svg. You provided {save_path}."
+            )
+def create_judge_score_summary_table(
+    score_name: str,
+    histogram: CategoricalHistogramData,
+    summary: str,
+    accent_style: str = ACCENT_STYLE,
+    summary_border_style: str = "dim",
+) -> Table:
+    layout = Table.grid(Column(), Column(), expand=True, padding=(0, 2))
+    histogram_table = create_rich_histogram_table(
+        {str(s): c for s, c in zip(histogram.categories, histogram.counts)},
+        ("score", "count"),
+        name_style=HIST_NAME_STYLE,
+        value_style=HIST_VALUE_STYLE,
+    )
+    summary_panel = Panel(
+        Text(summary, justify="left"),
+        title=(
+            f"Score Summary: [not {summary_border_style}][{accent_style}]"
+            f"{score_name.upper()}[/{accent_style}][/not {summary_border_style}]"
+        ),
+        border_style=summary_border_style,
+    )
+    layout.add_row(
+        Align(summary_panel, vertical="top"),
+        Align(histogram_table, vertical="top"),
+    )
+    return layout

data_designer/config/base.py ADDED

@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from pathlib import Path
+from typing import Any
+import yaml
+from pydantic import BaseModel, ConfigDict
+from data_designer.config.utils.io_helpers import serialize_data
+class ConfigBase(BaseModel):  # base model whose pydantic settings are inherited by subclasses such as ExportableConfigBase
+    model_config = ConfigDict(
+        protected_namespaces=(),  # reserve no field-name prefixes (pydantic protects "model_" by default)
+        use_enum_values=True,  # store enum fields as their underlying values
+        arbitrary_types_allowed=True,  # accept field types pydantic cannot validate natively
+        extra="forbid",  # reject unknown fields instead of ignoring them
+        json_schema_mode_override="validation",  # always generate the validation-mode JSON schema
+    )
+class ExportableConfigBase(ConfigBase):
+    def to_dict(self) -> dict[str, Any]:
+        """Convert the configuration to a dictionary.
+        Returns:
+            A dictionary representation of the configuration using JSON-compatible
+            serialization.
+        """
+        return self.model_dump(mode="json")
+    def to_yaml(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
+        """Convert the configuration to a YAML string or file.
+        Args:
+            path: Optional file path to write the YAML to. If None, returns the
+                YAML string instead of writing to file.
+            indent: Number of spaces for YAML indentation. Defaults to 2.
+            **kwargs: Additional keyword arguments passed to yaml.dump().
+        Returns:
+            The YAML string if path is None, otherwise None (file is written).
+        """
+        yaml_str = yaml.dump(self.to_dict(), indent=indent, **kwargs)
+        if path is None:
+            return yaml_str
+        with open(path, "w") as f:
+            f.write(yaml_str)
+    def to_json(self, path: str | Path | None = None, *, indent: int | None = 2, **kwargs) -> str | None:
+        """Convert the configuration to a JSON string or file.
+        Args:
+            path: Optional file path to write the JSON to. If None, returns the
+                JSON string instead of writing to file.
+            indent: Number of spaces for JSON indentation. Defaults to 2.
+            **kwargs: Additional keyword arguments passed to json.dumps().
+        Returns:
+            The JSON string if path is None, otherwise None (file is written).
+        """
+        json_str = serialize_data(self.to_dict(), indent=indent, **kwargs)
+        if path is None:
+            return json_str
+        with open(path, "w") as f:
+            f.write(json_str)