data-designer 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/base.py +1 -0
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/dataset_metadata.py +18 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +9 -1
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +19 -5
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +9 -3
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +19 -11
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +13 -47
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +9 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +34 -35
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +13 -4
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +35 -25
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +16 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +20 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +36 -39
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +9 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/METADATA +19 -4
- data_designer-0.3.5.dist-info/RECORD +196 -0
- data_designer-0.3.3.dist-info/RECORD +0 -193
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/WHEEL +0 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
|
|
5
6
|
from typing_extensions import TypeAlias
|
|
6
7
|
|
|
@@ -15,7 +16,7 @@ from data_designer.config.column_configs import (
|
|
|
15
16
|
SeedDatasetColumnConfig,
|
|
16
17
|
ValidationColumnConfig,
|
|
17
18
|
)
|
|
18
|
-
from data_designer.config.errors import
|
|
19
|
+
from data_designer.config.errors import InvalidConfigError
|
|
19
20
|
from data_designer.config.sampler_params import SamplerType
|
|
20
21
|
from data_designer.config.utils.type_helpers import (
|
|
21
22
|
SAMPLER_PARAMS,
|
|
@@ -45,22 +46,6 @@ DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
|
|
|
45
46
|
discriminator_field_name="column_type",
|
|
46
47
|
)
|
|
47
48
|
|
|
48
|
-
COLUMN_TYPE_EMOJI_MAP = {
|
|
49
|
-
"general": "⚛️", # possible analysis column type
|
|
50
|
-
DataDesignerColumnType.EXPRESSION: "🧩",
|
|
51
|
-
DataDesignerColumnType.LLM_CODE: "💻",
|
|
52
|
-
DataDesignerColumnType.LLM_JUDGE: "⚖️",
|
|
53
|
-
DataDesignerColumnType.LLM_STRUCTURED: "🗂️",
|
|
54
|
-
DataDesignerColumnType.LLM_TEXT: "📝",
|
|
55
|
-
DataDesignerColumnType.SEED_DATASET: "🌱",
|
|
56
|
-
DataDesignerColumnType.SAMPLER: "🎲",
|
|
57
|
-
DataDesignerColumnType.VALIDATION: "🔍",
|
|
58
|
-
DataDesignerColumnType.EMBEDDING: "🧬",
|
|
59
|
-
}
|
|
60
|
-
COLUMN_TYPE_EMOJI_MAP.update(
|
|
61
|
-
{DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()}
|
|
62
|
-
)
|
|
63
|
-
|
|
64
49
|
|
|
65
50
|
def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
|
|
66
51
|
"""Create a Data Designer column config object from kwargs.
|
|
@@ -74,27 +59,20 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType
|
|
|
74
59
|
Data Designer column object of the appropriate type.
|
|
75
60
|
"""
|
|
76
61
|
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
77
|
-
|
|
78
|
-
return LLMTextColumnConfig(name=name, **kwargs)
|
|
79
|
-
if column_type == DataDesignerColumnType.LLM_CODE:
|
|
80
|
-
return LLMCodeColumnConfig(name=name, **kwargs)
|
|
81
|
-
if column_type == DataDesignerColumnType.LLM_STRUCTURED:
|
|
82
|
-
return LLMStructuredColumnConfig(name=name, **kwargs)
|
|
83
|
-
if column_type == DataDesignerColumnType.LLM_JUDGE:
|
|
84
|
-
return LLMJudgeColumnConfig(name=name, **kwargs)
|
|
85
|
-
if column_type == DataDesignerColumnType.VALIDATION:
|
|
86
|
-
return ValidationColumnConfig(name=name, **kwargs)
|
|
87
|
-
if column_type == DataDesignerColumnType.EXPRESSION:
|
|
88
|
-
return ExpressionColumnConfig(name=name, **kwargs)
|
|
62
|
+
config_cls = get_column_config_cls_from_type(column_type)
|
|
89
63
|
if column_type == DataDesignerColumnType.SAMPLER:
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
64
|
+
kwargs = _resolve_sampler_kwargs(name, kwargs)
|
|
65
|
+
return config_cls(name=name, **kwargs)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_column_config_cls_from_type(column_type: DataDesignerColumnType) -> type[ColumnConfigT]:
|
|
69
|
+
"""Get the column config class for a column type."""
|
|
70
|
+
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
71
|
+
if column_type in _COLUMN_TYPE_CONFIG_CLS_MAP:
|
|
72
|
+
return _COLUMN_TYPE_CONFIG_CLS_MAP[column_type]
|
|
95
73
|
if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value):
|
|
96
|
-
return plugin.config_cls
|
|
97
|
-
raise
|
|
74
|
+
return plugin.config_cls
|
|
75
|
+
raise InvalidConfigError(f"🛑 {column_type} is not a valid column type.")
|
|
98
76
|
|
|
99
77
|
|
|
100
78
|
def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
@@ -114,6 +92,12 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
|
114
92
|
return display_order
|
|
115
93
|
|
|
116
94
|
|
|
95
|
+
def get_column_emoji_from_type(column_type: DataDesignerColumnType) -> str:
|
|
96
|
+
"""Get the emoji for a column type."""
|
|
97
|
+
config_cls = get_column_config_cls_from_type(resolve_string_enum(column_type, DataDesignerColumnType))
|
|
98
|
+
return config_cls.get_column_emoji()
|
|
99
|
+
|
|
100
|
+
|
|
117
101
|
def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
118
102
|
if "sampler_type" not in kwargs:
|
|
119
103
|
raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")
|
|
@@ -142,3 +126,16 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
|
142
126
|
"params": params,
|
|
143
127
|
**{k: v for k, v in kwargs.items() if k not in ["sampler_type", "params"]},
|
|
144
128
|
}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
_COLUMN_TYPE_CONFIG_CLS_MAP = {
|
|
132
|
+
DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
|
|
133
|
+
DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
|
|
134
|
+
DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
|
|
135
|
+
DataDesignerColumnType.LLM_JUDGE: LLMJudgeColumnConfig,
|
|
136
|
+
DataDesignerColumnType.VALIDATION: ValidationColumnConfig,
|
|
137
|
+
DataDesignerColumnType.EXPRESSION: ExpressionColumnConfig,
|
|
138
|
+
DataDesignerColumnType.SAMPLER: SamplerColumnConfig,
|
|
139
|
+
DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig,
|
|
140
|
+
DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig,
|
|
141
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DatasetMetadata(BaseModel):
|
|
8
|
+
"""Metadata about a generated dataset.
|
|
9
|
+
|
|
10
|
+
This object is created by the engine and passed to results objects for use
|
|
11
|
+
in visualization and other client-side utilities. It is designed to be
|
|
12
|
+
serializable so it can be sent over the wire in a client-server architecture.
|
|
13
|
+
|
|
14
|
+
Attributes:
|
|
15
|
+
seed_column_names: Names of columns from the seed dataset. Empty list if no seed dataset.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
seed_column_names: list[str] = []
|
data_designer/config/errors.py
CHANGED
data_designer/config/exports.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
|
|
5
7
|
from data_designer.config.column_configs import (
|
|
6
8
|
EmbeddingColumnConfig,
|
|
@@ -6,13 +6,14 @@ from __future__ import annotations
|
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
|
|
8
8
|
|
|
9
|
-
import pandas as pd
|
|
10
|
-
|
|
11
9
|
from data_designer.config.models import ModelConfig, ModelProvider
|
|
12
10
|
from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
|
|
13
11
|
from data_designer.config.utils.info import InterfaceInfo
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
14
13
|
|
|
15
14
|
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
16
17
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
17
18
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
18
19
|
from data_designer.config.preview_results import PreviewResults
|
data_designer/config/models.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from abc import ABC, abstractmethod
|
|
6
8
|
from enum import Enum
|
|
7
9
|
from pathlib import Path
|
|
8
|
-
from typing import Annotated, Any, Generic, Literal, TypeVar
|
|
10
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar
|
|
9
11
|
|
|
10
|
-
import numpy as np
|
|
11
12
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
12
13
|
from typing_extensions import Self, TypeAlias
|
|
13
14
|
|
|
@@ -20,6 +21,10 @@ from data_designer.config.utils.constants import (
|
|
|
20
21
|
MIN_TOP_P,
|
|
21
22
|
)
|
|
22
23
|
from data_designer.config.utils.io_helpers import smart_load_yaml
|
|
24
|
+
from data_designer.lazy_heavy_imports import np
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
import numpy as np
|
|
23
28
|
|
|
24
29
|
logger = logging.getLogger(__name__)
|
|
25
30
|
|
|
@@ -3,11 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
7
|
|
|
8
8
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
9
9
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
10
|
+
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
10
11
|
from data_designer.config.utils.visualization import WithRecordSamplerMixin
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
11
16
|
|
|
12
17
|
|
|
13
18
|
class PreviewResults(WithRecordSamplerMixin):
|
|
@@ -15,6 +20,7 @@ class PreviewResults(WithRecordSamplerMixin):
|
|
|
15
20
|
self,
|
|
16
21
|
*,
|
|
17
22
|
config_builder: DataDesignerConfigBuilder,
|
|
23
|
+
dataset_metadata: DatasetMetadata | None = None,
|
|
18
24
|
dataset: pd.DataFrame | None = None,
|
|
19
25
|
analysis: DatasetProfilerResults | None = None,
|
|
20
26
|
processor_artifacts: dict[str, list[str] | str] | None = None,
|
|
@@ -23,6 +29,7 @@ class PreviewResults(WithRecordSamplerMixin):
|
|
|
23
29
|
|
|
24
30
|
Args:
|
|
25
31
|
config_builder: Data Designer configuration builder.
|
|
32
|
+
dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
|
|
26
33
|
dataset: Dataset of the preview run.
|
|
27
34
|
analysis: Analysis of the preview run.
|
|
28
35
|
processor_artifacts: Artifacts generated by the processors.
|
|
@@ -30,4 +37,5 @@ class PreviewResults(WithRecordSamplerMixin):
|
|
|
30
37
|
self.dataset: pd.DataFrame | None = dataset
|
|
31
38
|
self.analysis: DatasetProfilerResults | None = analysis
|
|
32
39
|
self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
|
|
40
|
+
self.dataset_metadata: DatasetMetadata | None = dataset_metadata
|
|
33
41
|
self._config_builder = config_builder
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from pydantic import Field, model_validator
|
|
5
7
|
from typing_extensions import Self
|
|
6
8
|
|
|
@@ -14,21 +16,33 @@ class RunConfig(ConfigBase):
|
|
|
14
16
|
part of the dataset configuration itself.
|
|
15
17
|
|
|
16
18
|
Attributes:
|
|
17
|
-
disable_early_shutdown: If True, disables early
|
|
18
|
-
will continue regardless of error rate
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
|
|
20
|
+
Generation will continue regardless of error rate, and the early-shutdown exception
|
|
21
|
+
will never be raised. Error counts and summaries are still collected. Default is False.
|
|
22
|
+
shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
|
|
23
|
+
early shutdown is enabled. Default is 0.5.
|
|
21
24
|
shutdown_error_window: Minimum number of completed tasks before error rate
|
|
22
25
|
monitoring begins. Must be >= 0. Default is 10.
|
|
26
|
+
buffer_size: Number of records to process in each batch during dataset generation.
|
|
27
|
+
A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
|
|
28
|
+
to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
|
|
29
|
+
max_conversation_restarts: Maximum number of full conversation restarts permitted when
|
|
30
|
+
generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
|
|
31
|
+
max_conversation_correction_steps: Maximum number of correction rounds permitted within a
|
|
32
|
+
single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
|
|
33
|
+
Default is 0.
|
|
23
34
|
"""
|
|
24
35
|
|
|
25
36
|
disable_early_shutdown: bool = False
|
|
26
37
|
shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
|
|
27
38
|
shutdown_error_window: int = Field(default=10, ge=0)
|
|
39
|
+
buffer_size: int = Field(default=1000, gt=0)
|
|
40
|
+
max_conversation_restarts: int = Field(default=5, ge=0)
|
|
41
|
+
max_conversation_correction_steps: int = Field(default=0, ge=0)
|
|
28
42
|
|
|
29
43
|
@model_validator(mode="after")
|
|
30
44
|
def normalize_shutdown_settings(self) -> Self:
|
|
31
|
-
"""
|
|
45
|
+
"""Normalize shutdown settings for compatibility."""
|
|
32
46
|
if self.disable_early_shutdown:
|
|
33
47
|
self.shutdown_error_rate = 1.0
|
|
34
48
|
return self
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from enum import Enum
|
|
5
|
-
from typing import Literal
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
6
8
|
|
|
7
|
-
import pandas as pd
|
|
8
9
|
from pydantic import Field, field_validator, model_validator
|
|
9
10
|
from typing_extensions import Self, TypeAlias
|
|
10
11
|
|
|
@@ -16,6 +17,10 @@ from data_designer.config.utils.constants import (
|
|
|
16
17
|
MAX_AGE,
|
|
17
18
|
MIN_AGE,
|
|
18
19
|
)
|
|
20
|
+
from data_designer.lazy_heavy_imports import pd
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
import pandas as pd
|
|
19
24
|
|
|
20
25
|
|
|
21
26
|
class SamplerType(str, Enum):
|
data_designer/config/seed.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC
|
|
5
|
-
from typing import Literal
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
6
8
|
|
|
7
|
-
import pandas as pd
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
10
|
+
from pydantic.json_schema import SkipJsonSchema
|
|
9
11
|
from typing_extensions import Self
|
|
10
12
|
|
|
11
13
|
from data_designer.config.utils.io_helpers import (
|
|
@@ -13,6 +15,10 @@ from data_designer.config.utils.io_helpers import (
|
|
|
13
15
|
validate_dataset_file_path,
|
|
14
16
|
validate_path_contains_files_of_type,
|
|
15
17
|
)
|
|
18
|
+
from data_designer.lazy_heavy_imports import pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import pandas as pd
|
|
16
22
|
|
|
17
23
|
|
|
18
24
|
class SeedSource(BaseModel, ABC):
|
|
@@ -68,7 +74,7 @@ class DataFrameSeedSource(SeedSource):
|
|
|
68
74
|
|
|
69
75
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
70
76
|
|
|
71
|
-
df: pd.DataFrame = Field(
|
|
77
|
+
df: SkipJsonSchema[pd.DataFrame] = Field(
|
|
72
78
|
...,
|
|
73
79
|
exclude=True,
|
|
74
80
|
description=(
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from enum import Enum
|
|
6
8
|
from typing import Literal, TypeVar
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
import os
|
|
@@ -8,13 +10,16 @@ from datetime import date, datetime, timedelta
|
|
|
8
10
|
from decimal import Decimal
|
|
9
11
|
from numbers import Number
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import Any
|
|
13
|
+
from typing import TYPE_CHECKING, Any
|
|
12
14
|
|
|
13
|
-
import numpy as np
|
|
14
|
-
import pandas as pd
|
|
15
15
|
import yaml
|
|
16
16
|
|
|
17
17
|
from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError
|
|
18
|
+
from data_designer.lazy_heavy_imports import np, pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
18
23
|
|
|
19
24
|
logger = logging.getLogger(__name__)
|
|
20
25
|
|
|
@@ -48,8 +48,8 @@ def can_run_data_designer_locally() -> bool:
|
|
|
48
48
|
return True
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def
|
|
52
|
-
"""Extract all keywords from a valid
|
|
51
|
+
def extract_keywords_from_jinja2_template(template: str) -> set[str]:
|
|
52
|
+
"""Extract all keywords from a valid Jinja2 template."""
|
|
53
53
|
with template_error_handler():
|
|
54
54
|
ast = ImmutableSandboxedEnvironment().parse(template)
|
|
55
55
|
keywords = set(meta.find_undeclared_variables(ast))
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import inspect
|
|
5
7
|
from enum import Enum
|
|
6
8
|
from typing import Any, Literal, get_args, get_origin
|
|
@@ -10,8 +10,6 @@ from enum import Enum
|
|
|
10
10
|
from functools import cached_property
|
|
11
11
|
from typing import TYPE_CHECKING, Any
|
|
12
12
|
|
|
13
|
-
import numpy as np
|
|
14
|
-
import pandas as pd
|
|
15
13
|
from rich.console import Console, Group
|
|
16
14
|
from rich.padding import Padding
|
|
17
15
|
from rich.panel import Panel
|
|
@@ -28,9 +26,14 @@ from data_designer.config.sampler_params import SamplerType
|
|
|
28
26
|
from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
|
|
29
27
|
from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
|
|
30
28
|
from data_designer.config.utils.errors import DatasetSampleDisplayError
|
|
29
|
+
from data_designer.lazy_heavy_imports import np, pd
|
|
31
30
|
|
|
32
31
|
if TYPE_CHECKING:
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
33
35
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
36
|
+
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
34
37
|
|
|
35
38
|
|
|
36
39
|
console = Console()
|
|
@@ -57,6 +60,7 @@ class ColorPalette(str, Enum):
|
|
|
57
60
|
|
|
58
61
|
class WithRecordSamplerMixin:
|
|
59
62
|
_display_cycle_index: int = 0
|
|
63
|
+
dataset_metadata: DatasetMetadata | None
|
|
60
64
|
|
|
61
65
|
@cached_property
|
|
62
66
|
def _record_sampler_dataset(self) -> pd.DataFrame:
|
|
@@ -79,22 +83,22 @@ class WithRecordSamplerMixin:
|
|
|
79
83
|
self,
|
|
80
84
|
index: int | None = None,
|
|
81
85
|
*,
|
|
82
|
-
hide_seed_columns: bool = False,
|
|
83
86
|
syntax_highlighting_theme: str = "dracula",
|
|
84
87
|
background_color: str | None = None,
|
|
85
88
|
processors_to_display: list[str] | None = None,
|
|
89
|
+
hide_seed_columns: bool = False,
|
|
86
90
|
) -> None:
|
|
87
91
|
"""Display a sample record from the Data Designer dataset preview.
|
|
88
92
|
|
|
89
93
|
Args:
|
|
90
94
|
index: Index of the record to display. If None, the next record will be displayed.
|
|
91
95
|
This is useful for running the cell in a notebook multiple times.
|
|
92
|
-
hide_seed_columns: If True, the columns from the seed dataset (if any) will not be displayed.
|
|
93
96
|
syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
|
|
94
97
|
documentation from `rich` for information about available themes.
|
|
95
98
|
background_color: Background color to use for the record. See the `Syntax`
|
|
96
99
|
documentation from `rich` for information about available background colors.
|
|
97
100
|
processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
|
|
101
|
+
hide_seed_columns: If True, seed columns will not be displayed separately.
|
|
98
102
|
"""
|
|
99
103
|
i = index or self._display_cycle_index
|
|
100
104
|
|
|
@@ -120,14 +124,18 @@ class WithRecordSamplerMixin:
|
|
|
120
124
|
else:
|
|
121
125
|
processor_data_to_display[processor] = self.processor_artifacts[processor]
|
|
122
126
|
|
|
127
|
+
seed_column_names = (
|
|
128
|
+
None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
|
|
129
|
+
)
|
|
130
|
+
|
|
123
131
|
display_sample_record(
|
|
124
132
|
record=record,
|
|
125
133
|
processor_data_to_display=processor_data_to_display,
|
|
126
134
|
config_builder=self._config_builder,
|
|
127
135
|
background_color=background_color,
|
|
128
136
|
syntax_highlighting_theme=syntax_highlighting_theme,
|
|
129
|
-
hide_seed_columns=hide_seed_columns,
|
|
130
137
|
record_index=i,
|
|
138
|
+
seed_column_names=seed_column_names,
|
|
131
139
|
)
|
|
132
140
|
if index is None:
|
|
133
141
|
self._display_cycle_index = (self._display_cycle_index + 1) % num_records
|
|
@@ -160,7 +168,7 @@ def display_sample_record(
|
|
|
160
168
|
background_color: str | None = None,
|
|
161
169
|
syntax_highlighting_theme: str = "dracula",
|
|
162
170
|
record_index: int | None = None,
|
|
163
|
-
|
|
171
|
+
seed_column_names: list[str] | None = None,
|
|
164
172
|
):
|
|
165
173
|
if isinstance(record, (dict, pd.Series)):
|
|
166
174
|
record = pd.DataFrame([record]).iloc[0]
|
|
@@ -179,14 +187,14 @@ def display_sample_record(
|
|
|
179
187
|
render_list = []
|
|
180
188
|
table_kws = dict(show_lines=True, expand=True)
|
|
181
189
|
|
|
182
|
-
|
|
183
|
-
if
|
|
190
|
+
# Display seed columns if seed_column_names is provided and not empty
|
|
191
|
+
if seed_column_names:
|
|
184
192
|
table = Table(title="Seed Columns", **table_kws)
|
|
185
193
|
table.add_column("Name")
|
|
186
194
|
table.add_column("Value")
|
|
187
|
-
for
|
|
188
|
-
if
|
|
189
|
-
table.add_row(
|
|
195
|
+
for col_name in seed_column_names:
|
|
196
|
+
if col_name in record.index:
|
|
197
|
+
table.add_row(col_name, convert_to_row_element(record[col_name]))
|
|
190
198
|
render_list.append(pad_console_element(table))
|
|
191
199
|
|
|
192
200
|
non_code_columns = (
|
|
@@ -5,15 +5,19 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
|
-
import pandas as pd
|
|
10
10
|
from pydantic import BaseModel, model_validator
|
|
11
11
|
from typing_extensions import Self
|
|
12
12
|
|
|
13
13
|
from data_designer.config.base import ConfigBase
|
|
14
14
|
from data_designer.config.column_configs import SingleColumnConfig
|
|
15
15
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
16
|
-
from data_designer.engine.configurable_task import ConfigurableTask,
|
|
16
|
+
from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
|
|
17
|
+
from data_designer.lazy_heavy_imports import pd
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import pandas as pd
|
|
17
21
|
|
|
18
22
|
logger = logging.getLogger(__name__)
|
|
19
23
|
|
|
@@ -32,17 +36,14 @@ class ColumnConfigWithDataFrame(ConfigBase):
|
|
|
32
36
|
return (self.column_config, self.df)
|
|
33
37
|
|
|
34
38
|
|
|
35
|
-
class ColumnProfilerMetadata(ConfigurableTaskMetadata):
|
|
36
|
-
applicable_column_types: list[DataDesignerColumnType]
|
|
37
|
-
|
|
38
|
-
|
|
39
39
|
class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
|
|
40
40
|
@staticmethod
|
|
41
41
|
@abstractmethod
|
|
42
|
-
def
|
|
42
|
+
def get_applicable_column_types() -> list[DataDesignerColumnType]:
|
|
43
|
+
"""Returns a list of column types that this profiler can be applied to during dataset profiling."""
|
|
43
44
|
|
|
44
45
|
@abstractmethod
|
|
45
46
|
def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel: ...
|
|
46
47
|
|
|
47
48
|
def _initialize(self) -> None:
|
|
48
|
-
logger.info(f"💫 Initializing column profiler: '{self.
|
|
49
|
+
logger.info(f"💫 Initializing column profiler: '{self.name}'")
|