data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +7 -3
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +2 -0
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +7 -2
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +8 -4
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +11 -37
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +8 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +22 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +8 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +23 -17
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +5 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
- data_designer-0.3.6.dist-info/RECORD +196 -0
- data_designer-0.3.4.dist-info/RECORD +0 -194
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
|
|
5
6
|
from typing_extensions import TypeAlias
|
|
6
7
|
|
|
@@ -15,7 +16,7 @@ from data_designer.config.column_configs import (
|
|
|
15
16
|
SeedDatasetColumnConfig,
|
|
16
17
|
ValidationColumnConfig,
|
|
17
18
|
)
|
|
18
|
-
from data_designer.config.errors import
|
|
19
|
+
from data_designer.config.errors import InvalidConfigError
|
|
19
20
|
from data_designer.config.sampler_params import SamplerType
|
|
20
21
|
from data_designer.config.utils.type_helpers import (
|
|
21
22
|
SAMPLER_PARAMS,
|
|
@@ -45,22 +46,6 @@ DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
|
|
|
45
46
|
discriminator_field_name="column_type",
|
|
46
47
|
)
|
|
47
48
|
|
|
48
|
-
COLUMN_TYPE_EMOJI_MAP = {
|
|
49
|
-
"general": "⚛️", # possible analysis column type
|
|
50
|
-
DataDesignerColumnType.EXPRESSION: "🧩",
|
|
51
|
-
DataDesignerColumnType.LLM_CODE: "💻",
|
|
52
|
-
DataDesignerColumnType.LLM_JUDGE: "⚖️",
|
|
53
|
-
DataDesignerColumnType.LLM_STRUCTURED: "🗂️",
|
|
54
|
-
DataDesignerColumnType.LLM_TEXT: "📝",
|
|
55
|
-
DataDesignerColumnType.SEED_DATASET: "🌱",
|
|
56
|
-
DataDesignerColumnType.SAMPLER: "🎲",
|
|
57
|
-
DataDesignerColumnType.VALIDATION: "🔍",
|
|
58
|
-
DataDesignerColumnType.EMBEDDING: "🧬",
|
|
59
|
-
}
|
|
60
|
-
COLUMN_TYPE_EMOJI_MAP.update(
|
|
61
|
-
{DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()}
|
|
62
|
-
)
|
|
63
|
-
|
|
64
49
|
|
|
65
50
|
def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
|
|
66
51
|
"""Create a Data Designer column config object from kwargs.
|
|
@@ -74,27 +59,20 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType
|
|
|
74
59
|
Data Designer column object of the appropriate type.
|
|
75
60
|
"""
|
|
76
61
|
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
77
|
-
|
|
78
|
-
return LLMTextColumnConfig(name=name, **kwargs)
|
|
79
|
-
if column_type == DataDesignerColumnType.LLM_CODE:
|
|
80
|
-
return LLMCodeColumnConfig(name=name, **kwargs)
|
|
81
|
-
if column_type == DataDesignerColumnType.LLM_STRUCTURED:
|
|
82
|
-
return LLMStructuredColumnConfig(name=name, **kwargs)
|
|
83
|
-
if column_type == DataDesignerColumnType.LLM_JUDGE:
|
|
84
|
-
return LLMJudgeColumnConfig(name=name, **kwargs)
|
|
85
|
-
if column_type == DataDesignerColumnType.VALIDATION:
|
|
86
|
-
return ValidationColumnConfig(name=name, **kwargs)
|
|
87
|
-
if column_type == DataDesignerColumnType.EXPRESSION:
|
|
88
|
-
return ExpressionColumnConfig(name=name, **kwargs)
|
|
62
|
+
config_cls = get_column_config_cls_from_type(column_type)
|
|
89
63
|
if column_type == DataDesignerColumnType.SAMPLER:
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
64
|
+
kwargs = _resolve_sampler_kwargs(name, kwargs)
|
|
65
|
+
return config_cls(name=name, **kwargs)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_column_config_cls_from_type(column_type: DataDesignerColumnType) -> type[ColumnConfigT]:
|
|
69
|
+
"""Get the column config class for a column type."""
|
|
70
|
+
column_type = resolve_string_enum(column_type, DataDesignerColumnType)
|
|
71
|
+
if column_type in _COLUMN_TYPE_CONFIG_CLS_MAP:
|
|
72
|
+
return _COLUMN_TYPE_CONFIG_CLS_MAP[column_type]
|
|
95
73
|
if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value):
|
|
96
|
-
return plugin.config_cls
|
|
97
|
-
raise
|
|
74
|
+
return plugin.config_cls
|
|
75
|
+
raise InvalidConfigError(f"🛑 {column_type} is not a valid column type.")
|
|
98
76
|
|
|
99
77
|
|
|
100
78
|
def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
@@ -114,6 +92,12 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
|
|
|
114
92
|
return display_order
|
|
115
93
|
|
|
116
94
|
|
|
95
|
+
def get_column_emoji_from_type(column_type: DataDesignerColumnType) -> str:
|
|
96
|
+
"""Get the emoji for a column type."""
|
|
97
|
+
config_cls = get_column_config_cls_from_type(resolve_string_enum(column_type, DataDesignerColumnType))
|
|
98
|
+
return config_cls.get_column_emoji()
|
|
99
|
+
|
|
100
|
+
|
|
117
101
|
def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
118
102
|
if "sampler_type" not in kwargs:
|
|
119
103
|
raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")
|
|
@@ -142,3 +126,16 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
|
|
|
142
126
|
"params": params,
|
|
143
127
|
**{k: v for k, v in kwargs.items() if k not in ["sampler_type", "params"]},
|
|
144
128
|
}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
_COLUMN_TYPE_CONFIG_CLS_MAP = {
|
|
132
|
+
DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
|
|
133
|
+
DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
|
|
134
|
+
DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
|
|
135
|
+
DataDesignerColumnType.LLM_JUDGE: LLMJudgeColumnConfig,
|
|
136
|
+
DataDesignerColumnType.VALIDATION: ValidationColumnConfig,
|
|
137
|
+
DataDesignerColumnType.EXPRESSION: ExpressionColumnConfig,
|
|
138
|
+
DataDesignerColumnType.SAMPLER: SamplerColumnConfig,
|
|
139
|
+
DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig,
|
|
140
|
+
DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig,
|
|
141
|
+
}
|
data_designer/config/errors.py
CHANGED
data_designer/config/exports.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
|
|
5
7
|
from data_designer.config.column_configs import (
|
|
6
8
|
EmbeddingColumnConfig,
|
|
@@ -6,13 +6,14 @@ from __future__ import annotations
|
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
|
|
8
8
|
|
|
9
|
-
import pandas as pd
|
|
10
|
-
|
|
11
9
|
from data_designer.config.models import ModelConfig, ModelProvider
|
|
12
10
|
from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
|
|
13
11
|
from data_designer.config.utils.info import InterfaceInfo
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
14
13
|
|
|
15
14
|
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
16
17
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
17
18
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
18
19
|
from data_designer.config.preview_results import PreviewResults
|
data_designer/config/models.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from abc import ABC, abstractmethod
|
|
6
8
|
from enum import Enum
|
|
7
9
|
from pathlib import Path
|
|
8
|
-
from typing import Annotated, Any, Generic, Literal, TypeVar
|
|
10
|
+
from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar
|
|
9
11
|
|
|
10
|
-
import numpy as np
|
|
11
12
|
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
12
13
|
from typing_extensions import Self, TypeAlias
|
|
13
14
|
|
|
@@ -20,6 +21,10 @@ from data_designer.config.utils.constants import (
|
|
|
20
21
|
MIN_TOP_P,
|
|
21
22
|
)
|
|
22
23
|
from data_designer.config.utils.io_helpers import smart_load_yaml
|
|
24
|
+
from data_designer.lazy_heavy_imports import np
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
import numpy as np
|
|
23
28
|
|
|
24
29
|
logger = logging.getLogger(__name__)
|
|
25
30
|
|
|
@@ -3,12 +3,16 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
7
|
|
|
8
8
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
9
9
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
10
10
|
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
11
11
|
from data_designer.config.utils.visualization import WithRecordSamplerMixin
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
12
16
|
|
|
13
17
|
|
|
14
18
|
class PreviewResults(WithRecordSamplerMixin):
|
|
@@ -16,7 +20,7 @@ class PreviewResults(WithRecordSamplerMixin):
|
|
|
16
20
|
self,
|
|
17
21
|
*,
|
|
18
22
|
config_builder: DataDesignerConfigBuilder,
|
|
19
|
-
dataset_metadata: DatasetMetadata,
|
|
23
|
+
dataset_metadata: DatasetMetadata | None = None,
|
|
20
24
|
dataset: pd.DataFrame | None = None,
|
|
21
25
|
analysis: DatasetProfilerResults | None = None,
|
|
22
26
|
processor_artifacts: dict[str, list[str] | str] | None = None,
|
|
@@ -33,5 +37,5 @@ class PreviewResults(WithRecordSamplerMixin):
|
|
|
33
37
|
self.dataset: pd.DataFrame | None = dataset
|
|
34
38
|
self.analysis: DatasetProfilerResults | None = analysis
|
|
35
39
|
self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
|
|
36
|
-
self.dataset_metadata = dataset_metadata
|
|
40
|
+
self.dataset_metadata: DatasetMetadata | None = dataset_metadata
|
|
37
41
|
self._config_builder = config_builder
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from enum import Enum
|
|
5
|
-
from typing import Literal
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
6
8
|
|
|
7
|
-
import pandas as pd
|
|
8
9
|
from pydantic import Field, field_validator, model_validator
|
|
9
10
|
from typing_extensions import Self, TypeAlias
|
|
10
11
|
|
|
@@ -16,6 +17,10 @@ from data_designer.config.utils.constants import (
|
|
|
16
17
|
MAX_AGE,
|
|
17
18
|
MIN_AGE,
|
|
18
19
|
)
|
|
20
|
+
from data_designer.lazy_heavy_imports import pd
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
import pandas as pd
|
|
19
24
|
|
|
20
25
|
|
|
21
26
|
class SamplerType(str, Enum):
|
data_designer/config/seed.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC
|
|
5
|
-
from typing import Literal
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
6
8
|
|
|
7
|
-
import pandas as pd
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
9
10
|
from pydantic.json_schema import SkipJsonSchema
|
|
10
11
|
from typing_extensions import Self
|
|
@@ -14,6 +15,10 @@ from data_designer.config.utils.io_helpers import (
|
|
|
14
15
|
validate_dataset_file_path,
|
|
15
16
|
validate_path_contains_files_of_type,
|
|
16
17
|
)
|
|
18
|
+
from data_designer.lazy_heavy_imports import pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import pandas as pd
|
|
17
22
|
|
|
18
23
|
|
|
19
24
|
class SeedSource(BaseModel, ABC):
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from enum import Enum
|
|
6
8
|
from typing import Literal, TypeVar
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
import os
|
|
@@ -8,13 +10,16 @@ from datetime import date, datetime, timedelta
|
|
|
8
10
|
from decimal import Decimal
|
|
9
11
|
from numbers import Number
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import Any
|
|
13
|
+
from typing import TYPE_CHECKING, Any
|
|
12
14
|
|
|
13
|
-
import numpy as np
|
|
14
|
-
import pandas as pd
|
|
15
15
|
import yaml
|
|
16
16
|
|
|
17
17
|
from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError
|
|
18
|
+
from data_designer.lazy_heavy_imports import np, pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import numpy as np
|
|
22
|
+
import pandas as pd
|
|
18
23
|
|
|
19
24
|
logger = logging.getLogger(__name__)
|
|
20
25
|
|
|
@@ -48,8 +48,8 @@ def can_run_data_designer_locally() -> bool:
|
|
|
48
48
|
return True
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def
|
|
52
|
-
"""Extract all keywords from a valid
|
|
51
|
+
def extract_keywords_from_jinja2_template(template: str) -> set[str]:
|
|
52
|
+
"""Extract all keywords from a valid Jinja2 template."""
|
|
53
53
|
with template_error_handler():
|
|
54
54
|
ast = ImmutableSandboxedEnvironment().parse(template)
|
|
55
55
|
keywords = set(meta.find_undeclared_variables(ast))
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import inspect
|
|
5
7
|
from enum import Enum
|
|
6
8
|
from typing import Any, Literal, get_args, get_origin
|
|
@@ -10,8 +10,6 @@ from enum import Enum
|
|
|
10
10
|
from functools import cached_property
|
|
11
11
|
from typing import TYPE_CHECKING, Any
|
|
12
12
|
|
|
13
|
-
import numpy as np
|
|
14
|
-
import pandas as pd
|
|
15
13
|
from rich.console import Console, Group
|
|
16
14
|
from rich.padding import Padding
|
|
17
15
|
from rich.panel import Panel
|
|
@@ -28,8 +26,12 @@ from data_designer.config.sampler_params import SamplerType
|
|
|
28
26
|
from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
|
|
29
27
|
from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
|
|
30
28
|
from data_designer.config.utils.errors import DatasetSampleDisplayError
|
|
29
|
+
from data_designer.lazy_heavy_imports import np, pd
|
|
31
30
|
|
|
32
31
|
if TYPE_CHECKING:
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
33
35
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
34
36
|
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
35
37
|
|
|
@@ -58,7 +60,7 @@ class ColorPalette(str, Enum):
|
|
|
58
60
|
|
|
59
61
|
class WithRecordSamplerMixin:
|
|
60
62
|
_display_cycle_index: int = 0
|
|
61
|
-
dataset_metadata: DatasetMetadata
|
|
63
|
+
dataset_metadata: DatasetMetadata | None
|
|
62
64
|
|
|
63
65
|
@cached_property
|
|
64
66
|
def _record_sampler_dataset(self) -> pd.DataFrame:
|
|
@@ -122,7 +124,9 @@ class WithRecordSamplerMixin:
|
|
|
122
124
|
else:
|
|
123
125
|
processor_data_to_display[processor] = self.processor_artifacts[processor]
|
|
124
126
|
|
|
125
|
-
seed_column_names =
|
|
127
|
+
seed_column_names = (
|
|
128
|
+
None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
|
|
129
|
+
)
|
|
126
130
|
|
|
127
131
|
display_sample_record(
|
|
128
132
|
record=record,
|
|
@@ -5,15 +5,19 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
|
-
import pandas as pd
|
|
10
10
|
from pydantic import BaseModel, model_validator
|
|
11
11
|
from typing_extensions import Self
|
|
12
12
|
|
|
13
13
|
from data_designer.config.base import ConfigBase
|
|
14
14
|
from data_designer.config.column_configs import SingleColumnConfig
|
|
15
15
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
16
|
-
from data_designer.engine.configurable_task import ConfigurableTask,
|
|
16
|
+
from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
|
|
17
|
+
from data_designer.lazy_heavy_imports import pd
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import pandas as pd
|
|
17
21
|
|
|
18
22
|
logger = logging.getLogger(__name__)
|
|
19
23
|
|
|
@@ -32,17 +36,14 @@ class ColumnConfigWithDataFrame(ConfigBase):
|
|
|
32
36
|
return (self.column_config, self.df)
|
|
33
37
|
|
|
34
38
|
|
|
35
|
-
class ColumnProfilerMetadata(ConfigurableTaskMetadata):
|
|
36
|
-
applicable_column_types: list[DataDesignerColumnType]
|
|
37
|
-
|
|
38
|
-
|
|
39
39
|
class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
|
|
40
40
|
@staticmethod
|
|
41
41
|
@abstractmethod
|
|
42
|
-
def
|
|
42
|
+
def get_applicable_column_types() -> list[DataDesignerColumnType]:
|
|
43
|
+
"""Returns a list of column types that this profiler can be applied to during dataset profiling."""
|
|
43
44
|
|
|
44
45
|
@abstractmethod
|
|
45
46
|
def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel: ...
|
|
46
47
|
|
|
47
48
|
def _initialize(self) -> None:
|
|
48
|
-
logger.info(f"💫 Initializing column profiler: '{self.
|
|
49
|
+
logger.info(f"💫 Initializing column profiler: '{self.name}'")
|
|
@@ -5,44 +5,41 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
import random
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
10
|
from data_designer.config.analysis.column_profilers import (
|
|
10
11
|
JudgeScoreProfilerConfig,
|
|
11
12
|
JudgeScoreProfilerResults,
|
|
12
|
-
JudgeScoreSample,
|
|
13
13
|
JudgeScoreSummary,
|
|
14
14
|
)
|
|
15
15
|
from data_designer.config.analysis.column_statistics import (
|
|
16
|
-
CategoricalDistribution,
|
|
17
|
-
CategoricalHistogramData,
|
|
18
16
|
ColumnDistributionType,
|
|
19
17
|
MissingValue,
|
|
20
|
-
NumericalDistribution,
|
|
21
|
-
)
|
|
22
|
-
from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP, DataDesignerColumnType
|
|
23
|
-
from data_designer.engine.analysis.column_profilers.base import (
|
|
24
|
-
ColumnConfigWithDataFrame,
|
|
25
|
-
ColumnProfiler,
|
|
26
|
-
ColumnProfilerMetadata,
|
|
27
18
|
)
|
|
19
|
+
from data_designer.config.column_types import DataDesignerColumnType
|
|
20
|
+
from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
|
|
28
21
|
from data_designer.engine.analysis.utils.judge_score_processing import (
|
|
29
22
|
extract_judge_score_distributions,
|
|
30
23
|
sample_scores_and_reasoning,
|
|
31
24
|
)
|
|
32
|
-
from data_designer.engine.models.facade import ModelFacade
|
|
33
25
|
from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe
|
|
34
26
|
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from data_designer.config.analysis.column_profilers import JudgeScoreSample
|
|
29
|
+
from data_designer.config.analysis.column_statistics import (
|
|
30
|
+
CategoricalDistribution,
|
|
31
|
+
CategoricalHistogramData,
|
|
32
|
+
NumericalDistribution,
|
|
33
|
+
)
|
|
34
|
+
from data_designer.engine.models.facade import ModelFacade
|
|
35
|
+
|
|
35
36
|
logger = logging.getLogger(__name__)
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
|
|
39
40
|
@staticmethod
|
|
40
|
-
def
|
|
41
|
-
return
|
|
42
|
-
name="judge_score_profiler",
|
|
43
|
-
description="Analyzes LLM-as-judge score distributions in a Data Designer dataset.",
|
|
44
|
-
applicable_column_types=[DataDesignerColumnType.LLM_JUDGE],
|
|
45
|
-
)
|
|
41
|
+
def get_applicable_column_types() -> list[DataDesignerColumnType]:
|
|
42
|
+
return [DataDesignerColumnType.LLM_JUDGE]
|
|
46
43
|
|
|
47
44
|
def get_model(self, model_alias: str) -> ModelFacade:
|
|
48
45
|
return self.resource_provider.model_registry.get_model(model_alias=model_alias)
|
|
@@ -51,8 +48,7 @@ class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
|
|
|
51
48
|
column_config, df = column_config_with_df.as_tuple()
|
|
52
49
|
|
|
53
50
|
logger.info(
|
|
54
|
-
f"{
|
|
55
|
-
f"scores for column: '{column_config.name}'"
|
|
51
|
+
f"{column_config.get_column_emoji()} Analyzing LLM-as-judge scores for column: '{column_config.name}'"
|
|
56
52
|
)
|
|
57
53
|
|
|
58
54
|
score_summaries = {}
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.analysis.column_profilers import ColumnProfilerType
|
|
5
7
|
from data_designer.config.base import ConfigBase
|
|
6
8
|
from data_designer.engine.analysis.column_profilers.base import ColumnProfiler
|
|
@@ -4,9 +4,8 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
-
from typing import Any, TypeAlias
|
|
7
|
+
from typing import TYPE_CHECKING, Any, TypeAlias
|
|
8
8
|
|
|
9
|
-
import pandas as pd
|
|
10
9
|
from pydantic import BaseModel
|
|
11
10
|
from typing_extensions import Self
|
|
12
11
|
|
|
@@ -25,6 +24,10 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import (
|
|
|
25
24
|
calculate_token_stats,
|
|
26
25
|
calculate_validation_column_info,
|
|
27
26
|
)
|
|
27
|
+
from data_designer.lazy_heavy_imports import pd
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import pandas as pd
|
|
28
31
|
|
|
29
32
|
logger = logging.getLogger(__name__)
|
|
30
33
|
|