data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +7 -3
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +2 -0
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +7 -2
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +8 -4
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +11 -37
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +8 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +22 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +8 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +23 -17
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +5 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
- data_designer-0.3.6.dist-info/RECORD +196 -0
- data_designer-0.3.4.dist-info/RECORD +0 -194
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
data_designer/__init__.py
CHANGED
data_designer/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.3.4'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 3, 4)
|
|
31
|
+
__version__ = version = '0.3.6'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 3, 6)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
data_designer/cli/__init__.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import typer
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.controllers.download_controller import DownloadController
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from rich.table import Table
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.repositories.model_repository import ModelRepository
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.controllers.model_controller import ModelController
|
|
5
7
|
from data_designer.config.utils.constants import DATA_DESIGNER_HOME
|
|
6
8
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.controllers.provider_controller import ProviderController
|
|
5
7
|
from data_designer.config.utils.constants import DATA_DESIGNER_HOME
|
|
6
8
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import typer
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.repositories.model_repository import ModelRepository
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.controllers.download_controller import DownloadController
|
|
5
7
|
from data_designer.cli.controllers.model_controller import ModelController
|
|
6
8
|
from data_designer.cli.controllers.provider_controller import ProviderController
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
5
8
|
|
|
6
9
|
from data_designer.cli.forms.model_builder import ModelFormBuilder
|
|
7
10
|
from data_designer.cli.repositories.model_repository import ModelRepository
|
|
@@ -20,7 +23,9 @@ from data_designer.cli.ui import (
|
|
|
20
23
|
print_warning,
|
|
21
24
|
select_with_arrows,
|
|
22
25
|
)
|
|
23
|
-
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from data_designer.config.models import ModelConfig
|
|
24
29
|
|
|
25
30
|
|
|
26
31
|
class ModelController:
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import copy
|
|
5
7
|
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
6
9
|
|
|
7
10
|
from data_designer.cli.forms.provider_builder import ProviderFormBuilder
|
|
8
11
|
from data_designer.cli.repositories.model_repository import ModelRepository
|
|
@@ -20,7 +23,9 @@ from data_designer.cli.ui import (
|
|
|
20
23
|
print_warning,
|
|
21
24
|
select_with_arrows,
|
|
22
25
|
)
|
|
23
|
-
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from data_designer.engine.model_provider import ModelProvider
|
|
24
29
|
|
|
25
30
|
|
|
26
31
|
class ProviderController:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.forms.builder import FormBuilder
|
|
5
7
|
from data_designer.cli.forms.field import Field, NumericField, SelectField, TextField, ValidationError
|
|
6
8
|
from data_designer.cli.forms.form import Form
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from typing import Any, Generic, TypeVar
|
|
6
8
|
|
data_designer/cli/forms/field.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from collections.abc import Callable
|
|
6
8
|
from typing import Any, Generic, TypeVar
|
data_designer/cli/forms/form.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Any
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.forms.builder import FormBuilder
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Any
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.forms.builder import FormBuilder
|
data_designer/cli/main.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import typer
|
|
5
7
|
|
|
6
8
|
from data_designer.cli.commands import download, models, providers, reset
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.repositories.base import ConfigRepository
|
|
5
7
|
from data_designer.cli.repositories.model_repository import ModelRepository
|
|
6
8
|
from data_designer.cli.repositories.provider_repository import ProviderRepository
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from pathlib import Path
|
|
6
8
|
from typing import Generic, TypeVar
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from pydantic import BaseModel
|
|
5
7
|
|
|
6
8
|
from data_designer.config.utils.constants import (
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.services.download_service import DownloadService
|
|
5
7
|
from data_designer.cli.services.model_service import ModelService
|
|
6
8
|
from data_designer.cli.services.provider_service import ProviderService
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.repositories.model_repository import ModelConfigRegistry, ModelRepository
|
|
5
7
|
from data_designer.config.models import ModelConfig
|
|
6
8
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.cli.repositories.provider_repository import ModelProviderRegistry, ProviderRepository
|
|
5
7
|
from data_designer.config.models import ModelProvider
|
|
6
8
|
|
data_designer/cli/ui.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from collections.abc import Callable
|
|
5
7
|
|
|
6
8
|
from prompt_toolkit import Application, prompt
|
data_designer/cli/utils.py
CHANGED
|
@@ -5,9 +5,8 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
from enum import Enum
|
|
8
|
-
from typing import Any, Literal
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
9
9
|
|
|
10
|
-
from pandas import Series
|
|
11
10
|
from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator
|
|
12
11
|
from typing_extensions import Self, TypeAlias
|
|
13
12
|
|
|
@@ -15,8 +14,12 @@ from data_designer.config.column_types import DataDesignerColumnType
|
|
|
15
14
|
from data_designer.config.sampler_params import SamplerType
|
|
16
15
|
from data_designer.config.utils.constants import EPSILON
|
|
17
16
|
from data_designer.config.utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting
|
|
17
|
+
from data_designer.lazy_heavy_imports import pd
|
|
18
18
|
from data_designer.plugin_manager import PluginManager
|
|
19
19
|
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
20
23
|
|
|
21
24
|
class MissingValue(str, Enum):
|
|
22
25
|
CALCULATION_FAILED = "--"
|
|
@@ -314,7 +317,7 @@ class CategoricalHistogramData(BaseModel):
|
|
|
314
317
|
return self
|
|
315
318
|
|
|
316
319
|
@classmethod
|
|
317
|
-
def from_series(cls, series: Series) -> Self:
|
|
320
|
+
def from_series(cls, series: pd.Series) -> Self:
|
|
318
321
|
counts = series.value_counts()
|
|
319
322
|
return cls(categories=counts.index.tolist(), counts=counts.tolist())
|
|
320
323
|
|
|
@@ -337,7 +340,7 @@ class CategoricalDistribution(BaseModel):
|
|
|
337
340
|
return str(v) if not is_int(v) else prepare_number_for_reporting(v, int)
|
|
338
341
|
|
|
339
342
|
@classmethod
|
|
340
|
-
def from_series(cls, series: Series) -> Self:
|
|
343
|
+
def from_series(cls, series: pd.Series) -> Self:
|
|
341
344
|
counts = series.value_counts()
|
|
342
345
|
return cls(
|
|
343
346
|
most_common_value=counts.index[0],
|
|
@@ -368,7 +371,7 @@ class NumericalDistribution(BaseModel):
|
|
|
368
371
|
return prepare_number_for_reporting(v, int if is_int(v) else float)
|
|
369
372
|
|
|
370
373
|
@classmethod
|
|
371
|
-
def from_series(cls, series: Series) -> Self:
|
|
374
|
+
def from_series(cls, series: pd.Series) -> Self:
|
|
372
375
|
return cls(
|
|
373
376
|
min=series.min(skipna=True),
|
|
374
377
|
max=series.max(skipna=True),
|
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from functools import cached_property
|
|
5
7
|
from pathlib import Path
|
|
6
|
-
from typing import Annotated
|
|
8
|
+
from typing import TYPE_CHECKING, Annotated
|
|
7
9
|
|
|
8
10
|
from pydantic import BaseModel, Field, field_validator
|
|
9
11
|
|
|
10
12
|
from data_designer.config.analysis.column_profilers import ColumnProfilerResultsT
|
|
11
13
|
from data_designer.config.analysis.column_statistics import ColumnStatisticsT
|
|
12
|
-
from data_designer.config.analysis.utils.reporting import
|
|
13
|
-
from data_designer.config.column_types import
|
|
14
|
+
from data_designer.config.analysis.utils.reporting import generate_analysis_report
|
|
15
|
+
from data_designer.config.column_types import get_column_display_order
|
|
14
16
|
from data_designer.config.utils.constants import EPSILON
|
|
15
17
|
from data_designer.config.utils.numerical_helpers import prepare_number_for_reporting
|
|
16
18
|
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from data_designer.config.analysis.utils.reporting import ReportSection
|
|
21
|
+
from data_designer.config.column_types import DataDesignerColumnType
|
|
22
|
+
|
|
17
23
|
|
|
18
24
|
class DatasetProfilerResults(BaseModel):
|
|
19
25
|
"""Container for complete dataset profiling and analysis results.
|
|
@@ -14,9 +14,12 @@ from rich.rule import Rule
|
|
|
14
14
|
from rich.table import Column, Table
|
|
15
15
|
from rich.text import Text
|
|
16
16
|
|
|
17
|
-
from data_designer.config.analysis.column_statistics import CategoricalHistogramData
|
|
18
17
|
from data_designer.config.analysis.utils.errors import AnalysisReportError
|
|
19
|
-
from data_designer.config.column_types import
|
|
18
|
+
from data_designer.config.column_types import (
|
|
19
|
+
DataDesignerColumnType,
|
|
20
|
+
get_column_display_order,
|
|
21
|
+
get_column_emoji_from_type,
|
|
22
|
+
)
|
|
20
23
|
from data_designer.config.utils.visualization import (
|
|
21
24
|
ColorPalette,
|
|
22
25
|
convert_to_row_element,
|
|
@@ -25,6 +28,7 @@ from data_designer.config.utils.visualization import (
|
|
|
25
28
|
)
|
|
26
29
|
|
|
27
30
|
if TYPE_CHECKING:
|
|
31
|
+
from data_designer.config.analysis.column_statistics import CategoricalHistogramData
|
|
28
32
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
29
33
|
|
|
30
34
|
HEADER_STYLE = "dim"
|
|
@@ -101,7 +105,7 @@ def generate_analysis_report(
|
|
|
101
105
|
displayed_column_types.add(column_type)
|
|
102
106
|
column_label = column_type.replace("_", " ").title().replace("Llm", "LLM")
|
|
103
107
|
table = Table(
|
|
104
|
-
title=f"{
|
|
108
|
+
title=f"{get_column_emoji_from_type(column_type)} {column_label} Columns",
|
|
105
109
|
**table_kws,
|
|
106
110
|
)
|
|
107
111
|
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from abc import ABC
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
5
7
|
from typing import Annotated, Literal
|
|
6
8
|
|
|
7
9
|
from pydantic import BaseModel, Discriminator, Field, model_validator
|
|
@@ -13,7 +15,7 @@ from data_designer.config.models import ImageContext
|
|
|
13
15
|
from data_designer.config.sampler_params import SamplerParamsT, SamplerType
|
|
14
16
|
from data_designer.config.utils.code_lang import CodeLang
|
|
15
17
|
from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
|
|
16
|
-
from data_designer.config.utils.misc import assert_valid_jinja2_template,
|
|
18
|
+
from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
|
|
17
19
|
from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
|
|
18
20
|
|
|
19
21
|
|
|
@@ -35,7 +37,12 @@ class SingleColumnConfig(ConfigBase, ABC):
|
|
|
35
37
|
drop: bool = False
|
|
36
38
|
column_type: str
|
|
37
39
|
|
|
40
|
+
@staticmethod
|
|
41
|
+
def get_column_emoji() -> str:
|
|
42
|
+
return "🎨"
|
|
43
|
+
|
|
38
44
|
@property
|
|
45
|
+
@abstractmethod
|
|
39
46
|
def required_columns(self) -> list[str]:
|
|
40
47
|
"""Returns a list of column names that must exist before this column can be generated.
|
|
41
48
|
|
|
@@ -43,9 +50,9 @@ class SingleColumnConfig(ConfigBase, ABC):
|
|
|
43
50
|
List of column names that this column depends on. Empty list indicates
|
|
44
51
|
no dependencies. Override in subclasses to specify dependencies.
|
|
45
52
|
"""
|
|
46
|
-
return []
|
|
47
53
|
|
|
48
54
|
@property
|
|
55
|
+
@abstractmethod
|
|
49
56
|
def side_effect_columns(self) -> list[str]:
|
|
50
57
|
"""Returns a list of additional columns that this column will create as a side effect.
|
|
51
58
|
|
|
@@ -56,7 +63,6 @@ class SingleColumnConfig(ConfigBase, ABC):
|
|
|
56
63
|
List of column names that this column will create as a side effect. Empty list
|
|
57
64
|
indicates no side effect columns. Override in subclasses to specify side effects.
|
|
58
65
|
"""
|
|
59
|
-
return []
|
|
60
66
|
|
|
61
67
|
|
|
62
68
|
class SamplerColumnConfig(SingleColumnConfig):
|
|
@@ -94,6 +100,18 @@ class SamplerColumnConfig(SingleColumnConfig):
|
|
|
94
100
|
convert_to: str | None = None
|
|
95
101
|
column_type: Literal["sampler"] = "sampler"
|
|
96
102
|
|
|
103
|
+
@staticmethod
|
|
104
|
+
def get_column_emoji() -> str:
|
|
105
|
+
return "🎲"
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def required_columns(self) -> list[str]:
|
|
109
|
+
return []
|
|
110
|
+
|
|
111
|
+
@property
|
|
112
|
+
def side_effect_columns(self) -> list[str]:
|
|
113
|
+
return []
|
|
114
|
+
|
|
97
115
|
@model_validator(mode="before")
|
|
98
116
|
@classmethod
|
|
99
117
|
def inject_sampler_type_into_params(cls, data: dict) -> dict:
|
|
@@ -150,6 +168,10 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
150
168
|
multi_modal_context: list[ImageContext] | None = None
|
|
151
169
|
column_type: Literal["llm-text"] = "llm-text"
|
|
152
170
|
|
|
171
|
+
@staticmethod
|
|
172
|
+
def get_column_emoji() -> str:
|
|
173
|
+
return "📝"
|
|
174
|
+
|
|
153
175
|
@property
|
|
154
176
|
def required_columns(self) -> list[str]:
|
|
155
177
|
"""Get columns referenced in the prompt and system_prompt templates.
|
|
@@ -157,9 +179,9 @@ class LLMTextColumnConfig(SingleColumnConfig):
|
|
|
157
179
|
Returns:
|
|
158
180
|
List of unique column names referenced in Jinja2 templates.
|
|
159
181
|
"""
|
|
160
|
-
required_cols = list(
|
|
182
|
+
required_cols = list(extract_keywords_from_jinja2_template(self.prompt))
|
|
161
183
|
if self.system_prompt:
|
|
162
|
-
required_cols.extend(list(
|
|
184
|
+
required_cols.extend(list(extract_keywords_from_jinja2_template(self.system_prompt)))
|
|
163
185
|
return list(set(required_cols))
|
|
164
186
|
|
|
165
187
|
@property
|
|
@@ -207,6 +229,10 @@ class LLMCodeColumnConfig(LLMTextColumnConfig):
|
|
|
207
229
|
code_lang: CodeLang
|
|
208
230
|
column_type: Literal["llm-code"] = "llm-code"
|
|
209
231
|
|
|
232
|
+
@staticmethod
|
|
233
|
+
def get_column_emoji() -> str:
|
|
234
|
+
return "💻"
|
|
235
|
+
|
|
210
236
|
|
|
211
237
|
class LLMStructuredColumnConfig(LLMTextColumnConfig):
|
|
212
238
|
"""Configuration for structured JSON generation columns using Large Language Models.
|
|
@@ -225,6 +251,10 @@ class LLMStructuredColumnConfig(LLMTextColumnConfig):
|
|
|
225
251
|
output_format: dict | type[BaseModel]
|
|
226
252
|
column_type: Literal["llm-structured"] = "llm-structured"
|
|
227
253
|
|
|
254
|
+
@staticmethod
|
|
255
|
+
def get_column_emoji() -> str:
|
|
256
|
+
return "🗂️"
|
|
257
|
+
|
|
228
258
|
@model_validator(mode="after")
|
|
229
259
|
def validate_output_format(self) -> Self:
|
|
230
260
|
"""Convert Pydantic model to JSON schema if needed.
|
|
@@ -275,6 +305,10 @@ class LLMJudgeColumnConfig(LLMTextColumnConfig):
|
|
|
275
305
|
scores: list[Score] = Field(..., min_length=1)
|
|
276
306
|
column_type: Literal["llm-judge"] = "llm-judge"
|
|
277
307
|
|
|
308
|
+
@staticmethod
|
|
309
|
+
def get_column_emoji() -> str:
|
|
310
|
+
return "⚖️"
|
|
311
|
+
|
|
278
312
|
|
|
279
313
|
class ExpressionColumnConfig(SingleColumnConfig):
|
|
280
314
|
"""Configuration for derived columns using Jinja2 expressions.
|
|
@@ -297,10 +331,18 @@ class ExpressionColumnConfig(SingleColumnConfig):
|
|
|
297
331
|
dtype: Literal["int", "float", "str", "bool"] = "str"
|
|
298
332
|
column_type: Literal["expression"] = "expression"
|
|
299
333
|
|
|
334
|
+
@staticmethod
|
|
335
|
+
def get_column_emoji() -> str:
|
|
336
|
+
return "🧩"
|
|
337
|
+
|
|
300
338
|
@property
|
|
301
339
|
def required_columns(self) -> list[str]:
|
|
302
340
|
"""Returns the columns referenced in the expression template."""
|
|
303
|
-
return list(
|
|
341
|
+
return list(extract_keywords_from_jinja2_template(self.expr))
|
|
342
|
+
|
|
343
|
+
@property
|
|
344
|
+
def side_effect_columns(self) -> list[str]:
|
|
345
|
+
return []
|
|
304
346
|
|
|
305
347
|
@model_validator(mode="after")
|
|
306
348
|
def assert_expression_valid_jinja(self) -> Self:
|
|
@@ -359,11 +401,19 @@ class ValidationColumnConfig(SingleColumnConfig):
|
|
|
359
401
|
batch_size: int = Field(default=10, ge=1, description="Number of records to process in each batch")
|
|
360
402
|
column_type: Literal["validation"] = "validation"
|
|
361
403
|
|
|
404
|
+
@staticmethod
|
|
405
|
+
def get_column_emoji() -> str:
|
|
406
|
+
return "🔍"
|
|
407
|
+
|
|
362
408
|
@property
|
|
363
409
|
def required_columns(self) -> list[str]:
|
|
364
410
|
"""Returns the columns that need to be validated."""
|
|
365
411
|
return self.target_columns
|
|
366
412
|
|
|
413
|
+
@property
|
|
414
|
+
def side_effect_columns(self) -> list[str]:
|
|
415
|
+
return []
|
|
416
|
+
|
|
367
417
|
|
|
368
418
|
class SeedDatasetColumnConfig(SingleColumnConfig):
|
|
369
419
|
"""Configuration for columns sourced from seed datasets.
|
|
@@ -378,6 +428,18 @@ class SeedDatasetColumnConfig(SingleColumnConfig):
|
|
|
378
428
|
|
|
379
429
|
column_type: Literal["seed-dataset"] = "seed-dataset"
|
|
380
430
|
|
|
431
|
+
@staticmethod
|
|
432
|
+
def get_column_emoji() -> str:
|
|
433
|
+
return "🌱"
|
|
434
|
+
|
|
435
|
+
@property
|
|
436
|
+
def required_columns(self) -> list[str]:
|
|
437
|
+
return []
|
|
438
|
+
|
|
439
|
+
@property
|
|
440
|
+
def side_effect_columns(self) -> list[str]:
|
|
441
|
+
return []
|
|
442
|
+
|
|
381
443
|
|
|
382
444
|
class EmbeddingColumnConfig(SingleColumnConfig):
|
|
383
445
|
"""Configuration for embedding generation columns.
|
|
@@ -395,6 +457,14 @@ class EmbeddingColumnConfig(SingleColumnConfig):
|
|
|
395
457
|
model_alias: str
|
|
396
458
|
column_type: Literal["embedding"] = "embedding"
|
|
397
459
|
|
|
460
|
+
@staticmethod
|
|
461
|
+
def get_column_emoji() -> str:
|
|
462
|
+
return "🧬"
|
|
463
|
+
|
|
398
464
|
@property
|
|
399
465
|
def required_columns(self) -> list[str]:
|
|
400
466
|
return [self.target_column]
|
|
467
|
+
|
|
468
|
+
@property
|
|
469
|
+
def side_effect_columns(self) -> list[str]:
|
|
470
|
+
return []
|