data-designer 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/base.py +1 -0
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/dataset_metadata.py +18 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +9 -1
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +19 -5
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +9 -3
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +19 -11
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +13 -47
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +9 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +34 -35
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +13 -4
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +35 -25
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +16 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +20 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +36 -39
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +9 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/METADATA +19 -4
- data_designer-0.3.5.dist-info/RECORD +196 -0
- data_designer-0.3.3.dist-info/RECORD +0 -193
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/WHEEL +0 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from copy import deepcopy
|
|
5
7
|
|
|
6
8
|
from data_designer.config.column_configs import SamplerColumnConfig
|
|
@@ -1,9 +1,15 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import numbers
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from data_designer.lazy_heavy_imports import np
|
|
5
10
|
|
|
6
|
-
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import numpy as np
|
|
7
13
|
|
|
8
14
|
|
|
9
15
|
def check_random_state(seed):
|
|
@@ -19,7 +19,7 @@ from data_designer.config.processors import ProcessorConfigT, ProcessorType
|
|
|
19
19
|
from data_designer.config.utils.constants import RICH_CONSOLE_THEME
|
|
20
20
|
from data_designer.config.utils.misc import (
|
|
21
21
|
can_run_data_designer_locally,
|
|
22
|
-
|
|
22
|
+
extract_keywords_from_jinja2_template,
|
|
23
23
|
)
|
|
24
24
|
from data_designer.config.validator_params import ValidatorType
|
|
25
25
|
from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated
|
|
@@ -302,7 +302,7 @@ def validate_schema_transform_processor(
|
|
|
302
302
|
for processor_config in processor_configs:
|
|
303
303
|
if processor_config.processor_type == ProcessorType.SCHEMA_TRANSFORM:
|
|
304
304
|
for col, template in processor_config.template.items():
|
|
305
|
-
template_keywords =
|
|
305
|
+
template_keywords = extract_keywords_from_jinja2_template(template)
|
|
306
306
|
invalid_keywords = set(template_keywords) - all_column_names
|
|
307
307
|
if len(invalid_keywords) > 0:
|
|
308
308
|
invalid_keywords = ", ".join([f"'{k}'" for k in invalid_keywords])
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.engine.validators.base import BaseValidator, ValidationResult
|
|
5
7
|
from data_designer.engine.validators.local_callable import LocalCallableValidator
|
|
6
8
|
from data_designer.engine.validators.python import PythonValidator
|
|
@@ -1,14 +1,19 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
import
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import logging
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
9
|
from data_designer.config.validator_params import LocalCallableValidatorParams
|
|
9
10
|
from data_designer.engine.errors import LocalCallableValidationError
|
|
10
11
|
from data_designer.engine.processing.gsonschema.validators import validate
|
|
11
12
|
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
|
|
13
|
+
from data_designer.lazy_heavy_imports import pd
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import pandas as pd
|
|
12
17
|
|
|
13
18
|
logger = logging.getLogger(__name__)
|
|
14
19
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import ast
|
|
5
7
|
import json
|
|
6
8
|
import logging
|
|
@@ -8,14 +10,18 @@ import subprocess
|
|
|
8
10
|
import tempfile
|
|
9
11
|
from collections import defaultdict
|
|
10
12
|
from pathlib import Path
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
11
14
|
from uuid import uuid4
|
|
12
15
|
|
|
13
|
-
import pandas as pd
|
|
14
16
|
from pydantic import BaseModel
|
|
15
17
|
from ruff.__main__ import find_ruff_bin
|
|
16
18
|
|
|
17
19
|
from data_designer.config.validator_params import CodeValidatorParams
|
|
18
20
|
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
|
|
21
|
+
from data_designer.lazy_heavy_imports import pd
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
import pandas as pd
|
|
19
25
|
|
|
20
26
|
logger = logging.getLogger(__name__)
|
|
21
27
|
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
5
8
|
|
|
6
|
-
import httpx
|
|
7
9
|
from httpx_retries import Retry, RetryTransport
|
|
8
10
|
|
|
9
11
|
from data_designer.config.validator_params import RemoteValidatorParams
|
|
@@ -11,6 +13,10 @@ from data_designer.engine.errors import RemoteValidationSchemaError
|
|
|
11
13
|
from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
|
|
12
14
|
from data_designer.engine.processing.gsonschema.validators import validate
|
|
13
15
|
from data_designer.engine.validators.base import BaseValidator, ValidationResult
|
|
16
|
+
from data_designer.lazy_heavy_imports import httpx
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
import httpx
|
|
14
20
|
|
|
15
21
|
logger = logging.getLogger(__name__)
|
|
16
22
|
|
|
@@ -1,15 +1,20 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
import re
|
|
6
|
-
|
|
7
|
-
import pandas as pd
|
|
8
|
-
import sqlfluff
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
9
|
|
|
10
10
|
from data_designer.config.utils.code_lang import CodeLang
|
|
11
11
|
from data_designer.config.validator_params import CodeValidatorParams
|
|
12
12
|
from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
|
|
13
|
+
from data_designer.lazy_heavy_imports import pd, sqlfluff
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import sqlfluff
|
|
13
18
|
|
|
14
19
|
sqlfluff_logger = logging.getLogger("sqlfluff")
|
|
15
20
|
sqlfluff_logger.setLevel(logging.WARNING)
|
data_designer/errors.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.default_model_settings import resolve_seed_default_model_settings
|
|
5
7
|
from data_designer.config.exports import * # noqa: F403
|
|
6
8
|
from data_designer.config.run_config import RunConfig
|
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from pathlib import Path
|
|
6
|
-
|
|
7
|
-
import pandas as pd
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
10
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
10
11
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
12
|
+
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
11
13
|
from data_designer.config.default_model_settings import (
|
|
12
14
|
get_default_model_configs,
|
|
13
15
|
get_default_model_providers_missing_api_keys,
|
|
@@ -29,14 +31,10 @@ from data_designer.config.utils.constants import (
|
|
|
29
31
|
PREDEFINED_PROVIDERS,
|
|
30
32
|
)
|
|
31
33
|
from data_designer.config.utils.info import InfoType, InterfaceInfo
|
|
32
|
-
from data_designer.engine.analysis.dataset_profiler import
|
|
33
|
-
DataDesignerDatasetProfiler,
|
|
34
|
-
DatasetProfilerConfig,
|
|
35
|
-
)
|
|
34
|
+
from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
|
|
36
35
|
from data_designer.engine.compiler import compile_data_designer_config
|
|
37
36
|
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
|
|
38
37
|
from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
|
|
39
|
-
from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
|
|
40
38
|
from data_designer.engine.model_provider import resolve_model_provider_registry
|
|
41
39
|
from data_designer.engine.resources.managed_storage import init_managed_blob_storage
|
|
42
40
|
from data_designer.engine.resources.resource_provider import ResourceProvider, create_resource_provider
|
|
@@ -56,14 +54,18 @@ from data_designer.engine.secret_resolver import (
|
|
|
56
54
|
from data_designer.interface.errors import (
|
|
57
55
|
DataDesignerGenerationError,
|
|
58
56
|
DataDesignerProfilingError,
|
|
59
|
-
InvalidBufferValueError,
|
|
60
57
|
)
|
|
61
58
|
from data_designer.interface.results import DatasetCreationResults
|
|
59
|
+
from data_designer.lazy_heavy_imports import pd
|
|
62
60
|
from data_designer.logging import RandomEmoji
|
|
63
61
|
from data_designer.plugins.plugin import PluginType
|
|
64
62
|
from data_designer.plugins.registry import PluginRegistry
|
|
65
63
|
|
|
66
|
-
|
|
64
|
+
if TYPE_CHECKING:
|
|
65
|
+
import pandas as pd
|
|
66
|
+
|
|
67
|
+
logger = logging.getLogger(__name__)
|
|
68
|
+
|
|
67
69
|
|
|
68
70
|
DEFAULT_SECRET_RESOLVER = CompositeResolver([EnvironmentResolver(), PlaintextResolver()])
|
|
69
71
|
|
|
@@ -75,8 +77,6 @@ DEFAULT_SEED_READERS = [
|
|
|
75
77
|
for plugin in PluginRegistry().get_plugins(PluginType.SEED_READER):
|
|
76
78
|
DEFAULT_SEED_READERS.append(plugin.impl_cls())
|
|
77
79
|
|
|
78
|
-
logger = logging.getLogger(__name__)
|
|
79
|
-
|
|
80
80
|
|
|
81
81
|
class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
82
82
|
"""Main interface for creating datasets with Data Designer.
|
|
@@ -112,7 +112,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
112
112
|
):
|
|
113
113
|
self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER
|
|
114
114
|
self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
|
|
115
|
-
self._buffer_size = DEFAULT_BUFFER_SIZE
|
|
116
115
|
self._run_config = RunConfig()
|
|
117
116
|
self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
|
|
118
117
|
self._model_providers = self._resolve_model_providers(model_providers)
|
|
@@ -166,10 +165,10 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
166
165
|
|
|
167
166
|
resource_provider = self._create_resource_provider(dataset_name, config_builder)
|
|
168
167
|
|
|
169
|
-
builder = self._create_dataset_builder(config_builder, resource_provider)
|
|
168
|
+
builder = self._create_dataset_builder(config_builder.build(), resource_provider)
|
|
170
169
|
|
|
171
170
|
try:
|
|
172
|
-
builder.build(num_records=num_records
|
|
171
|
+
builder.build(num_records=num_records)
|
|
173
172
|
except Exception as e:
|
|
174
173
|
raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")
|
|
175
174
|
|
|
@@ -182,10 +181,19 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
182
181
|
except Exception as e:
|
|
183
182
|
raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")
|
|
184
183
|
|
|
184
|
+
dataset_metadata = resource_provider.get_dataset_metadata()
|
|
185
|
+
|
|
186
|
+
# Update metadata with column statistics from analysis
|
|
187
|
+
if analysis:
|
|
188
|
+
builder.artifact_storage.update_metadata(
|
|
189
|
+
{"column_statistics": [stat.model_dump(mode="json") for stat in analysis.column_statistics]}
|
|
190
|
+
)
|
|
191
|
+
|
|
185
192
|
return DatasetCreationResults(
|
|
186
193
|
artifact_storage=builder.artifact_storage,
|
|
187
194
|
analysis=analysis,
|
|
188
195
|
config_builder=config_builder,
|
|
196
|
+
dataset_metadata=dataset_metadata,
|
|
189
197
|
)
|
|
190
198
|
|
|
191
199
|
def preview(
|
|
@@ -211,7 +219,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
211
219
|
logger.info(f"{RandomEmoji.previewing()} Preview generation in progress")
|
|
212
220
|
|
|
213
221
|
resource_provider = self._create_resource_provider("preview-dataset", config_builder)
|
|
214
|
-
builder = self._create_dataset_builder(config_builder, resource_provider)
|
|
222
|
+
builder = self._create_dataset_builder(config_builder.build(), resource_provider)
|
|
215
223
|
|
|
216
224
|
try:
|
|
217
225
|
raw_dataset = builder.build_preview(num_records=num_records)
|
|
@@ -249,11 +257,15 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
249
257
|
):
|
|
250
258
|
logger.info(f"{RandomEmoji.success()} Preview complete!")
|
|
251
259
|
|
|
260
|
+
# Create dataset metadata from the resource provider
|
|
261
|
+
dataset_metadata = resource_provider.get_dataset_metadata()
|
|
262
|
+
|
|
252
263
|
return PreviewResults(
|
|
253
264
|
dataset=processed_dataset,
|
|
254
265
|
analysis=analysis,
|
|
255
266
|
processor_artifacts=processor_artifacts,
|
|
256
267
|
config_builder=config_builder,
|
|
268
|
+
dataset_metadata=dataset_metadata,
|
|
257
269
|
)
|
|
258
270
|
|
|
259
271
|
def validate(self, config_builder: DataDesignerConfigBuilder) -> None:
|
|
@@ -271,7 +283,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
271
283
|
InvalidConfigError: If the configuration is invalid.
|
|
272
284
|
"""
|
|
273
285
|
resource_provider = self._create_resource_provider("validate-configuration", config_builder)
|
|
274
|
-
compile_data_designer_config(config_builder, resource_provider)
|
|
286
|
+
compile_data_designer_config(config_builder.build(), resource_provider)
|
|
275
287
|
|
|
276
288
|
def get_default_model_configs(self) -> list[ModelConfig]:
|
|
277
289
|
"""Get the default model configurations.
|
|
@@ -300,34 +312,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
300
312
|
"""
|
|
301
313
|
return self._secret_resolver
|
|
302
314
|
|
|
303
|
-
def set_buffer_size(self, buffer_size: int) -> None:
|
|
304
|
-
"""Set the buffer size for dataset generation.
|
|
305
|
-
|
|
306
|
-
The buffer size controls how many records are processed in memory at once
|
|
307
|
-
during dataset generation using the `create` method. The default value is
|
|
308
|
-
set to the constant `DEFAULT_BUFFER_SIZE` defined in the data_designer module.
|
|
309
|
-
|
|
310
|
-
Args:
|
|
311
|
-
buffer_size: Number of records to process in each buffer.
|
|
312
|
-
|
|
313
|
-
Raises:
|
|
314
|
-
InvalidBufferValueError: If buffer size is less than or equal to 0.
|
|
315
|
-
"""
|
|
316
|
-
if buffer_size <= 0:
|
|
317
|
-
raise InvalidBufferValueError("Buffer size must be greater than 0.")
|
|
318
|
-
self._buffer_size = buffer_size
|
|
319
|
-
|
|
320
315
|
def set_run_config(self, run_config: RunConfig) -> None:
|
|
321
316
|
"""Set the runtime configuration for dataset generation.
|
|
322
317
|
|
|
323
318
|
Args:
|
|
324
319
|
run_config: A RunConfig instance containing runtime settings such as
|
|
325
|
-
early shutdown behavior
|
|
320
|
+
early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
|
|
321
|
+
data_designer.essentials.
|
|
326
322
|
|
|
327
323
|
Example:
|
|
328
324
|
>>> from data_designer.essentials import DataDesigner, RunConfig
|
|
329
325
|
>>> dd = DataDesigner()
|
|
330
326
|
>>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
|
|
327
|
+
|
|
328
|
+
Notes:
|
|
329
|
+
When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
|
|
330
|
+
due to error-rate thresholds. Errors are still tracked for reporting.
|
|
331
331
|
"""
|
|
332
332
|
self._run_config = run_config
|
|
333
333
|
|
|
@@ -348,14 +348,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
348
348
|
|
|
349
349
|
def _create_dataset_builder(
|
|
350
350
|
self,
|
|
351
|
-
|
|
351
|
+
data_designer_config: DataDesignerConfig,
|
|
352
352
|
resource_provider: ResourceProvider,
|
|
353
353
|
) -> ColumnWiseDatasetBuilder:
|
|
354
|
-
config = compile_data_designer_config(config_builder, resource_provider)
|
|
355
|
-
|
|
356
354
|
return ColumnWiseDatasetBuilder(
|
|
357
|
-
|
|
358
|
-
processor_configs=config.processors or [],
|
|
355
|
+
data_designer_config=data_designer_config,
|
|
359
356
|
resource_provider=resource_provider,
|
|
360
357
|
)
|
|
361
358
|
|
|
@@ -4,14 +4,18 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
9
8
|
|
|
10
9
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
11
10
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
11
|
+
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
12
12
|
from data_designer.config.utils.visualization import WithRecordSamplerMixin
|
|
13
13
|
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
|
|
14
14
|
from data_designer.engine.dataset_builders.errors import ArtifactStorageError
|
|
15
|
+
from data_designer.lazy_heavy_imports import pd
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import pandas as pd
|
|
15
19
|
|
|
16
20
|
|
|
17
21
|
class DatasetCreationResults(WithRecordSamplerMixin):
|
|
@@ -28,6 +32,7 @@ class DatasetCreationResults(WithRecordSamplerMixin):
|
|
|
28
32
|
artifact_storage: ArtifactStorage,
|
|
29
33
|
analysis: DatasetProfilerResults,
|
|
30
34
|
config_builder: DataDesignerConfigBuilder,
|
|
35
|
+
dataset_metadata: DatasetMetadata,
|
|
31
36
|
):
|
|
32
37
|
"""Creates a new instance with results based on a dataset creation run.
|
|
33
38
|
|
|
@@ -35,10 +40,12 @@ class DatasetCreationResults(WithRecordSamplerMixin):
|
|
|
35
40
|
artifact_storage: Storage manager for accessing generated artifacts.
|
|
36
41
|
analysis: Profiling results for the generated dataset.
|
|
37
42
|
config_builder: Configuration builder used to create the dataset.
|
|
43
|
+
dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
|
|
38
44
|
"""
|
|
39
45
|
self.artifact_storage = artifact_storage
|
|
40
46
|
self._analysis = analysis
|
|
41
47
|
self._config_builder = config_builder
|
|
48
|
+
self.dataset_metadata = dataset_metadata
|
|
42
49
|
|
|
43
50
|
def load_analysis(self) -> DatasetProfilerResults:
|
|
44
51
|
"""Load the profiling analysis results for the generated dataset.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Lazy imports facade for heavy third-party dependencies.
|
|
6
|
+
|
|
7
|
+
This module provides a centralized facade that lazily imports heavy dependencies
|
|
8
|
+
only when accessed, significantly improving import performance.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
from data_designer.lazy_heavy_imports import pd, np, faker, litellm
|
|
12
|
+
|
|
13
|
+
df = pd.DataFrame(...)
|
|
14
|
+
arr = np.array([1, 2, 3])
|
|
15
|
+
fake = faker.Faker()
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import importlib
|
|
21
|
+
|
|
22
|
+
# Mapping of lazy import names to their actual module paths
|
|
23
|
+
_LAZY_IMPORTS = {
|
|
24
|
+
"pd": "pandas",
|
|
25
|
+
"np": "numpy",
|
|
26
|
+
"pq": "pyarrow.parquet",
|
|
27
|
+
"pa": "pyarrow",
|
|
28
|
+
"faker": "faker",
|
|
29
|
+
"litellm": "litellm",
|
|
30
|
+
"sqlfluff": "sqlfluff",
|
|
31
|
+
"httpx": "httpx",
|
|
32
|
+
"duckdb": "duckdb",
|
|
33
|
+
"nx": "networkx",
|
|
34
|
+
"scipy": "scipy",
|
|
35
|
+
"jsonschema": "jsonschema",
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def __getattr__(name: str) -> object:
|
|
40
|
+
"""Lazily import heavy third-party dependencies when accessed.
|
|
41
|
+
|
|
42
|
+
This allows fast imports of data_designer while deferring loading of heavy
|
|
43
|
+
libraries until they're actually needed.
|
|
44
|
+
"""
|
|
45
|
+
if name in _LAZY_IMPORTS:
|
|
46
|
+
module_name = _LAZY_IMPORTS[name]
|
|
47
|
+
return importlib.import_module(module_name)
|
|
48
|
+
|
|
49
|
+
raise AttributeError(f"module 'data_designer.lazy_heavy_imports' has no attribute {name!r}")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def __dir__() -> list[str]:
|
|
53
|
+
"""Return list of available lazy imports."""
|
|
54
|
+
return list(_LAZY_IMPORTS.keys())
|
data_designer/logging.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.plugins.plugin import Plugin, PluginType
|
|
5
7
|
|
|
6
8
|
__all__ = ["Plugin", "PluginType"]
|
data_designer/plugins/errors.py
CHANGED
data_designer/plugins/plugin.py
CHANGED
|
@@ -70,7 +70,6 @@ class Plugin(BaseModel):
|
|
|
70
70
|
..., description="The fully-qualified name o the config class object, e.g. 'my_plugin.config.MyConfig'"
|
|
71
71
|
)
|
|
72
72
|
plugin_type: PluginType = Field(..., description="The type of plugin")
|
|
73
|
-
emoji: str = Field(default="🔌", description="The emoji to use in logs related to the plugin")
|
|
74
73
|
|
|
75
74
|
@property
|
|
76
75
|
def config_type_as_class_name(self) -> str:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.plugins.testing.utils import assert_valid_plugin
|
|
5
7
|
|
|
6
8
|
__all__ = [
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Literal
|
|
5
7
|
|
|
6
8
|
from data_designer.config.base import ConfigBase
|
|
7
9
|
from data_designer.config.column_configs import SingleColumnConfig
|
|
8
|
-
from data_designer.engine.
|
|
10
|
+
from data_designer.engine.column_generators.generators.base import ColumnGeneratorCellByCell
|
|
9
11
|
from data_designer.plugins.plugin import Plugin, PluginType
|
|
10
12
|
|
|
11
13
|
MODULE_NAME = __name__
|
|
@@ -18,15 +20,11 @@ class ValidTestConfig(SingleColumnConfig):
|
|
|
18
20
|
name: str
|
|
19
21
|
|
|
20
22
|
|
|
21
|
-
class ValidTestTask(
|
|
23
|
+
class ValidTestTask(ColumnGeneratorCellByCell[ValidTestConfig]):
|
|
22
24
|
"""Valid task for testing plugin creation."""
|
|
23
25
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return ConfigurableTaskMetadata(
|
|
27
|
-
name="test_generator",
|
|
28
|
-
description="Test generator",
|
|
29
|
-
)
|
|
26
|
+
def generate(self, data: dict) -> dict:
|
|
27
|
+
return data
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
class ConfigWithoutDiscriminator(ConfigBase):
|
|
@@ -53,22 +51,14 @@ class StubPluginConfigB(SingleColumnConfig):
|
|
|
53
51
|
column_type: Literal["test-plugin-b"] = "test-plugin-b"
|
|
54
52
|
|
|
55
53
|
|
|
56
|
-
class StubPluginTaskA(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
return ConfigurableTaskMetadata(
|
|
60
|
-
name="test_plugin_a",
|
|
61
|
-
description="Test plugin A",
|
|
62
|
-
)
|
|
54
|
+
class StubPluginTaskA(ColumnGeneratorCellByCell[StubPluginConfigA]):
|
|
55
|
+
def generate(self, data: dict) -> dict:
|
|
56
|
+
return data
|
|
63
57
|
|
|
64
58
|
|
|
65
|
-
class StubPluginTaskB(
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
return ConfigurableTaskMetadata(
|
|
69
|
-
name="test_plugin_b",
|
|
70
|
-
description="Test plugin B",
|
|
71
|
-
)
|
|
59
|
+
class StubPluginTaskB(ColumnGeneratorCellByCell[StubPluginConfigB]):
|
|
60
|
+
def generate(self, data: dict) -> dict:
|
|
61
|
+
return data
|
|
72
62
|
|
|
73
63
|
|
|
74
64
|
# Stub plugins requiring different combinations of resources
|
|
@@ -86,31 +76,19 @@ class StubPluginConfigBlobsAndSeeds(SingleColumnConfig):
|
|
|
86
76
|
column_type: Literal["test-plugin-blobs-and-seeds"] = "test-plugin-blobs-and-seeds"
|
|
87
77
|
|
|
88
78
|
|
|
89
|
-
class StubPluginTaskModels(
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
return ConfigurableTaskMetadata(
|
|
93
|
-
name="test_plugin_models",
|
|
94
|
-
description="Test plugin requiring models",
|
|
95
|
-
)
|
|
79
|
+
class StubPluginTaskModels(ColumnGeneratorCellByCell[StubPluginConfigModels]):
|
|
80
|
+
def generate(self, data: dict) -> dict:
|
|
81
|
+
return data
|
|
96
82
|
|
|
97
83
|
|
|
98
|
-
class StubPluginTaskModelsAndBlobs(
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
return ConfigurableTaskMetadata(
|
|
102
|
-
name="test_plugin_models_and_blobs",
|
|
103
|
-
description="Test plugin requiring models and blobs",
|
|
104
|
-
)
|
|
84
|
+
class StubPluginTaskModelsAndBlobs(ColumnGeneratorCellByCell[StubPluginConfigModelsAndBlobs]):
|
|
85
|
+
def generate(self, data: dict) -> dict:
|
|
86
|
+
return data
|
|
105
87
|
|
|
106
88
|
|
|
107
|
-
class StubPluginTaskBlobsAndSeeds(
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
return ConfigurableTaskMetadata(
|
|
111
|
-
name="test_plugin_blobs_and_seeds",
|
|
112
|
-
description="Test plugin requiring blobs and seeds",
|
|
113
|
-
)
|
|
89
|
+
class StubPluginTaskBlobsAndSeeds(ColumnGeneratorCellByCell[StubPluginConfigBlobsAndSeeds]):
|
|
90
|
+
def generate(self, data: dict) -> dict:
|
|
91
|
+
return data
|
|
114
92
|
|
|
115
93
|
|
|
116
94
|
plugin_none = Plugin(
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.base import ConfigBase
|
|
5
7
|
from data_designer.engine.configurable_task import ConfigurableTask
|
|
6
8
|
from data_designer.engine.resources.seed_reader import SeedReader
|