data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +7 -3
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +2 -0
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +7 -2
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +8 -4
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +11 -37
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +8 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +22 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +8 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +23 -17
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +5 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
- data_designer-0.3.6.dist-info/RECORD +196 -0
- data_designer-0.3.4.dist-info/RECORD +0 -194
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,22 +1,20 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from collections.abc import Sequence
|
|
6
8
|
from functools import cached_property
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
7
10
|
|
|
8
|
-
import pandas as pd
|
|
9
|
-
import pyarrow as pa
|
|
10
11
|
from pydantic import Field, field_validator
|
|
11
12
|
|
|
12
13
|
from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
|
|
13
14
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
14
15
|
from data_designer.config.base import ConfigBase
|
|
15
16
|
from data_designer.config.column_configs import SingleColumnConfig
|
|
16
|
-
from data_designer.config.column_types import
|
|
17
|
-
COLUMN_TYPE_EMOJI_MAP,
|
|
18
|
-
ColumnConfigT,
|
|
19
|
-
)
|
|
17
|
+
from data_designer.config.column_types import ColumnConfigT
|
|
20
18
|
from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
|
|
21
19
|
from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
|
|
22
20
|
from data_designer.engine.analysis.errors import DatasetProfilerConfigurationError
|
|
@@ -24,6 +22,11 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import h
|
|
|
24
22
|
from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig
|
|
25
23
|
from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
|
|
26
24
|
from data_designer.engine.resources.resource_provider import ResourceProvider
|
|
25
|
+
from data_designer.lazy_heavy_imports import pa, pd
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
import pandas as pd
|
|
29
|
+
import pyarrow as pa
|
|
27
30
|
|
|
28
31
|
logger = logging.getLogger(__name__)
|
|
29
32
|
|
|
@@ -71,7 +74,7 @@ class DataDesignerDatasetProfiler:
|
|
|
71
74
|
|
|
72
75
|
column_statistics = []
|
|
73
76
|
for c in self.config.column_configs:
|
|
74
|
-
logger.info(f" |-- {
|
|
77
|
+
logger.info(f" |-- {c.get_column_emoji()} column: '{c.name}'")
|
|
75
78
|
column_statistics.append(
|
|
76
79
|
get_column_statistics_calculator(c.column_type)(
|
|
77
80
|
column_config_with_df=ColumnConfigWithDataFrame(column_config=c, df=dataset)
|
|
@@ -81,14 +84,14 @@ class DataDesignerDatasetProfiler:
|
|
|
81
84
|
column_profiles = []
|
|
82
85
|
for profiler_config in self.config.column_profiler_configs or []:
|
|
83
86
|
profiler = self._create_column_profiler(profiler_config)
|
|
84
|
-
applicable_column_types = profiler.
|
|
87
|
+
applicable_column_types = profiler.get_applicable_column_types()
|
|
85
88
|
for c in self.config.column_configs:
|
|
86
89
|
if c.column_type in applicable_column_types:
|
|
87
90
|
params = ColumnConfigWithDataFrame(column_config=c, df=dataset)
|
|
88
91
|
column_profiles.append(profiler.profile(params))
|
|
89
92
|
if len(column_profiles) == 0:
|
|
90
93
|
logger.warning(
|
|
91
|
-
f"⚠️ No applicable column types found for the '{profiler.
|
|
94
|
+
f"⚠️ No applicable column types found for the '{profiler.name}' profiler. "
|
|
92
95
|
f"This profiler is applicable to the following column types: {applicable_column_types}"
|
|
93
96
|
)
|
|
94
97
|
|
|
@@ -5,11 +5,8 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
from numbers import Number
|
|
8
|
-
from typing import Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
|
-
import numpy as np
|
|
11
|
-
import pandas as pd
|
|
12
|
-
import pyarrow as pa
|
|
13
10
|
import tiktoken
|
|
14
11
|
|
|
15
12
|
from data_designer.config.analysis.column_statistics import (
|
|
@@ -26,6 +23,12 @@ from data_designer.engine.column_generators.utils.prompt_renderer import (
|
|
|
26
23
|
RecordBasedPromptRenderer,
|
|
27
24
|
create_response_recipe,
|
|
28
25
|
)
|
|
26
|
+
from data_designer.lazy_heavy_imports import np, pa, pd
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import pyarrow as pa
|
|
29
32
|
|
|
30
33
|
RANDOM_SEED = 42
|
|
31
34
|
MAX_PROMPT_SAMPLE_SIZE = 1000
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from collections import defaultdict
|
|
6
|
-
from typing import Any
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
10
|
from data_designer.config.analysis.column_profilers import JudgeScoreDistributions, JudgeScoreSample
|
|
11
11
|
from data_designer.config.analysis.column_statistics import (
|
|
@@ -15,6 +15,10 @@ from data_designer.config.analysis.column_statistics import (
|
|
|
15
15
|
NumericalDistribution,
|
|
16
16
|
)
|
|
17
17
|
from data_designer.config.column_configs import LLMJudgeColumnConfig
|
|
18
|
+
from data_designer.lazy_heavy_imports import pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import pandas as pd
|
|
18
22
|
|
|
19
23
|
logger = logging.getLogger(__name__)
|
|
20
24
|
|
|
@@ -9,16 +9,16 @@ from abc import ABC, abstractmethod
|
|
|
9
9
|
from enum import Enum
|
|
10
10
|
from typing import TYPE_CHECKING, overload
|
|
11
11
|
|
|
12
|
-
import
|
|
13
|
-
|
|
14
|
-
from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
|
|
12
|
+
from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
|
|
13
|
+
from data_designer.lazy_heavy_imports import pd
|
|
15
14
|
|
|
16
15
|
if TYPE_CHECKING:
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
17
18
|
from data_designer.config.models import BaseInferenceParams, ModelConfig
|
|
18
19
|
from data_designer.engine.models.facade import ModelFacade
|
|
19
20
|
from data_designer.engine.models.registry import ModelRegistry
|
|
20
21
|
|
|
21
|
-
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
23
23
|
|
|
24
24
|
|
|
@@ -27,22 +27,14 @@ class GenerationStrategy(str, Enum):
|
|
|
27
27
|
FULL_COLUMN = "full_column"
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
class GeneratorMetadata(ConfigurableTaskMetadata):
|
|
31
|
-
generation_strategy: GenerationStrategy
|
|
32
|
-
|
|
33
|
-
|
|
34
30
|
class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
|
|
35
31
|
@property
|
|
36
32
|
def can_generate_from_scratch(self) -> bool:
|
|
37
33
|
return False
|
|
38
34
|
|
|
39
|
-
@property
|
|
40
|
-
def generation_strategy(self) -> GenerationStrategy:
|
|
41
|
-
return self.metadata().generation_strategy
|
|
42
|
-
|
|
43
35
|
@staticmethod
|
|
44
36
|
@abstractmethod
|
|
45
|
-
def
|
|
37
|
+
def get_generation_strategy() -> GenerationStrategy: ...
|
|
46
38
|
|
|
47
39
|
@overload
|
|
48
40
|
@abstractmethod
|
|
@@ -103,8 +95,28 @@ class ColumnGeneratorWithModel(ColumnGeneratorWithModelRegistry[TaskConfigT], AB
|
|
|
103
95
|
return self.model_config.inference_parameters
|
|
104
96
|
|
|
105
97
|
def log_pre_generation(self) -> None:
|
|
106
|
-
logger.info(
|
|
98
|
+
logger.info(
|
|
99
|
+
f"{self.config.get_column_emoji()} {self.config.column_type} model config for column '{self.config.name}'"
|
|
100
|
+
)
|
|
107
101
|
logger.info(f" |-- model: {self.model_config.model!r}")
|
|
108
102
|
logger.info(f" |-- model alias: {self.config.model_alias!r}")
|
|
109
103
|
logger.info(f" |-- model provider: {self.get_model_provider_name(model_alias=self.config.model_alias)!r}")
|
|
110
104
|
logger.info(f" |-- inference parameters: {self.inference_parameters.format_for_display()}")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class ColumnGeneratorCellByCell(ColumnGenerator[TaskConfigT], ABC):
|
|
108
|
+
@staticmethod
|
|
109
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
110
|
+
return GenerationStrategy.CELL_BY_CELL
|
|
111
|
+
|
|
112
|
+
@abstractmethod
|
|
113
|
+
def generate(self, data: dict) -> dict: ...
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class ColumnGeneratorFullColumn(ColumnGenerator[TaskConfigT], ABC):
|
|
117
|
+
@staticmethod
|
|
118
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
119
|
+
return GenerationStrategy.FULL_COLUMN
|
|
120
|
+
|
|
121
|
+
@abstractmethod
|
|
122
|
+
def generate(self, data: pd.DataFrame) -> pd.DataFrame: ...
|
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
|
|
5
6
|
from pydantic import BaseModel, computed_field
|
|
6
7
|
|
|
7
8
|
from data_designer.config.column_configs import EmbeddingColumnConfig
|
|
8
|
-
from data_designer.engine.column_generators.generators.base import
|
|
9
|
-
ColumnGeneratorWithModel,
|
|
10
|
-
GenerationStrategy,
|
|
11
|
-
GeneratorMetadata,
|
|
12
|
-
)
|
|
9
|
+
from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
|
|
13
10
|
from data_designer.engine.processing.utils import deserialize_json_values, parse_list_string
|
|
14
11
|
|
|
15
12
|
|
|
@@ -27,12 +24,8 @@ class EmbeddingGenerationResult(BaseModel):
|
|
|
27
24
|
|
|
28
25
|
class EmbeddingCellGenerator(ColumnGeneratorWithModel[EmbeddingColumnConfig]):
|
|
29
26
|
@staticmethod
|
|
30
|
-
def
|
|
31
|
-
return
|
|
32
|
-
name="embedding_cell_generator",
|
|
33
|
-
description="Generate embeddings for a text column.",
|
|
34
|
-
generation_strategy=GenerationStrategy.CELL_BY_CELL,
|
|
35
|
-
)
|
|
27
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
28
|
+
return GenerationStrategy.CELL_BY_CELL
|
|
36
29
|
|
|
37
30
|
def generate(self, data: dict) -> dict:
|
|
38
31
|
deserialized_record = deserialize_json_values(data)
|
|
@@ -4,31 +4,22 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
9
8
|
|
|
10
9
|
from data_designer.config.column_configs import ExpressionColumnConfig
|
|
11
|
-
from data_designer.engine.column_generators.generators.base import
|
|
12
|
-
ColumnGenerator,
|
|
13
|
-
GenerationStrategy,
|
|
14
|
-
GeneratorMetadata,
|
|
15
|
-
)
|
|
10
|
+
from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
|
|
16
11
|
from data_designer.engine.column_generators.utils.errors import ExpressionTemplateRenderError
|
|
17
12
|
from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
|
|
18
13
|
from data_designer.engine.processing.utils import deserialize_json_values
|
|
14
|
+
from data_designer.lazy_heavy_imports import pd
|
|
19
15
|
|
|
20
|
-
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
import pandas as pd
|
|
21
18
|
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
22
20
|
|
|
23
|
-
class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGenerator[ExpressionColumnConfig]):
|
|
24
|
-
@staticmethod
|
|
25
|
-
def metadata() -> GeneratorMetadata:
|
|
26
|
-
return GeneratorMetadata(
|
|
27
|
-
name="expression_generator",
|
|
28
|
-
description="Generate a column from a jinja2 expression.",
|
|
29
|
-
generation_strategy=GenerationStrategy.FULL_COLUMN,
|
|
30
|
-
)
|
|
31
21
|
|
|
22
|
+
class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorFullColumn[ExpressionColumnConfig]):
|
|
32
23
|
def generate(self, data: pd.DataFrame) -> pd.DataFrame:
|
|
33
24
|
logger.info(f"🧩 Generating column `{self.config.name}` from expression")
|
|
34
25
|
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import functools
|
|
5
7
|
import logging
|
|
6
8
|
|
|
@@ -11,11 +13,7 @@ from data_designer.config.column_configs import (
|
|
|
11
13
|
LLMTextColumnConfig,
|
|
12
14
|
)
|
|
13
15
|
from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
|
|
14
|
-
from data_designer.engine.column_generators.generators.base import
|
|
15
|
-
ColumnGeneratorWithModel,
|
|
16
|
-
GenerationStrategy,
|
|
17
|
-
GeneratorMetadata,
|
|
18
|
-
)
|
|
16
|
+
from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
|
|
19
17
|
from data_designer.engine.column_generators.utils.prompt_renderer import (
|
|
20
18
|
PromptType,
|
|
21
19
|
RecordBasedPromptRenderer,
|
|
@@ -29,6 +27,10 @@ logger = logging.getLogger(__name__)
|
|
|
29
27
|
|
|
30
28
|
|
|
31
29
|
class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfigT]):
|
|
30
|
+
@staticmethod
|
|
31
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
32
|
+
return GenerationStrategy.CELL_BY_CELL
|
|
33
|
+
|
|
32
34
|
@functools.cached_property
|
|
33
35
|
def response_recipe(self) -> ResponseRecipe:
|
|
34
36
|
return create_response_recipe(self.config, self.model_config)
|
|
@@ -87,41 +89,13 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
87
89
|
return data
|
|
88
90
|
|
|
89
91
|
|
|
90
|
-
class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]):
|
|
91
|
-
@staticmethod
|
|
92
|
-
def metadata() -> GeneratorMetadata:
|
|
93
|
-
return GeneratorMetadata(
|
|
94
|
-
name="llm_text_generator",
|
|
95
|
-
description="Generate a new dataset cell from a prompt template",
|
|
96
|
-
generation_strategy=GenerationStrategy.CELL_BY_CELL,
|
|
97
|
-
)
|
|
92
|
+
class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]): ...
|
|
98
93
|
|
|
99
94
|
|
|
100
|
-
class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]):
|
|
101
|
-
@staticmethod
|
|
102
|
-
def metadata() -> GeneratorMetadata:
|
|
103
|
-
return GeneratorMetadata(
|
|
104
|
-
name="llm_code_generator",
|
|
105
|
-
description="Generate a new dataset cell from a prompt template",
|
|
106
|
-
generation_strategy=GenerationStrategy.CELL_BY_CELL,
|
|
107
|
-
)
|
|
95
|
+
class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]): ...
|
|
108
96
|
|
|
109
97
|
|
|
110
|
-
class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
|
|
111
|
-
@staticmethod
|
|
112
|
-
def metadata() -> GeneratorMetadata:
|
|
113
|
-
return GeneratorMetadata(
|
|
114
|
-
name="llm_structured_generator",
|
|
115
|
-
description="Generate a new dataset cell from a prompt template",
|
|
116
|
-
generation_strategy=GenerationStrategy.CELL_BY_CELL,
|
|
117
|
-
)
|
|
98
|
+
class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]): ...
|
|
118
99
|
|
|
119
100
|
|
|
120
|
-
class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
|
|
121
|
-
@staticmethod
|
|
122
|
-
def metadata() -> GeneratorMetadata:
|
|
123
|
-
return GeneratorMetadata(
|
|
124
|
-
name="llm_judge_generator",
|
|
125
|
-
description="Judge a new dataset cell based on a set of rubrics",
|
|
126
|
-
generation_strategy=GenerationStrategy.CELL_BY_CELL,
|
|
127
|
-
)
|
|
101
|
+
class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]): ...
|
|
@@ -6,34 +6,28 @@ from __future__ import annotations
|
|
|
6
6
|
import logging
|
|
7
7
|
import random
|
|
8
8
|
from functools import partial
|
|
9
|
-
from typing import Callable
|
|
10
|
-
|
|
11
|
-
import pandas as pd
|
|
9
|
+
from typing import TYPE_CHECKING, Callable
|
|
12
10
|
|
|
13
11
|
from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
|
|
14
|
-
from data_designer.engine.column_generators.generators.base import
|
|
15
|
-
FromScratchColumnGenerator,
|
|
16
|
-
GenerationStrategy,
|
|
17
|
-
GeneratorMetadata,
|
|
18
|
-
)
|
|
12
|
+
from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
|
|
19
13
|
from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
|
|
20
14
|
from data_designer.engine.processing.utils import concat_datasets
|
|
21
15
|
from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
|
|
22
16
|
from data_designer.engine.sampling_gen.data_sources.sources import SamplerType
|
|
23
17
|
from data_designer.engine.sampling_gen.entities.person import load_person_data_sampler
|
|
24
18
|
from data_designer.engine.sampling_gen.generator import DatasetGenerator as SamplingDatasetGenerator
|
|
19
|
+
from data_designer.lazy_heavy_imports import pd
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
import pandas as pd
|
|
25
23
|
|
|
26
24
|
logger = logging.getLogger(__name__)
|
|
27
25
|
|
|
28
26
|
|
|
29
27
|
class SamplerColumnGenerator(FromScratchColumnGenerator[SamplerMultiColumnConfig]):
|
|
30
28
|
@staticmethod
|
|
31
|
-
def
|
|
32
|
-
return
|
|
33
|
-
name="sampler_column_generator",
|
|
34
|
-
description="Generate columns using sampling-based method.",
|
|
35
|
-
generation_strategy=GenerationStrategy.FULL_COLUMN,
|
|
36
|
-
)
|
|
29
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
30
|
+
return GenerationStrategy.FULL_COLUMN
|
|
37
31
|
|
|
38
32
|
def generate(self, data: pd.DataFrame) -> pd.DataFrame:
|
|
39
33
|
df_samplers = self.generate_from_scratch(len(data))
|
|
@@ -1,24 +1,22 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
|
|
5
4
|
from __future__ import annotations
|
|
6
5
|
|
|
7
6
|
import functools
|
|
8
7
|
import logging
|
|
9
|
-
|
|
10
|
-
import duckdb
|
|
11
|
-
import pandas as pd
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
12
9
|
|
|
13
10
|
from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
|
|
14
|
-
from data_designer.engine.column_generators.generators.base import
|
|
15
|
-
FromScratchColumnGenerator,
|
|
16
|
-
GenerationStrategy,
|
|
17
|
-
GeneratorMetadata,
|
|
18
|
-
)
|
|
11
|
+
from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
|
|
19
12
|
from data_designer.engine.column_generators.utils.errors import SeedDatasetError
|
|
20
13
|
from data_designer.engine.dataset_builders.multi_column_configs import SeedDatasetMultiColumnConfig
|
|
21
14
|
from data_designer.engine.processing.utils import concat_datasets
|
|
15
|
+
from data_designer.lazy_heavy_imports import duckdb, pd
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import duckdb
|
|
19
|
+
import pandas as pd
|
|
22
20
|
|
|
23
21
|
MAX_ZERO_RECORD_RESPONSE_FACTOR = 2
|
|
24
22
|
|
|
@@ -27,12 +25,8 @@ logger = logging.getLogger(__name__)
|
|
|
27
25
|
|
|
28
26
|
class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColumnConfig]):
|
|
29
27
|
@staticmethod
|
|
30
|
-
def
|
|
31
|
-
return
|
|
32
|
-
name="seed_dataset_column_generator",
|
|
33
|
-
description="Sample columns from a seed dataset.",
|
|
34
|
-
generation_strategy=GenerationStrategy.FULL_COLUMN,
|
|
35
|
-
)
|
|
28
|
+
def get_generation_strategy() -> GenerationStrategy:
|
|
29
|
+
return GenerationStrategy.FULL_COLUMN
|
|
36
30
|
|
|
37
31
|
@property
|
|
38
32
|
def num_records_sampled(self) -> int:
|
|
@@ -4,21 +4,13 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
9
8
|
|
|
10
9
|
from data_designer.config.column_configs import ValidationColumnConfig
|
|
11
10
|
from data_designer.config.errors import InvalidConfigError
|
|
12
11
|
from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
|
|
13
|
-
from data_designer.config.validator_params import
|
|
14
|
-
|
|
15
|
-
ValidatorType,
|
|
16
|
-
)
|
|
17
|
-
from data_designer.engine.column_generators.generators.base import (
|
|
18
|
-
ColumnGenerator,
|
|
19
|
-
GenerationStrategy,
|
|
20
|
-
GeneratorMetadata,
|
|
21
|
-
)
|
|
12
|
+
from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
|
|
13
|
+
from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
|
|
22
14
|
from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
|
|
23
15
|
from data_designer.engine.errors import DataDesignerRuntimeError
|
|
24
16
|
from data_designer.engine.validators import (
|
|
@@ -29,6 +21,10 @@ from data_designer.engine.validators import (
|
|
|
29
21
|
SQLValidator,
|
|
30
22
|
ValidationResult,
|
|
31
23
|
)
|
|
24
|
+
from data_designer.lazy_heavy_imports import pd
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
import pandas as pd
|
|
32
28
|
|
|
33
29
|
logger = logging.getLogger(__name__)
|
|
34
30
|
|
|
@@ -45,15 +41,7 @@ def get_validator_from_params(validator_type: ValidatorType, validator_params: V
|
|
|
45
41
|
return LocalCallableValidator(validator_params)
|
|
46
42
|
|
|
47
43
|
|
|
48
|
-
class ValidationColumnGenerator(
|
|
49
|
-
@staticmethod
|
|
50
|
-
def metadata() -> GeneratorMetadata:
|
|
51
|
-
return GeneratorMetadata(
|
|
52
|
-
name="validate",
|
|
53
|
-
description="Validate data.",
|
|
54
|
-
generation_strategy=GenerationStrategy.FULL_COLUMN,
|
|
55
|
-
)
|
|
56
|
-
|
|
44
|
+
class ValidationColumnGenerator(ColumnGeneratorFullColumn[ValidationColumnConfig]):
|
|
57
45
|
def generate(self, data: pd.DataFrame) -> pd.DataFrame:
|
|
58
46
|
logger.info(f"🔍 Validating column {self.config.name!r} with {len(data)} records")
|
|
59
47
|
logger.info(f" |-- target columns: {self.config.target_columns}")
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.base import ConfigBase
|
|
5
7
|
from data_designer.config.column_configs import (
|
|
6
8
|
EmbeddingColumnConfig,
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
5
7
|
from data_designer.config.utils.type_helpers import resolve_string_enum
|
|
6
8
|
from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from enum import Enum
|
|
5
7
|
|
|
6
8
|
from pydantic import BaseModel, ConfigDict, Field, create_model
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
|
|
@@ -8,7 +10,7 @@ from data_designer.config.column_configs import SingleColumnConfig
|
|
|
8
10
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
9
11
|
from data_designer.config.models import ModelConfig
|
|
10
12
|
from data_designer.config.utils.code_lang import CodeLang
|
|
11
|
-
from data_designer.config.utils.misc import
|
|
13
|
+
from data_designer.config.utils.misc import extract_keywords_from_jinja2_template
|
|
12
14
|
from data_designer.config.utils.type_helpers import StrEnum
|
|
13
15
|
from data_designer.engine.column_generators.utils.errors import PromptTemplateRenderError
|
|
14
16
|
from data_designer.engine.column_generators.utils.judge_score_factory import (
|
|
@@ -56,7 +58,7 @@ class RecordBasedPromptRenderer(WithJinja2UserTemplateRendering):
|
|
|
56
58
|
dataset_variables=list(record.keys()),
|
|
57
59
|
)
|
|
58
60
|
except (UserTemplateUnsupportedFiltersError, UserTemplateError) as exc:
|
|
59
|
-
template_variables =
|
|
61
|
+
template_variables = extract_keywords_from_jinja2_template(prompt_template)
|
|
60
62
|
missing_columns = list(set(template_variables) - set(record.keys()))
|
|
61
63
|
|
|
62
64
|
error_msg = (
|
data_designer/engine/compiler.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
|
|
6
8
|
from data_designer.config.column_configs import SeedDatasetColumnConfig
|
|
7
|
-
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
8
9
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
9
10
|
from data_designer.config.errors import InvalidConfigError
|
|
10
11
|
from data_designer.engine.resources.resource_provider import ResourceProvider
|
|
@@ -14,13 +15,9 @@ from data_designer.engine.validation import ViolationLevel, rich_print_violation
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
def compile_data_designer_config(
|
|
18
|
-
config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
|
|
19
|
-
) -> DataDesignerConfig:
|
|
20
|
-
config = config_builder.build()
|
|
18
|
+
def compile_data_designer_config(config: DataDesignerConfig, resource_provider: ResourceProvider) -> DataDesignerConfig:
|
|
21
19
|
_resolve_and_add_seed_columns(config, resource_provider.seed_reader)
|
|
22
20
|
_validate(config)
|
|
23
|
-
|
|
24
21
|
return config
|
|
25
22
|
|
|
26
23
|
|
|
@@ -1,25 +1,24 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Generic, TypeVar, get_origin
|
|
4
|
+
from __future__ import annotations
|
|
7
5
|
|
|
8
|
-
|
|
6
|
+
from abc import ABC
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Generic, TypeVar, get_origin
|
|
9
9
|
|
|
10
10
|
from data_designer.config.base import ConfigBase
|
|
11
11
|
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
|
|
12
12
|
from data_designer.engine.resources.resource_provider import ResourceProvider
|
|
13
|
+
from data_designer.lazy_heavy_imports import pd
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import pandas as pd
|
|
13
17
|
|
|
14
18
|
DataT = TypeVar("DataT", dict, pd.DataFrame)
|
|
15
19
|
TaskConfigT = TypeVar("ConfigT", bound=ConfigBase)
|
|
16
20
|
|
|
17
21
|
|
|
18
|
-
class ConfigurableTaskMetadata(ConfigBase):
|
|
19
|
-
name: str
|
|
20
|
-
description: str
|
|
21
|
-
|
|
22
|
-
|
|
23
22
|
class ConfigurableTask(ABC, Generic[TaskConfigT]):
|
|
24
23
|
def __init__(self, config: TaskConfigT, resource_provider: ResourceProvider):
|
|
25
24
|
self._config = self.get_config_type().model_validate(config)
|
|
@@ -57,14 +56,14 @@ class ConfigurableTask(ABC, Generic[TaskConfigT]):
|
|
|
57
56
|
def config(self) -> TaskConfigT:
|
|
58
57
|
return self._config
|
|
59
58
|
|
|
59
|
+
@property
|
|
60
|
+
def name(self) -> str:
|
|
61
|
+
return self.__class__.__name__
|
|
62
|
+
|
|
60
63
|
@property
|
|
61
64
|
def resource_provider(self) -> ResourceProvider:
|
|
62
65
|
return self._resource_provider
|
|
63
66
|
|
|
64
|
-
@staticmethod
|
|
65
|
-
@abstractmethod
|
|
66
|
-
def metadata() -> ConfigurableTaskMetadata: ...
|
|
67
|
-
|
|
68
67
|
def _initialize(self) -> None:
|
|
69
68
|
"""An internal method for custom initialization logic, which will be called in the constructor."""
|
|
70
69
|
|