data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (173):
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/column_configs.py +77 -7
  37. data_designer/config/column_types.py +33 -36
  38. data_designer/config/dataset_builders.py +2 -0
  39. data_designer/config/default_model_settings.py +1 -0
  40. data_designer/config/errors.py +2 -0
  41. data_designer/config/exports.py +2 -0
  42. data_designer/config/interface.py +3 -2
  43. data_designer/config/models.py +7 -2
  44. data_designer/config/preview_results.py +7 -3
  45. data_designer/config/processors.py +2 -0
  46. data_designer/config/run_config.py +2 -0
  47. data_designer/config/sampler_constraints.py +2 -0
  48. data_designer/config/sampler_params.py +7 -2
  49. data_designer/config/seed.py +2 -0
  50. data_designer/config/seed_source.py +7 -2
  51. data_designer/config/seed_source_types.py +2 -0
  52. data_designer/config/utils/constants.py +2 -0
  53. data_designer/config/utils/errors.py +2 -0
  54. data_designer/config/utils/info.py +2 -0
  55. data_designer/config/utils/io_helpers.py +8 -3
  56. data_designer/config/utils/misc.py +2 -2
  57. data_designer/config/utils/numerical_helpers.py +2 -0
  58. data_designer/config/utils/type_helpers.py +2 -0
  59. data_designer/config/utils/visualization.py +8 -4
  60. data_designer/config/validator_params.py +2 -0
  61. data_designer/engine/analysis/column_profilers/base.py +9 -8
  62. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  63. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  64. data_designer/engine/analysis/column_statistics.py +5 -2
  65. data_designer/engine/analysis/dataset_profiler.py +12 -9
  66. data_designer/engine/analysis/errors.py +2 -0
  67. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  68. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  69. data_designer/engine/column_generators/generators/base.py +26 -14
  70. data_designer/engine/column_generators/generators/embedding.py +4 -11
  71. data_designer/engine/column_generators/generators/expression.py +7 -16
  72. data_designer/engine/column_generators/generators/llm_completion.py +11 -37
  73. data_designer/engine/column_generators/generators/samplers.py +8 -14
  74. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  75. data_designer/engine/column_generators/generators/validation.py +8 -20
  76. data_designer/engine/column_generators/registry.py +2 -0
  77. data_designer/engine/column_generators/utils/errors.py +2 -0
  78. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  79. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  80. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  81. data_designer/engine/compiler.py +3 -6
  82. data_designer/engine/configurable_task.py +12 -13
  83. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  84. data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
  85. data_designer/engine/dataset_builders/errors.py +2 -0
  86. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  87. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  88. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  89. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
  90. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  91. data_designer/engine/errors.py +2 -0
  92. data_designer/engine/model_provider.py +2 -0
  93. data_designer/engine/models/errors.py +23 -31
  94. data_designer/engine/models/facade.py +12 -9
  95. data_designer/engine/models/factory.py +42 -0
  96. data_designer/engine/models/litellm_overrides.py +22 -11
  97. data_designer/engine/models/parsers/errors.py +2 -0
  98. data_designer/engine/models/parsers/parser.py +2 -2
  99. data_designer/engine/models/parsers/postprocessors.py +1 -0
  100. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  101. data_designer/engine/models/parsers/types.py +2 -0
  102. data_designer/engine/models/recipes/base.py +2 -0
  103. data_designer/engine/models/recipes/response_recipes.py +2 -0
  104. data_designer/engine/models/registry.py +11 -18
  105. data_designer/engine/models/telemetry.py +6 -2
  106. data_designer/engine/processing/ginja/ast.py +2 -0
  107. data_designer/engine/processing/ginja/environment.py +2 -0
  108. data_designer/engine/processing/ginja/exceptions.py +2 -0
  109. data_designer/engine/processing/ginja/record.py +2 -0
  110. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  111. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  112. data_designer/engine/processing/gsonschema/types.py +2 -0
  113. data_designer/engine/processing/gsonschema/validators.py +10 -6
  114. data_designer/engine/processing/processors/base.py +1 -5
  115. data_designer/engine/processing/processors/drop_columns.py +7 -10
  116. data_designer/engine/processing/processors/registry.py +2 -0
  117. data_designer/engine/processing/processors/schema_transform.py +7 -10
  118. data_designer/engine/processing/utils.py +7 -3
  119. data_designer/engine/registry/base.py +2 -0
  120. data_designer/engine/registry/data_designer_registry.py +2 -0
  121. data_designer/engine/registry/errors.py +2 -0
  122. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  123. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  124. data_designer/engine/resources/managed_storage.py +2 -0
  125. data_designer/engine/resources/resource_provider.py +8 -1
  126. data_designer/engine/resources/seed_reader.py +7 -2
  127. data_designer/engine/sampling_gen/column.py +2 -0
  128. data_designer/engine/sampling_gen/constraints.py +8 -2
  129. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  130. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  131. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  132. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  133. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  134. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  135. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  136. data_designer/engine/sampling_gen/entities/person.py +2 -0
  137. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  138. data_designer/engine/sampling_gen/errors.py +2 -0
  139. data_designer/engine/sampling_gen/generator.py +5 -4
  140. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  141. data_designer/engine/sampling_gen/people_gen.py +7 -7
  142. data_designer/engine/sampling_gen/person_constants.py +2 -0
  143. data_designer/engine/sampling_gen/schema.py +5 -1
  144. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  145. data_designer/engine/sampling_gen/utils.py +7 -1
  146. data_designer/engine/secret_resolver.py +2 -0
  147. data_designer/engine/validation.py +2 -2
  148. data_designer/engine/validators/__init__.py +2 -0
  149. data_designer/engine/validators/base.py +2 -0
  150. data_designer/engine/validators/local_callable.py +7 -2
  151. data_designer/engine/validators/python.py +7 -1
  152. data_designer/engine/validators/remote.py +7 -1
  153. data_designer/engine/validators/sql.py +8 -3
  154. data_designer/errors.py +2 -0
  155. data_designer/essentials/__init__.py +2 -0
  156. data_designer/interface/data_designer.py +23 -17
  157. data_designer/interface/errors.py +2 -0
  158. data_designer/interface/results.py +5 -2
  159. data_designer/lazy_heavy_imports.py +54 -0
  160. data_designer/logging.py +2 -0
  161. data_designer/plugins/__init__.py +2 -0
  162. data_designer/plugins/errors.py +2 -0
  163. data_designer/plugins/plugin.py +0 -1
  164. data_designer/plugins/registry.py +2 -0
  165. data_designer/plugins/testing/__init__.py +2 -0
  166. data_designer/plugins/testing/stubs.py +21 -43
  167. data_designer/plugins/testing/utils.py +2 -0
  168. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
  169. data_designer-0.3.6.dist-info/RECORD +196 -0
  170. data_designer-0.3.4.dist-info/RECORD +0 -194
  171. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
  172. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
  173. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,22 +1,20 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from collections.abc import Sequence
6
8
  from functools import cached_property
9
+ from typing import TYPE_CHECKING
7
10
 
8
- import pandas as pd
9
- import pyarrow as pa
10
11
  from pydantic import Field, field_validator
11
12
 
12
13
  from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
13
14
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
14
15
  from data_designer.config.base import ConfigBase
15
16
  from data_designer.config.column_configs import SingleColumnConfig
16
- from data_designer.config.column_types import (
17
- COLUMN_TYPE_EMOJI_MAP,
18
- ColumnConfigT,
19
- )
17
+ from data_designer.config.column_types import ColumnConfigT
20
18
  from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
21
19
  from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
22
20
  from data_designer.engine.analysis.errors import DatasetProfilerConfigurationError
@@ -24,6 +22,11 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import h
24
22
  from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig
25
23
  from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
26
24
  from data_designer.engine.resources.resource_provider import ResourceProvider
25
+ from data_designer.lazy_heavy_imports import pa, pd
26
+
27
+ if TYPE_CHECKING:
28
+ import pandas as pd
29
+ import pyarrow as pa
27
30
 
28
31
  logger = logging.getLogger(__name__)
29
32
 
@@ -71,7 +74,7 @@ class DataDesignerDatasetProfiler:
71
74
 
72
75
  column_statistics = []
73
76
  for c in self.config.column_configs:
74
- logger.info(f" |-- {COLUMN_TYPE_EMOJI_MAP[c.column_type]} column: '{c.name}'")
77
+ logger.info(f" |-- {c.get_column_emoji()} column: '{c.name}'")
75
78
  column_statistics.append(
76
79
  get_column_statistics_calculator(c.column_type)(
77
80
  column_config_with_df=ColumnConfigWithDataFrame(column_config=c, df=dataset)
@@ -81,14 +84,14 @@ class DataDesignerDatasetProfiler:
81
84
  column_profiles = []
82
85
  for profiler_config in self.config.column_profiler_configs or []:
83
86
  profiler = self._create_column_profiler(profiler_config)
84
- applicable_column_types = profiler.metadata().applicable_column_types
87
+ applicable_column_types = profiler.get_applicable_column_types()
85
88
  for c in self.config.column_configs:
86
89
  if c.column_type in applicable_column_types:
87
90
  params = ColumnConfigWithDataFrame(column_config=c, df=dataset)
88
91
  column_profiles.append(profiler.profile(params))
89
92
  if len(column_profiles) == 0:
90
93
  logger.warning(
91
- f"⚠️ No applicable column types found for the '{profiler.metadata().name}' profiler. "
94
+ f"⚠️ No applicable column types found for the '{profiler.name}' profiler. "
92
95
  f"This profiler is applicable to the following column types: {applicable_column_types}"
93
96
  )
94
97
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -5,11 +5,8 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  from numbers import Number
8
- from typing import Any
8
+ from typing import TYPE_CHECKING, Any
9
9
 
10
- import numpy as np
11
- import pandas as pd
12
- import pyarrow as pa
13
10
  import tiktoken
14
11
 
15
12
  from data_designer.config.analysis.column_statistics import (
@@ -26,6 +23,12 @@ from data_designer.engine.column_generators.utils.prompt_renderer import (
26
23
  RecordBasedPromptRenderer,
27
24
  create_response_recipe,
28
25
  )
26
+ from data_designer.lazy_heavy_imports import np, pa, pd
27
+
28
+ if TYPE_CHECKING:
29
+ import numpy as np
30
+ import pandas as pd
31
+ import pyarrow as pa
29
32
 
30
33
  RANDOM_SEED = 42
31
34
  MAX_PROMPT_SAMPLE_SIZE = 1000
@@ -1,11 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from collections import defaultdict
6
- from typing import Any
7
-
8
- import pandas as pd
8
+ from typing import TYPE_CHECKING, Any
9
9
 
10
10
  from data_designer.config.analysis.column_profilers import JudgeScoreDistributions, JudgeScoreSample
11
11
  from data_designer.config.analysis.column_statistics import (
@@ -15,6 +15,10 @@ from data_designer.config.analysis.column_statistics import (
15
15
  NumericalDistribution,
16
16
  )
17
17
  from data_designer.config.column_configs import LLMJudgeColumnConfig
18
+ from data_designer.lazy_heavy_imports import pd
19
+
20
+ if TYPE_CHECKING:
21
+ import pandas as pd
18
22
 
19
23
  logger = logging.getLogger(__name__)
20
24
 
@@ -9,16 +9,16 @@ from abc import ABC, abstractmethod
9
9
  from enum import Enum
10
10
  from typing import TYPE_CHECKING, overload
11
11
 
12
- import pandas as pd
13
-
14
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
12
+ from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
13
+ from data_designer.lazy_heavy_imports import pd
15
14
 
16
15
  if TYPE_CHECKING:
16
+ import pandas as pd
17
+
17
18
  from data_designer.config.models import BaseInferenceParams, ModelConfig
18
19
  from data_designer.engine.models.facade import ModelFacade
19
20
  from data_designer.engine.models.registry import ModelRegistry
20
21
 
21
-
22
22
  logger = logging.getLogger(__name__)
23
23
 
24
24
 
@@ -27,22 +27,14 @@ class GenerationStrategy(str, Enum):
27
27
  FULL_COLUMN = "full_column"
28
28
 
29
29
 
30
- class GeneratorMetadata(ConfigurableTaskMetadata):
31
- generation_strategy: GenerationStrategy
32
-
33
-
34
30
  class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
35
31
  @property
36
32
  def can_generate_from_scratch(self) -> bool:
37
33
  return False
38
34
 
39
- @property
40
- def generation_strategy(self) -> GenerationStrategy:
41
- return self.metadata().generation_strategy
42
-
43
35
  @staticmethod
44
36
  @abstractmethod
45
- def metadata() -> GeneratorMetadata: ...
37
+ def get_generation_strategy() -> GenerationStrategy: ...
46
38
 
47
39
  @overload
48
40
  @abstractmethod
@@ -103,8 +95,28 @@ class ColumnGeneratorWithModel(ColumnGeneratorWithModelRegistry[TaskConfigT], AB
103
95
  return self.model_config.inference_parameters
104
96
 
105
97
  def log_pre_generation(self) -> None:
106
- logger.info(f"{self.config.column_type} model configuration for generating column '{self.config.name}'")
98
+ logger.info(
99
+ f"{self.config.get_column_emoji()} {self.config.column_type} model config for column '{self.config.name}'"
100
+ )
107
101
  logger.info(f" |-- model: {self.model_config.model!r}")
108
102
  logger.info(f" |-- model alias: {self.config.model_alias!r}")
109
103
  logger.info(f" |-- model provider: {self.get_model_provider_name(model_alias=self.config.model_alias)!r}")
110
104
  logger.info(f" |-- inference parameters: {self.inference_parameters.format_for_display()}")
105
+
106
+
107
+ class ColumnGeneratorCellByCell(ColumnGenerator[TaskConfigT], ABC):
108
+ @staticmethod
109
+ def get_generation_strategy() -> GenerationStrategy:
110
+ return GenerationStrategy.CELL_BY_CELL
111
+
112
+ @abstractmethod
113
+ def generate(self, data: dict) -> dict: ...
114
+
115
+
116
+ class ColumnGeneratorFullColumn(ColumnGenerator[TaskConfigT], ABC):
117
+ @staticmethod
118
+ def get_generation_strategy() -> GenerationStrategy:
119
+ return GenerationStrategy.FULL_COLUMN
120
+
121
+ @abstractmethod
122
+ def generate(self, data: pd.DataFrame) -> pd.DataFrame: ...
@@ -1,15 +1,12 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  from pydantic import BaseModel, computed_field
6
7
 
7
8
  from data_designer.config.column_configs import EmbeddingColumnConfig
8
- from data_designer.engine.column_generators.generators.base import (
9
- ColumnGeneratorWithModel,
10
- GenerationStrategy,
11
- GeneratorMetadata,
12
- )
9
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
13
10
  from data_designer.engine.processing.utils import deserialize_json_values, parse_list_string
14
11
 
15
12
 
@@ -27,12 +24,8 @@ class EmbeddingGenerationResult(BaseModel):
27
24
 
28
25
  class EmbeddingCellGenerator(ColumnGeneratorWithModel[EmbeddingColumnConfig]):
29
26
  @staticmethod
30
- def metadata() -> GeneratorMetadata:
31
- return GeneratorMetadata(
32
- name="embedding_cell_generator",
33
- description="Generate embeddings for a text column.",
34
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
35
- )
27
+ def get_generation_strategy() -> GenerationStrategy:
28
+ return GenerationStrategy.CELL_BY_CELL
36
29
 
37
30
  def generate(self, data: dict) -> dict:
38
31
  deserialized_record = deserialize_json_values(data)
@@ -4,31 +4,22 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
-
8
- import pandas as pd
7
+ from typing import TYPE_CHECKING
9
8
 
10
9
  from data_designer.config.column_configs import ExpressionColumnConfig
11
- from data_designer.engine.column_generators.generators.base import (
12
- ColumnGenerator,
13
- GenerationStrategy,
14
- GeneratorMetadata,
15
- )
10
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
16
11
  from data_designer.engine.column_generators.utils.errors import ExpressionTemplateRenderError
17
12
  from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
18
13
  from data_designer.engine.processing.utils import deserialize_json_values
14
+ from data_designer.lazy_heavy_imports import pd
19
15
 
20
- logger = logging.getLogger(__name__)
16
+ if TYPE_CHECKING:
17
+ import pandas as pd
21
18
 
19
+ logger = logging.getLogger(__name__)
22
20
 
23
- class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGenerator[ExpressionColumnConfig]):
24
- @staticmethod
25
- def metadata() -> GeneratorMetadata:
26
- return GeneratorMetadata(
27
- name="expression_generator",
28
- description="Generate a column from a jinja2 expression.",
29
- generation_strategy=GenerationStrategy.FULL_COLUMN,
30
- )
31
21
 
22
+ class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorFullColumn[ExpressionColumnConfig]):
32
23
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
33
24
  logger.info(f"🧩 Generating column `{self.config.name}` from expression")
34
25
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import functools
5
7
  import logging
6
8
 
@@ -11,11 +13,7 @@ from data_designer.config.column_configs import (
11
13
  LLMTextColumnConfig,
12
14
  )
13
15
  from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
14
- from data_designer.engine.column_generators.generators.base import (
15
- ColumnGeneratorWithModel,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
16
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
19
17
  from data_designer.engine.column_generators.utils.prompt_renderer import (
20
18
  PromptType,
21
19
  RecordBasedPromptRenderer,
@@ -29,6 +27,10 @@ logger = logging.getLogger(__name__)
29
27
 
30
28
 
31
29
  class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfigT]):
30
+ @staticmethod
31
+ def get_generation_strategy() -> GenerationStrategy:
32
+ return GenerationStrategy.CELL_BY_CELL
33
+
32
34
  @functools.cached_property
33
35
  def response_recipe(self) -> ResponseRecipe:
34
36
  return create_response_recipe(self.config, self.model_config)
@@ -87,41 +89,13 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
87
89
  return data
88
90
 
89
91
 
90
- class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]):
91
- @staticmethod
92
- def metadata() -> GeneratorMetadata:
93
- return GeneratorMetadata(
94
- name="llm_text_generator",
95
- description="Generate a new dataset cell from a prompt template",
96
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
97
- )
92
+ class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]): ...
98
93
 
99
94
 
100
- class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]):
101
- @staticmethod
102
- def metadata() -> GeneratorMetadata:
103
- return GeneratorMetadata(
104
- name="llm_code_generator",
105
- description="Generate a new dataset cell from a prompt template",
106
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
107
- )
95
+ class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]): ...
108
96
 
109
97
 
110
- class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
111
- @staticmethod
112
- def metadata() -> GeneratorMetadata:
113
- return GeneratorMetadata(
114
- name="llm_structured_generator",
115
- description="Generate a new dataset cell from a prompt template",
116
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
117
- )
98
+ class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]): ...
118
99
 
119
100
 
120
- class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
121
- @staticmethod
122
- def metadata() -> GeneratorMetadata:
123
- return GeneratorMetadata(
124
- name="llm_judge_generator",
125
- description="Judge a new dataset cell based on a set of rubrics",
126
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
127
- )
101
+ class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]): ...
@@ -6,34 +6,28 @@ from __future__ import annotations
6
6
  import logging
7
7
  import random
8
8
  from functools import partial
9
- from typing import Callable
10
-
11
- import pandas as pd
9
+ from typing import TYPE_CHECKING, Callable
12
10
 
13
11
  from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
14
- from data_designer.engine.column_generators.generators.base import (
15
- FromScratchColumnGenerator,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
12
+ from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
19
13
  from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
20
14
  from data_designer.engine.processing.utils import concat_datasets
21
15
  from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
22
16
  from data_designer.engine.sampling_gen.data_sources.sources import SamplerType
23
17
  from data_designer.engine.sampling_gen.entities.person import load_person_data_sampler
24
18
  from data_designer.engine.sampling_gen.generator import DatasetGenerator as SamplingDatasetGenerator
19
+ from data_designer.lazy_heavy_imports import pd
20
+
21
+ if TYPE_CHECKING:
22
+ import pandas as pd
25
23
 
26
24
  logger = logging.getLogger(__name__)
27
25
 
28
26
 
29
27
  class SamplerColumnGenerator(FromScratchColumnGenerator[SamplerMultiColumnConfig]):
30
28
  @staticmethod
31
- def metadata() -> GeneratorMetadata:
32
- return GeneratorMetadata(
33
- name="sampler_column_generator",
34
- description="Generate columns using sampling-based method.",
35
- generation_strategy=GenerationStrategy.FULL_COLUMN,
36
- )
29
+ def get_generation_strategy() -> GenerationStrategy:
30
+ return GenerationStrategy.FULL_COLUMN
37
31
 
38
32
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
39
33
  df_samplers = self.generate_from_scratch(len(data))
@@ -1,24 +1,22 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
-
5
4
  from __future__ import annotations
6
5
 
7
6
  import functools
8
7
  import logging
9
-
10
- import duckdb
11
- import pandas as pd
8
+ from typing import TYPE_CHECKING
12
9
 
13
10
  from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
14
- from data_designer.engine.column_generators.generators.base import (
15
- FromScratchColumnGenerator,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
11
+ from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
19
12
  from data_designer.engine.column_generators.utils.errors import SeedDatasetError
20
13
  from data_designer.engine.dataset_builders.multi_column_configs import SeedDatasetMultiColumnConfig
21
14
  from data_designer.engine.processing.utils import concat_datasets
15
+ from data_designer.lazy_heavy_imports import duckdb, pd
16
+
17
+ if TYPE_CHECKING:
18
+ import duckdb
19
+ import pandas as pd
22
20
 
23
21
  MAX_ZERO_RECORD_RESPONSE_FACTOR = 2
24
22
 
@@ -27,12 +25,8 @@ logger = logging.getLogger(__name__)
27
25
 
28
26
  class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColumnConfig]):
29
27
  @staticmethod
30
- def metadata() -> GeneratorMetadata:
31
- return GeneratorMetadata(
32
- name="seed_dataset_column_generator",
33
- description="Sample columns from a seed dataset.",
34
- generation_strategy=GenerationStrategy.FULL_COLUMN,
35
- )
28
+ def get_generation_strategy() -> GenerationStrategy:
29
+ return GenerationStrategy.FULL_COLUMN
36
30
 
37
31
  @property
38
32
  def num_records_sampled(self) -> int:
@@ -4,21 +4,13 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
-
8
- import pandas as pd
7
+ from typing import TYPE_CHECKING
9
8
 
10
9
  from data_designer.config.column_configs import ValidationColumnConfig
11
10
  from data_designer.config.errors import InvalidConfigError
12
11
  from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
13
- from data_designer.config.validator_params import (
14
- ValidatorParamsT,
15
- ValidatorType,
16
- )
17
- from data_designer.engine.column_generators.generators.base import (
18
- ColumnGenerator,
19
- GenerationStrategy,
20
- GeneratorMetadata,
21
- )
12
+ from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
13
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
22
14
  from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
23
15
  from data_designer.engine.errors import DataDesignerRuntimeError
24
16
  from data_designer.engine.validators import (
@@ -29,6 +21,10 @@ from data_designer.engine.validators import (
29
21
  SQLValidator,
30
22
  ValidationResult,
31
23
  )
24
+ from data_designer.lazy_heavy_imports import pd
25
+
26
+ if TYPE_CHECKING:
27
+ import pandas as pd
32
28
 
33
29
  logger = logging.getLogger(__name__)
34
30
 
@@ -45,15 +41,7 @@ def get_validator_from_params(validator_type: ValidatorType, validator_params: V
45
41
  return LocalCallableValidator(validator_params)
46
42
 
47
43
 
48
- class ValidationColumnGenerator(ColumnGenerator[ValidationColumnConfig]):
49
- @staticmethod
50
- def metadata() -> GeneratorMetadata:
51
- return GeneratorMetadata(
52
- name="validate",
53
- description="Validate data.",
54
- generation_strategy=GenerationStrategy.FULL_COLUMN,
55
- )
56
-
44
+ class ValidationColumnGenerator(ColumnGeneratorFullColumn[ValidationColumnConfig]):
57
45
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
58
46
  logger.info(f"🔍 Validating column {self.config.name!r} with {len(data)} records")
59
47
  logger.info(f" |-- target columns: {self.config.target_columns}")
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.base import ConfigBase
5
7
  from data_designer.config.column_configs import (
6
8
  EmbeddingColumnConfig,
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.engine.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.column_types import DataDesignerColumnType
5
7
  from data_designer.config.utils.type_helpers import resolve_string_enum
6
8
  from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
  from pydantic import BaseModel, ConfigDict, Field, create_model
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  import logging
6
8
 
@@ -8,7 +10,7 @@ from data_designer.config.column_configs import SingleColumnConfig
8
10
  from data_designer.config.column_types import DataDesignerColumnType
9
11
  from data_designer.config.models import ModelConfig
10
12
  from data_designer.config.utils.code_lang import CodeLang
11
- from data_designer.config.utils.misc import get_prompt_template_keywords
13
+ from data_designer.config.utils.misc import extract_keywords_from_jinja2_template
12
14
  from data_designer.config.utils.type_helpers import StrEnum
13
15
  from data_designer.engine.column_generators.utils.errors import PromptTemplateRenderError
14
16
  from data_designer.engine.column_generators.utils.judge_score_factory import (
@@ -56,7 +58,7 @@ class RecordBasedPromptRenderer(WithJinja2UserTemplateRendering):
56
58
  dataset_variables=list(record.keys()),
57
59
  )
58
60
  except (UserTemplateUnsupportedFiltersError, UserTemplateError) as exc:
59
- template_variables = get_prompt_template_keywords(prompt_template)
61
+ template_variables = extract_keywords_from_jinja2_template(prompt_template)
60
62
  missing_columns = list(set(template_variables) - set(record.keys()))
61
63
 
62
64
  error_msg = (
@@ -1,10 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
 
6
8
  from data_designer.config.column_configs import SeedDatasetColumnConfig
7
- from data_designer.config.config_builder import DataDesignerConfigBuilder
8
9
  from data_designer.config.data_designer_config import DataDesignerConfig
9
10
  from data_designer.config.errors import InvalidConfigError
10
11
  from data_designer.engine.resources.resource_provider import ResourceProvider
@@ -14,13 +15,9 @@ from data_designer.engine.validation import ViolationLevel, rich_print_violation
14
15
  logger = logging.getLogger(__name__)
15
16
 
16
17
 
17
- def compile_data_designer_config(
18
- config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
19
- ) -> DataDesignerConfig:
20
- config = config_builder.build()
18
+ def compile_data_designer_config(config: DataDesignerConfig, resource_provider: ResourceProvider) -> DataDesignerConfig:
21
19
  _resolve_and_add_seed_columns(config, resource_provider.seed_reader)
22
20
  _validate(config)
23
-
24
21
  return config
25
22
 
26
23
 
@@ -1,25 +1,24 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- from abc import ABC, abstractmethod
5
- from pathlib import Path
6
- from typing import Generic, TypeVar, get_origin
4
+ from __future__ import annotations
7
5
 
8
- import pandas as pd
6
+ from abc import ABC
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Generic, TypeVar, get_origin
9
9
 
10
10
  from data_designer.config.base import ConfigBase
11
11
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
12
12
  from data_designer.engine.resources.resource_provider import ResourceProvider
13
+ from data_designer.lazy_heavy_imports import pd
14
+
15
+ if TYPE_CHECKING:
16
+ import pandas as pd
13
17
 
14
18
  DataT = TypeVar("DataT", dict, pd.DataFrame)
15
19
  TaskConfigT = TypeVar("ConfigT", bound=ConfigBase)
16
20
 
17
21
 
18
- class ConfigurableTaskMetadata(ConfigBase):
19
- name: str
20
- description: str
21
-
22
-
23
22
  class ConfigurableTask(ABC, Generic[TaskConfigT]):
24
23
  def __init__(self, config: TaskConfigT, resource_provider: ResourceProvider):
25
24
  self._config = self.get_config_type().model_validate(config)
@@ -57,14 +56,14 @@ class ConfigurableTask(ABC, Generic[TaskConfigT]):
57
56
  def config(self) -> TaskConfigT:
58
57
  return self._config
59
58
 
59
+ @property
60
+ def name(self) -> str:
61
+ return self.__class__.__name__
62
+
60
63
  @property
61
64
  def resource_provider(self) -> ResourceProvider:
62
65
  return self._resource_provider
63
66
 
64
- @staticmethod
65
- @abstractmethod
66
- def metadata() -> ConfigurableTaskMetadata: ...
67
-
68
67
  def _initialize(self) -> None:
69
68
  """An internal method for custom initialization logic, which will be called in the constructor."""
70
69