data-designer 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176) hide show
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/base.py +1 -0
  37. data_designer/config/column_configs.py +77 -7
  38. data_designer/config/column_types.py +33 -36
  39. data_designer/config/dataset_builders.py +2 -0
  40. data_designer/config/dataset_metadata.py +18 -0
  41. data_designer/config/default_model_settings.py +1 -0
  42. data_designer/config/errors.py +2 -0
  43. data_designer/config/exports.py +2 -0
  44. data_designer/config/interface.py +3 -2
  45. data_designer/config/models.py +7 -2
  46. data_designer/config/preview_results.py +9 -1
  47. data_designer/config/processors.py +2 -0
  48. data_designer/config/run_config.py +19 -5
  49. data_designer/config/sampler_constraints.py +2 -0
  50. data_designer/config/sampler_params.py +7 -2
  51. data_designer/config/seed.py +2 -0
  52. data_designer/config/seed_source.py +9 -3
  53. data_designer/config/seed_source_types.py +2 -0
  54. data_designer/config/utils/constants.py +2 -0
  55. data_designer/config/utils/errors.py +2 -0
  56. data_designer/config/utils/info.py +2 -0
  57. data_designer/config/utils/io_helpers.py +8 -3
  58. data_designer/config/utils/misc.py +2 -2
  59. data_designer/config/utils/numerical_helpers.py +2 -0
  60. data_designer/config/utils/type_helpers.py +2 -0
  61. data_designer/config/utils/visualization.py +19 -11
  62. data_designer/config/validator_params.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +9 -8
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  65. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  66. data_designer/engine/analysis/column_statistics.py +5 -2
  67. data_designer/engine/analysis/dataset_profiler.py +12 -9
  68. data_designer/engine/analysis/errors.py +2 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  70. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  71. data_designer/engine/column_generators/generators/base.py +26 -14
  72. data_designer/engine/column_generators/generators/embedding.py +4 -11
  73. data_designer/engine/column_generators/generators/expression.py +7 -16
  74. data_designer/engine/column_generators/generators/llm_completion.py +13 -47
  75. data_designer/engine/column_generators/generators/samplers.py +8 -14
  76. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  77. data_designer/engine/column_generators/generators/validation.py +9 -20
  78. data_designer/engine/column_generators/registry.py +2 -0
  79. data_designer/engine/column_generators/utils/errors.py +2 -0
  80. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  83. data_designer/engine/compiler.py +3 -6
  84. data_designer/engine/configurable_task.py +12 -13
  85. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  86. data_designer/engine/dataset_builders/column_wise_builder.py +34 -35
  87. data_designer/engine/dataset_builders/errors.py +2 -0
  88. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +13 -4
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +35 -25
  93. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  94. data_designer/engine/errors.py +2 -0
  95. data_designer/engine/model_provider.py +2 -0
  96. data_designer/engine/models/errors.py +23 -31
  97. data_designer/engine/models/facade.py +12 -9
  98. data_designer/engine/models/factory.py +42 -0
  99. data_designer/engine/models/litellm_overrides.py +16 -11
  100. data_designer/engine/models/parsers/errors.py +2 -0
  101. data_designer/engine/models/parsers/parser.py +2 -2
  102. data_designer/engine/models/parsers/postprocessors.py +1 -0
  103. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  104. data_designer/engine/models/parsers/types.py +2 -0
  105. data_designer/engine/models/recipes/base.py +2 -0
  106. data_designer/engine/models/recipes/response_recipes.py +2 -0
  107. data_designer/engine/models/registry.py +11 -18
  108. data_designer/engine/models/telemetry.py +6 -2
  109. data_designer/engine/processing/ginja/ast.py +2 -0
  110. data_designer/engine/processing/ginja/environment.py +2 -0
  111. data_designer/engine/processing/ginja/exceptions.py +2 -0
  112. data_designer/engine/processing/ginja/record.py +2 -0
  113. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  114. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  115. data_designer/engine/processing/gsonschema/types.py +2 -0
  116. data_designer/engine/processing/gsonschema/validators.py +10 -6
  117. data_designer/engine/processing/processors/base.py +1 -5
  118. data_designer/engine/processing/processors/drop_columns.py +7 -10
  119. data_designer/engine/processing/processors/registry.py +2 -0
  120. data_designer/engine/processing/processors/schema_transform.py +7 -10
  121. data_designer/engine/processing/utils.py +7 -3
  122. data_designer/engine/registry/base.py +2 -0
  123. data_designer/engine/registry/data_designer_registry.py +2 -0
  124. data_designer/engine/registry/errors.py +2 -0
  125. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  126. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  127. data_designer/engine/resources/managed_storage.py +2 -0
  128. data_designer/engine/resources/resource_provider.py +20 -1
  129. data_designer/engine/resources/seed_reader.py +7 -2
  130. data_designer/engine/sampling_gen/column.py +2 -0
  131. data_designer/engine/sampling_gen/constraints.py +8 -2
  132. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  133. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  134. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  135. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  136. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  137. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  138. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  139. data_designer/engine/sampling_gen/entities/person.py +2 -0
  140. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  141. data_designer/engine/sampling_gen/errors.py +2 -0
  142. data_designer/engine/sampling_gen/generator.py +5 -4
  143. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  144. data_designer/engine/sampling_gen/people_gen.py +7 -7
  145. data_designer/engine/sampling_gen/person_constants.py +2 -0
  146. data_designer/engine/sampling_gen/schema.py +5 -1
  147. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  148. data_designer/engine/sampling_gen/utils.py +7 -1
  149. data_designer/engine/secret_resolver.py +2 -0
  150. data_designer/engine/validation.py +2 -2
  151. data_designer/engine/validators/__init__.py +2 -0
  152. data_designer/engine/validators/base.py +2 -0
  153. data_designer/engine/validators/local_callable.py +7 -2
  154. data_designer/engine/validators/python.py +7 -1
  155. data_designer/engine/validators/remote.py +7 -1
  156. data_designer/engine/validators/sql.py +8 -3
  157. data_designer/errors.py +2 -0
  158. data_designer/essentials/__init__.py +2 -0
  159. data_designer/interface/data_designer.py +36 -39
  160. data_designer/interface/errors.py +2 -0
  161. data_designer/interface/results.py +9 -2
  162. data_designer/lazy_heavy_imports.py +54 -0
  163. data_designer/logging.py +2 -0
  164. data_designer/plugins/__init__.py +2 -0
  165. data_designer/plugins/errors.py +2 -0
  166. data_designer/plugins/plugin.py +0 -1
  167. data_designer/plugins/registry.py +2 -0
  168. data_designer/plugins/testing/__init__.py +2 -0
  169. data_designer/plugins/testing/stubs.py +21 -43
  170. data_designer/plugins/testing/utils.py +2 -0
  171. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/METADATA +19 -4
  172. data_designer-0.3.5.dist-info/RECORD +196 -0
  173. data_designer-0.3.3.dist-info/RECORD +0 -193
  174. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/WHEEL +0 -0
  175. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/entry_points.txt +0 -0
  176. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/licenses/LICENSE +0 -0
@@ -5,44 +5,41 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  import random
8
+ from typing import TYPE_CHECKING
8
9
 
9
10
  from data_designer.config.analysis.column_profilers import (
10
11
  JudgeScoreProfilerConfig,
11
12
  JudgeScoreProfilerResults,
12
- JudgeScoreSample,
13
13
  JudgeScoreSummary,
14
14
  )
15
15
  from data_designer.config.analysis.column_statistics import (
16
- CategoricalDistribution,
17
- CategoricalHistogramData,
18
16
  ColumnDistributionType,
19
17
  MissingValue,
20
- NumericalDistribution,
21
- )
22
- from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP, DataDesignerColumnType
23
- from data_designer.engine.analysis.column_profilers.base import (
24
- ColumnConfigWithDataFrame,
25
- ColumnProfiler,
26
- ColumnProfilerMetadata,
27
18
  )
19
+ from data_designer.config.column_types import DataDesignerColumnType
20
+ from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
28
21
  from data_designer.engine.analysis.utils.judge_score_processing import (
29
22
  extract_judge_score_distributions,
30
23
  sample_scores_and_reasoning,
31
24
  )
32
- from data_designer.engine.models.facade import ModelFacade
33
25
  from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe
34
26
 
27
+ if TYPE_CHECKING:
28
+ from data_designer.config.analysis.column_profilers import JudgeScoreSample
29
+ from data_designer.config.analysis.column_statistics import (
30
+ CategoricalDistribution,
31
+ CategoricalHistogramData,
32
+ NumericalDistribution,
33
+ )
34
+ from data_designer.engine.models.facade import ModelFacade
35
+
35
36
  logger = logging.getLogger(__name__)
36
37
 
37
38
 
38
39
  class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
39
40
  @staticmethod
40
- def metadata() -> ColumnProfilerMetadata:
41
- return ColumnProfilerMetadata(
42
- name="judge_score_profiler",
43
- description="Analyzes LLM-as-judge score distributions in a Data Designer dataset.",
44
- applicable_column_types=[DataDesignerColumnType.LLM_JUDGE],
45
- )
41
+ def get_applicable_column_types() -> list[DataDesignerColumnType]:
42
+ return [DataDesignerColumnType.LLM_JUDGE]
46
43
 
47
44
  def get_model(self, model_alias: str) -> ModelFacade:
48
45
  return self.resource_provider.model_registry.get_model(model_alias=model_alias)
@@ -51,8 +48,7 @@ class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
51
48
  column_config, df = column_config_with_df.as_tuple()
52
49
 
53
50
  logger.info(
54
- f"{COLUMN_TYPE_EMOJI_MAP[column_config.column_type]} Analyzing LLM-as-judge "
55
- f"scores for column: '{column_config.name}'"
51
+ f"{column_config.get_column_emoji()} Analyzing LLM-as-judge scores for column: '{column_config.name}'"
56
52
  )
57
53
 
58
54
  score_summaries = {}
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.analysis.column_profilers import ColumnProfilerType
5
7
  from data_designer.config.base import ConfigBase
6
8
  from data_designer.engine.analysis.column_profilers.base import ColumnProfiler
@@ -4,9 +4,8 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
- from typing import Any, TypeAlias
7
+ from typing import TYPE_CHECKING, Any, TypeAlias
8
8
 
9
- import pandas as pd
10
9
  from pydantic import BaseModel
11
10
  from typing_extensions import Self
12
11
 
@@ -25,6 +24,10 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import (
25
24
  calculate_token_stats,
26
25
  calculate_validation_column_info,
27
26
  )
27
+ from data_designer.lazy_heavy_imports import pd
28
+
29
+ if TYPE_CHECKING:
30
+ import pandas as pd
28
31
 
29
32
  logger = logging.getLogger(__name__)
30
33
 
@@ -1,22 +1,20 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from collections.abc import Sequence
6
8
  from functools import cached_property
9
+ from typing import TYPE_CHECKING
7
10
 
8
- import pandas as pd
9
- import pyarrow as pa
10
11
  from pydantic import Field, field_validator
11
12
 
12
13
  from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT
13
14
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
14
15
  from data_designer.config.base import ConfigBase
15
16
  from data_designer.config.column_configs import SingleColumnConfig
16
- from data_designer.config.column_types import (
17
- COLUMN_TYPE_EMOJI_MAP,
18
- ColumnConfigT,
19
- )
17
+ from data_designer.config.column_types import ColumnConfigT
20
18
  from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
21
19
  from data_designer.engine.analysis.column_statistics import get_column_statistics_calculator
22
20
  from data_designer.engine.analysis.errors import DatasetProfilerConfigurationError
@@ -24,6 +22,11 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import h
24
22
  from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig
25
23
  from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
26
24
  from data_designer.engine.resources.resource_provider import ResourceProvider
25
+ from data_designer.lazy_heavy_imports import pa, pd
26
+
27
+ if TYPE_CHECKING:
28
+ import pandas as pd
29
+ import pyarrow as pa
27
30
 
28
31
  logger = logging.getLogger(__name__)
29
32
 
@@ -71,7 +74,7 @@ class DataDesignerDatasetProfiler:
71
74
 
72
75
  column_statistics = []
73
76
  for c in self.config.column_configs:
74
- logger.info(f" |-- {COLUMN_TYPE_EMOJI_MAP[c.column_type]} column: '{c.name}'")
77
+ logger.info(f" |-- {c.get_column_emoji()} column: '{c.name}'")
75
78
  column_statistics.append(
76
79
  get_column_statistics_calculator(c.column_type)(
77
80
  column_config_with_df=ColumnConfigWithDataFrame(column_config=c, df=dataset)
@@ -81,14 +84,14 @@ class DataDesignerDatasetProfiler:
81
84
  column_profiles = []
82
85
  for profiler_config in self.config.column_profiler_configs or []:
83
86
  profiler = self._create_column_profiler(profiler_config)
84
- applicable_column_types = profiler.metadata().applicable_column_types
87
+ applicable_column_types = profiler.get_applicable_column_types()
85
88
  for c in self.config.column_configs:
86
89
  if c.column_type in applicable_column_types:
87
90
  params = ColumnConfigWithDataFrame(column_config=c, df=dataset)
88
91
  column_profiles.append(profiler.profile(params))
89
92
  if len(column_profiles) == 0:
90
93
  logger.warning(
91
- f"⚠️ No applicable column types found for the '{profiler.metadata().name}' profiler. "
94
+ f"⚠️ No applicable column types found for the '{profiler.name}' profiler. "
92
95
  f"This profiler is applicable to the following column types: {applicable_column_types}"
93
96
  )
94
97
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -5,11 +5,8 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  from numbers import Number
8
- from typing import Any
8
+ from typing import TYPE_CHECKING, Any
9
9
 
10
- import numpy as np
11
- import pandas as pd
12
- import pyarrow as pa
13
10
  import tiktoken
14
11
 
15
12
  from data_designer.config.analysis.column_statistics import (
@@ -26,6 +23,12 @@ from data_designer.engine.column_generators.utils.prompt_renderer import (
26
23
  RecordBasedPromptRenderer,
27
24
  create_response_recipe,
28
25
  )
26
+ from data_designer.lazy_heavy_imports import np, pa, pd
27
+
28
+ if TYPE_CHECKING:
29
+ import numpy as np
30
+ import pandas as pd
31
+ import pyarrow as pa
29
32
 
30
33
  RANDOM_SEED = 42
31
34
  MAX_PROMPT_SAMPLE_SIZE = 1000
@@ -1,11 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from collections import defaultdict
6
- from typing import Any
7
-
8
- import pandas as pd
8
+ from typing import TYPE_CHECKING, Any
9
9
 
10
10
  from data_designer.config.analysis.column_profilers import JudgeScoreDistributions, JudgeScoreSample
11
11
  from data_designer.config.analysis.column_statistics import (
@@ -15,6 +15,10 @@ from data_designer.config.analysis.column_statistics import (
15
15
  NumericalDistribution,
16
16
  )
17
17
  from data_designer.config.column_configs import LLMJudgeColumnConfig
18
+ from data_designer.lazy_heavy_imports import pd
19
+
20
+ if TYPE_CHECKING:
21
+ import pandas as pd
18
22
 
19
23
  logger = logging.getLogger(__name__)
20
24
 
@@ -9,16 +9,16 @@ from abc import ABC, abstractmethod
9
9
  from enum import Enum
10
10
  from typing import TYPE_CHECKING, overload
11
11
 
12
- import pandas as pd
13
-
14
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
12
+ from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
13
+ from data_designer.lazy_heavy_imports import pd
15
14
 
16
15
  if TYPE_CHECKING:
16
+ import pandas as pd
17
+
17
18
  from data_designer.config.models import BaseInferenceParams, ModelConfig
18
19
  from data_designer.engine.models.facade import ModelFacade
19
20
  from data_designer.engine.models.registry import ModelRegistry
20
21
 
21
-
22
22
  logger = logging.getLogger(__name__)
23
23
 
24
24
 
@@ -27,22 +27,14 @@ class GenerationStrategy(str, Enum):
27
27
  FULL_COLUMN = "full_column"
28
28
 
29
29
 
30
- class GeneratorMetadata(ConfigurableTaskMetadata):
31
- generation_strategy: GenerationStrategy
32
-
33
-
34
30
  class ColumnGenerator(ConfigurableTask[TaskConfigT], ABC):
35
31
  @property
36
32
  def can_generate_from_scratch(self) -> bool:
37
33
  return False
38
34
 
39
- @property
40
- def generation_strategy(self) -> GenerationStrategy:
41
- return self.metadata().generation_strategy
42
-
43
35
  @staticmethod
44
36
  @abstractmethod
45
- def metadata() -> GeneratorMetadata: ...
37
+ def get_generation_strategy() -> GenerationStrategy: ...
46
38
 
47
39
  @overload
48
40
  @abstractmethod
@@ -103,8 +95,28 @@ class ColumnGeneratorWithModel(ColumnGeneratorWithModelRegistry[TaskConfigT], AB
103
95
  return self.model_config.inference_parameters
104
96
 
105
97
  def log_pre_generation(self) -> None:
106
- logger.info(f"{self.config.column_type} model configuration for generating column '{self.config.name}'")
98
+ logger.info(
99
+ f"{self.config.get_column_emoji()} {self.config.column_type} model config for column '{self.config.name}'"
100
+ )
107
101
  logger.info(f" |-- model: {self.model_config.model!r}")
108
102
  logger.info(f" |-- model alias: {self.config.model_alias!r}")
109
103
  logger.info(f" |-- model provider: {self.get_model_provider_name(model_alias=self.config.model_alias)!r}")
110
104
  logger.info(f" |-- inference parameters: {self.inference_parameters.format_for_display()}")
105
+
106
+
107
+ class ColumnGeneratorCellByCell(ColumnGenerator[TaskConfigT], ABC):
108
+ @staticmethod
109
+ def get_generation_strategy() -> GenerationStrategy:
110
+ return GenerationStrategy.CELL_BY_CELL
111
+
112
+ @abstractmethod
113
+ def generate(self, data: dict) -> dict: ...
114
+
115
+
116
+ class ColumnGeneratorFullColumn(ColumnGenerator[TaskConfigT], ABC):
117
+ @staticmethod
118
+ def get_generation_strategy() -> GenerationStrategy:
119
+ return GenerationStrategy.FULL_COLUMN
120
+
121
+ @abstractmethod
122
+ def generate(self, data: pd.DataFrame) -> pd.DataFrame: ...
@@ -1,15 +1,12 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  from pydantic import BaseModel, computed_field
6
7
 
7
8
  from data_designer.config.column_configs import EmbeddingColumnConfig
8
- from data_designer.engine.column_generators.generators.base import (
9
- ColumnGeneratorWithModel,
10
- GenerationStrategy,
11
- GeneratorMetadata,
12
- )
9
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
13
10
  from data_designer.engine.processing.utils import deserialize_json_values, parse_list_string
14
11
 
15
12
 
@@ -27,12 +24,8 @@ class EmbeddingGenerationResult(BaseModel):
27
24
 
28
25
  class EmbeddingCellGenerator(ColumnGeneratorWithModel[EmbeddingColumnConfig]):
29
26
  @staticmethod
30
- def metadata() -> GeneratorMetadata:
31
- return GeneratorMetadata(
32
- name="embedding_cell_generator",
33
- description="Generate embeddings for a text column.",
34
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
35
- )
27
+ def get_generation_strategy() -> GenerationStrategy:
28
+ return GenerationStrategy.CELL_BY_CELL
36
29
 
37
30
  def generate(self, data: dict) -> dict:
38
31
  deserialized_record = deserialize_json_values(data)
@@ -4,31 +4,22 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
-
8
- import pandas as pd
7
+ from typing import TYPE_CHECKING
9
8
 
10
9
  from data_designer.config.column_configs import ExpressionColumnConfig
11
- from data_designer.engine.column_generators.generators.base import (
12
- ColumnGenerator,
13
- GenerationStrategy,
14
- GeneratorMetadata,
15
- )
10
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
16
11
  from data_designer.engine.column_generators.utils.errors import ExpressionTemplateRenderError
17
12
  from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
18
13
  from data_designer.engine.processing.utils import deserialize_json_values
14
+ from data_designer.lazy_heavy_imports import pd
19
15
 
20
- logger = logging.getLogger(__name__)
16
+ if TYPE_CHECKING:
17
+ import pandas as pd
21
18
 
19
+ logger = logging.getLogger(__name__)
22
20
 
23
- class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGenerator[ExpressionColumnConfig]):
24
- @staticmethod
25
- def metadata() -> GeneratorMetadata:
26
- return GeneratorMetadata(
27
- name="expression_generator",
28
- description="Generate a column from a jinja2 expression.",
29
- generation_strategy=GenerationStrategy.FULL_COLUMN,
30
- )
31
21
 
22
+ class ExpressionColumnGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorFullColumn[ExpressionColumnConfig]):
32
23
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
33
24
  logger.info(f"🧩 Generating column `{self.config.name}` from expression")
34
25
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import functools
5
7
  import logging
6
8
 
@@ -11,11 +13,7 @@ from data_designer.config.column_configs import (
11
13
  LLMTextColumnConfig,
12
14
  )
13
15
  from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX
14
- from data_designer.engine.column_generators.generators.base import (
15
- ColumnGeneratorWithModel,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
16
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy
19
17
  from data_designer.engine.column_generators.utils.prompt_renderer import (
20
18
  PromptType,
21
19
  RecordBasedPromptRenderer,
@@ -28,22 +26,22 @@ from data_designer.engine.processing.utils import deserialize_json_values
28
26
  logger = logging.getLogger(__name__)
29
27
 
30
28
 
31
- DEFAULT_MAX_CONVERSATION_RESTARTS = 5
32
- DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0
33
-
34
-
35
29
  class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfigT]):
30
+ @staticmethod
31
+ def get_generation_strategy() -> GenerationStrategy:
32
+ return GenerationStrategy.CELL_BY_CELL
33
+
36
34
  @functools.cached_property
37
35
  def response_recipe(self) -> ResponseRecipe:
38
36
  return create_response_recipe(self.config, self.model_config)
39
37
 
40
38
  @property
41
39
  def max_conversation_correction_steps(self) -> int:
42
- return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS
40
+ return self.resource_provider.run_config.max_conversation_correction_steps
43
41
 
44
42
  @property
45
43
  def max_conversation_restarts(self) -> int:
46
- return DEFAULT_MAX_CONVERSATION_RESTARTS
44
+ return self.resource_provider.run_config.max_conversation_restarts
47
45
 
48
46
  @functools.cached_property
49
47
  def prompt_renderer(self) -> RecordBasedPromptRenderer:
@@ -91,45 +89,13 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
91
89
  return data
92
90
 
93
91
 
94
- class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]):
95
- @staticmethod
96
- def metadata() -> GeneratorMetadata:
97
- return GeneratorMetadata(
98
- name="llm_text_generator",
99
- description="Generate a new dataset cell from a prompt template",
100
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
101
- )
92
+ class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]): ...
102
93
 
103
94
 
104
- class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]):
105
- @staticmethod
106
- def metadata() -> GeneratorMetadata:
107
- return GeneratorMetadata(
108
- name="llm_code_generator",
109
- description="Generate a new dataset cell from a prompt template",
110
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
111
- )
112
-
95
+ class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]): ...
113
96
 
114
- class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
115
- @staticmethod
116
- def metadata() -> GeneratorMetadata:
117
- return GeneratorMetadata(
118
- name="llm_structured_generator",
119
- description="Generate a new dataset cell from a prompt template",
120
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
121
- )
122
97
 
98
+ class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]): ...
123
99
 
124
- class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
125
- @staticmethod
126
- def metadata() -> GeneratorMetadata:
127
- return GeneratorMetadata(
128
- name="llm_judge_generator",
129
- description="Judge a new dataset cell based on a set of rubrics",
130
- generation_strategy=GenerationStrategy.CELL_BY_CELL,
131
- )
132
100
 
133
- @property
134
- def max_conversation_restarts(self) -> int:
135
- return 2 * DEFAULT_MAX_CONVERSATION_RESTARTS
101
+ class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]): ...
@@ -6,34 +6,28 @@ from __future__ import annotations
6
6
  import logging
7
7
  import random
8
8
  from functools import partial
9
- from typing import Callable
10
-
11
- import pandas as pd
9
+ from typing import TYPE_CHECKING, Callable
12
10
 
13
11
  from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
14
- from data_designer.engine.column_generators.generators.base import (
15
- FromScratchColumnGenerator,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
12
+ from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
19
13
  from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
20
14
  from data_designer.engine.processing.utils import concat_datasets
21
15
  from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
22
16
  from data_designer.engine.sampling_gen.data_sources.sources import SamplerType
23
17
  from data_designer.engine.sampling_gen.entities.person import load_person_data_sampler
24
18
  from data_designer.engine.sampling_gen.generator import DatasetGenerator as SamplingDatasetGenerator
19
+ from data_designer.lazy_heavy_imports import pd
20
+
21
+ if TYPE_CHECKING:
22
+ import pandas as pd
25
23
 
26
24
  logger = logging.getLogger(__name__)
27
25
 
28
26
 
29
27
  class SamplerColumnGenerator(FromScratchColumnGenerator[SamplerMultiColumnConfig]):
30
28
  @staticmethod
31
- def metadata() -> GeneratorMetadata:
32
- return GeneratorMetadata(
33
- name="sampler_column_generator",
34
- description="Generate columns using sampling-based method.",
35
- generation_strategy=GenerationStrategy.FULL_COLUMN,
36
- )
29
+ def get_generation_strategy() -> GenerationStrategy:
30
+ return GenerationStrategy.FULL_COLUMN
37
31
 
38
32
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
39
33
  df_samplers = self.generate_from_scratch(len(data))
@@ -1,24 +1,22 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
-
5
4
  from __future__ import annotations
6
5
 
7
6
  import functools
8
7
  import logging
9
-
10
- import duckdb
11
- import pandas as pd
8
+ from typing import TYPE_CHECKING
12
9
 
13
10
  from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy
14
- from data_designer.engine.column_generators.generators.base import (
15
- FromScratchColumnGenerator,
16
- GenerationStrategy,
17
- GeneratorMetadata,
18
- )
11
+ from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy
19
12
  from data_designer.engine.column_generators.utils.errors import SeedDatasetError
20
13
  from data_designer.engine.dataset_builders.multi_column_configs import SeedDatasetMultiColumnConfig
21
14
  from data_designer.engine.processing.utils import concat_datasets
15
+ from data_designer.lazy_heavy_imports import duckdb, pd
16
+
17
+ if TYPE_CHECKING:
18
+ import duckdb
19
+ import pandas as pd
22
20
 
23
21
  MAX_ZERO_RECORD_RESPONSE_FACTOR = 2
24
22
 
@@ -27,12 +25,8 @@ logger = logging.getLogger(__name__)
27
25
 
28
26
  class SeedDatasetColumnGenerator(FromScratchColumnGenerator[SeedDatasetMultiColumnConfig]):
29
27
  @staticmethod
30
- def metadata() -> GeneratorMetadata:
31
- return GeneratorMetadata(
32
- name="seed_dataset_column_generator",
33
- description="Sample columns from a seed dataset.",
34
- generation_strategy=GenerationStrategy.FULL_COLUMN,
35
- )
28
+ def get_generation_strategy() -> GenerationStrategy:
29
+ return GenerationStrategy.FULL_COLUMN
36
30
 
37
31
  @property
38
32
  def num_records_sampled(self) -> int:
@@ -4,21 +4,13 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
-
8
- import pandas as pd
7
+ from typing import TYPE_CHECKING
9
8
 
10
9
  from data_designer.config.column_configs import ValidationColumnConfig
11
10
  from data_designer.config.errors import InvalidConfigError
12
11
  from data_designer.config.utils.code_lang import SQL_DIALECTS, CodeLang
13
- from data_designer.config.validator_params import (
14
- ValidatorParamsT,
15
- ValidatorType,
16
- )
17
- from data_designer.engine.column_generators.generators.base import (
18
- ColumnGenerator,
19
- GenerationStrategy,
20
- GeneratorMetadata,
21
- )
12
+ from data_designer.config.validator_params import ValidatorParamsT, ValidatorType
13
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn
22
14
  from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
23
15
  from data_designer.engine.errors import DataDesignerRuntimeError
24
16
  from data_designer.engine.validators import (
@@ -29,6 +21,10 @@ from data_designer.engine.validators import (
29
21
  SQLValidator,
30
22
  ValidationResult,
31
23
  )
24
+ from data_designer.lazy_heavy_imports import pd
25
+
26
+ if TYPE_CHECKING:
27
+ import pandas as pd
32
28
 
33
29
  logger = logging.getLogger(__name__)
34
30
 
@@ -45,15 +41,7 @@ def get_validator_from_params(validator_type: ValidatorType, validator_params: V
45
41
  return LocalCallableValidator(validator_params)
46
42
 
47
43
 
48
- class ValidationColumnGenerator(ColumnGenerator[ValidationColumnConfig]):
49
- @staticmethod
50
- def metadata() -> GeneratorMetadata:
51
- return GeneratorMetadata(
52
- name="validate",
53
- description="Validate data.",
54
- generation_strategy=GenerationStrategy.FULL_COLUMN,
55
- )
56
-
44
+ class ValidationColumnGenerator(ColumnGeneratorFullColumn[ValidationColumnConfig]):
57
45
  def generate(self, data: pd.DataFrame) -> pd.DataFrame:
58
46
  logger.info(f"🔍 Validating column {self.config.name!r} with {len(data)} records")
59
47
  logger.info(f" |-- target columns: {self.config.target_columns}")
@@ -132,6 +120,7 @@ class ValidationColumnGenerator(ColumnGenerator[ValidationColumnConfig]):
132
120
  error_callback=error_callback,
133
121
  shutdown_error_rate=settings.shutdown_error_rate,
134
122
  shutdown_error_window=settings.shutdown_error_window,
123
+ disable_early_shutdown=settings.disable_early_shutdown,
135
124
  ) as executor:
136
125
  for i, batch in enumerate(batched_records):
137
126
  executor.submit(lambda batch: self._validate_batch(validator, batch), batch, context={"index": i})
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.base import ConfigBase
5
7
  from data_designer.config.column_configs import (
6
8
  EmbeddingColumnConfig,
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.engine.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.column_types import DataDesignerColumnType
5
7
  from data_designer.config.utils.type_helpers import resolve_string_enum
6
8
  from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
  from pydantic import BaseModel, ConfigDict, Field, create_model