data-designer 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/base.py +1 -0
  37. data_designer/config/column_configs.py +77 -7
  38. data_designer/config/column_types.py +33 -36
  39. data_designer/config/dataset_builders.py +2 -0
  40. data_designer/config/dataset_metadata.py +18 -0
  41. data_designer/config/default_model_settings.py +1 -0
  42. data_designer/config/errors.py +2 -0
  43. data_designer/config/exports.py +2 -0
  44. data_designer/config/interface.py +3 -2
  45. data_designer/config/models.py +7 -2
  46. data_designer/config/preview_results.py +9 -1
  47. data_designer/config/processors.py +2 -0
  48. data_designer/config/run_config.py +19 -5
  49. data_designer/config/sampler_constraints.py +2 -0
  50. data_designer/config/sampler_params.py +7 -2
  51. data_designer/config/seed.py +2 -0
  52. data_designer/config/seed_source.py +9 -3
  53. data_designer/config/seed_source_types.py +2 -0
  54. data_designer/config/utils/constants.py +2 -0
  55. data_designer/config/utils/errors.py +2 -0
  56. data_designer/config/utils/info.py +2 -0
  57. data_designer/config/utils/io_helpers.py +8 -3
  58. data_designer/config/utils/misc.py +2 -2
  59. data_designer/config/utils/numerical_helpers.py +2 -0
  60. data_designer/config/utils/type_helpers.py +2 -0
  61. data_designer/config/utils/visualization.py +19 -11
  62. data_designer/config/validator_params.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +9 -8
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  65. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  66. data_designer/engine/analysis/column_statistics.py +5 -2
  67. data_designer/engine/analysis/dataset_profiler.py +12 -9
  68. data_designer/engine/analysis/errors.py +2 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  70. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  71. data_designer/engine/column_generators/generators/base.py +26 -14
  72. data_designer/engine/column_generators/generators/embedding.py +4 -11
  73. data_designer/engine/column_generators/generators/expression.py +7 -16
  74. data_designer/engine/column_generators/generators/llm_completion.py +13 -47
  75. data_designer/engine/column_generators/generators/samplers.py +8 -14
  76. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  77. data_designer/engine/column_generators/generators/validation.py +9 -20
  78. data_designer/engine/column_generators/registry.py +2 -0
  79. data_designer/engine/column_generators/utils/errors.py +2 -0
  80. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  83. data_designer/engine/compiler.py +3 -6
  84. data_designer/engine/configurable_task.py +12 -13
  85. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  86. data_designer/engine/dataset_builders/column_wise_builder.py +34 -35
  87. data_designer/engine/dataset_builders/errors.py +2 -0
  88. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +13 -4
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +35 -25
  93. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  94. data_designer/engine/errors.py +2 -0
  95. data_designer/engine/model_provider.py +2 -0
  96. data_designer/engine/models/errors.py +23 -31
  97. data_designer/engine/models/facade.py +12 -9
  98. data_designer/engine/models/factory.py +42 -0
  99. data_designer/engine/models/litellm_overrides.py +16 -11
  100. data_designer/engine/models/parsers/errors.py +2 -0
  101. data_designer/engine/models/parsers/parser.py +2 -2
  102. data_designer/engine/models/parsers/postprocessors.py +1 -0
  103. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  104. data_designer/engine/models/parsers/types.py +2 -0
  105. data_designer/engine/models/recipes/base.py +2 -0
  106. data_designer/engine/models/recipes/response_recipes.py +2 -0
  107. data_designer/engine/models/registry.py +11 -18
  108. data_designer/engine/models/telemetry.py +6 -2
  109. data_designer/engine/processing/ginja/ast.py +2 -0
  110. data_designer/engine/processing/ginja/environment.py +2 -0
  111. data_designer/engine/processing/ginja/exceptions.py +2 -0
  112. data_designer/engine/processing/ginja/record.py +2 -0
  113. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  114. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  115. data_designer/engine/processing/gsonschema/types.py +2 -0
  116. data_designer/engine/processing/gsonschema/validators.py +10 -6
  117. data_designer/engine/processing/processors/base.py +1 -5
  118. data_designer/engine/processing/processors/drop_columns.py +7 -10
  119. data_designer/engine/processing/processors/registry.py +2 -0
  120. data_designer/engine/processing/processors/schema_transform.py +7 -10
  121. data_designer/engine/processing/utils.py +7 -3
  122. data_designer/engine/registry/base.py +2 -0
  123. data_designer/engine/registry/data_designer_registry.py +2 -0
  124. data_designer/engine/registry/errors.py +2 -0
  125. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  126. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  127. data_designer/engine/resources/managed_storage.py +2 -0
  128. data_designer/engine/resources/resource_provider.py +20 -1
  129. data_designer/engine/resources/seed_reader.py +7 -2
  130. data_designer/engine/sampling_gen/column.py +2 -0
  131. data_designer/engine/sampling_gen/constraints.py +8 -2
  132. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  133. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  134. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  135. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  136. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  137. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  138. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  139. data_designer/engine/sampling_gen/entities/person.py +2 -0
  140. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  141. data_designer/engine/sampling_gen/errors.py +2 -0
  142. data_designer/engine/sampling_gen/generator.py +5 -4
  143. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  144. data_designer/engine/sampling_gen/people_gen.py +7 -7
  145. data_designer/engine/sampling_gen/person_constants.py +2 -0
  146. data_designer/engine/sampling_gen/schema.py +5 -1
  147. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  148. data_designer/engine/sampling_gen/utils.py +7 -1
  149. data_designer/engine/secret_resolver.py +2 -0
  150. data_designer/engine/validation.py +2 -2
  151. data_designer/engine/validators/__init__.py +2 -0
  152. data_designer/engine/validators/base.py +2 -0
  153. data_designer/engine/validators/local_callable.py +7 -2
  154. data_designer/engine/validators/python.py +7 -1
  155. data_designer/engine/validators/remote.py +7 -1
  156. data_designer/engine/validators/sql.py +8 -3
  157. data_designer/errors.py +2 -0
  158. data_designer/essentials/__init__.py +2 -0
  159. data_designer/interface/data_designer.py +36 -39
  160. data_designer/interface/errors.py +2 -0
  161. data_designer/interface/results.py +9 -2
  162. data_designer/lazy_heavy_imports.py +54 -0
  163. data_designer/logging.py +2 -0
  164. data_designer/plugins/__init__.py +2 -0
  165. data_designer/plugins/errors.py +2 -0
  166. data_designer/plugins/plugin.py +0 -1
  167. data_designer/plugins/registry.py +2 -0
  168. data_designer/plugins/testing/__init__.py +2 -0
  169. data_designer/plugins/testing/stubs.py +21 -43
  170. data_designer/plugins/testing/utils.py +2 -0
  171. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/METADATA +19 -4
  172. data_designer-0.3.5.dist-info/RECORD +196 -0
  173. data_designer-0.3.3.dist-info/RECORD +0 -193
  174. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/WHEEL +0 -0
  175. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/entry_points.txt +0 -0
  176. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  from typing_extensions import TypeAlias
6
7
 
@@ -15,7 +16,7 @@ from data_designer.config.column_configs import (
15
16
  SeedDatasetColumnConfig,
16
17
  ValidationColumnConfig,
17
18
  )
18
- from data_designer.config.errors import InvalidColumnTypeError, InvalidConfigError
19
+ from data_designer.config.errors import InvalidConfigError
19
20
  from data_designer.config.sampler_params import SamplerType
20
21
  from data_designer.config.utils.type_helpers import (
21
22
  SAMPLER_PARAMS,
@@ -45,22 +46,6 @@ DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
45
46
  discriminator_field_name="column_type",
46
47
  )
47
48
 
48
- COLUMN_TYPE_EMOJI_MAP = {
49
- "general": "⚛️", # possible analysis column type
50
- DataDesignerColumnType.EXPRESSION: "🧩",
51
- DataDesignerColumnType.LLM_CODE: "💻",
52
- DataDesignerColumnType.LLM_JUDGE: "⚖️",
53
- DataDesignerColumnType.LLM_STRUCTURED: "🗂️",
54
- DataDesignerColumnType.LLM_TEXT: "📝",
55
- DataDesignerColumnType.SEED_DATASET: "🌱",
56
- DataDesignerColumnType.SAMPLER: "🎲",
57
- DataDesignerColumnType.VALIDATION: "🔍",
58
- DataDesignerColumnType.EMBEDDING: "🧬",
59
- }
60
- COLUMN_TYPE_EMOJI_MAP.update(
61
- {DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()}
62
- )
63
-
64
49
 
65
50
  def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
66
51
  """Create a Data Designer column config object from kwargs.
@@ -74,27 +59,20 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType
74
59
  Data Designer column object of the appropriate type.
75
60
  """
76
61
  column_type = resolve_string_enum(column_type, DataDesignerColumnType)
77
- if column_type == DataDesignerColumnType.LLM_TEXT:
78
- return LLMTextColumnConfig(name=name, **kwargs)
79
- if column_type == DataDesignerColumnType.LLM_CODE:
80
- return LLMCodeColumnConfig(name=name, **kwargs)
81
- if column_type == DataDesignerColumnType.LLM_STRUCTURED:
82
- return LLMStructuredColumnConfig(name=name, **kwargs)
83
- if column_type == DataDesignerColumnType.LLM_JUDGE:
84
- return LLMJudgeColumnConfig(name=name, **kwargs)
85
- if column_type == DataDesignerColumnType.VALIDATION:
86
- return ValidationColumnConfig(name=name, **kwargs)
87
- if column_type == DataDesignerColumnType.EXPRESSION:
88
- return ExpressionColumnConfig(name=name, **kwargs)
62
+ config_cls = get_column_config_cls_from_type(column_type)
89
63
  if column_type == DataDesignerColumnType.SAMPLER:
90
- return SamplerColumnConfig(name=name, **_resolve_sampler_kwargs(name, kwargs))
91
- if column_type == DataDesignerColumnType.SEED_DATASET:
92
- return SeedDatasetColumnConfig(name=name, **kwargs)
93
- if column_type == DataDesignerColumnType.EMBEDDING:
94
- return EmbeddingColumnConfig(name=name, **kwargs)
64
+ kwargs = _resolve_sampler_kwargs(name, kwargs)
65
+ return config_cls(name=name, **kwargs)
66
+
67
+
68
+ def get_column_config_cls_from_type(column_type: DataDesignerColumnType) -> type[ColumnConfigT]:
69
+ """Get the column config class for a column type."""
70
+ column_type = resolve_string_enum(column_type, DataDesignerColumnType)
71
+ if column_type in _COLUMN_TYPE_CONFIG_CLS_MAP:
72
+ return _COLUMN_TYPE_CONFIG_CLS_MAP[column_type]
95
73
  if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value):
96
- return plugin.config_cls(name=name, **kwargs)
97
- raise InvalidColumnTypeError(f"🛑 {column_type} is not a valid column type.") # pragma: no cover
74
+ return plugin.config_cls
75
+ raise InvalidConfigError(f"🛑 {column_type} is not a valid column type.")
98
76
 
99
77
 
100
78
  def get_column_display_order() -> list[DataDesignerColumnType]:
@@ -114,6 +92,12 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
114
92
  return display_order
115
93
 
116
94
 
95
+ def get_column_emoji_from_type(column_type: DataDesignerColumnType) -> str:
96
+ """Get the emoji for a column type."""
97
+ config_cls = get_column_config_cls_from_type(resolve_string_enum(column_type, DataDesignerColumnType))
98
+ return config_cls.get_column_emoji()
99
+
100
+
117
101
  def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
118
102
  if "sampler_type" not in kwargs:
119
103
  raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")
@@ -142,3 +126,16 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
142
126
  "params": params,
143
127
  **{k: v for k, v in kwargs.items() if k not in ["sampler_type", "params"]},
144
128
  }
129
+
130
+
131
+ _COLUMN_TYPE_CONFIG_CLS_MAP = {
132
+ DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
133
+ DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
134
+ DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
135
+ DataDesignerColumnType.LLM_JUDGE: LLMJudgeColumnConfig,
136
+ DataDesignerColumnType.VALIDATION: ValidationColumnConfig,
137
+ DataDesignerColumnType.EXPRESSION: ExpressionColumnConfig,
138
+ DataDesignerColumnType.SAMPLER: SamplerColumnConfig,
139
+ DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig,
140
+ DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig,
141
+ }
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
 
@@ -0,0 +1,18 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
+ class DatasetMetadata(BaseModel):
8
+ """Metadata about a generated dataset.
9
+
10
+ This object is created by the engine and passed to results objects for use
11
+ in visualization and other client-side utilities. It is designed to be
12
+ serializable so it can be sent over the wire in a client-server architecture.
13
+
14
+ Attributes:
15
+ seed_column_names: Names of columns from the seed dataset. Empty list if no seed dataset.
16
+ """
17
+
18
+ seed_column_names: list[str] = []
@@ -1,6 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  import logging
6
7
  import os
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
5
7
  from data_designer.config.column_configs import (
6
8
  EmbeddingColumnConfig,
@@ -6,13 +6,14 @@ from __future__ import annotations
6
6
  from abc import ABC, abstractmethod
7
7
  from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
8
8
 
9
- import pandas as pd
10
-
11
9
  from data_designer.config.models import ModelConfig, ModelProvider
12
10
  from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
13
11
  from data_designer.config.utils.info import InterfaceInfo
12
+ from data_designer.lazy_heavy_imports import pd
14
13
 
15
14
  if TYPE_CHECKING:
15
+ import pandas as pd
16
+
16
17
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
17
18
  from data_designer.config.config_builder import DataDesignerConfigBuilder
18
19
  from data_designer.config.preview_results import PreviewResults
@@ -1,13 +1,14 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from abc import ABC, abstractmethod
6
8
  from enum import Enum
7
9
  from pathlib import Path
8
- from typing import Annotated, Any, Generic, Literal, TypeVar
10
+ from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar
9
11
 
10
- import numpy as np
11
12
  from pydantic import BaseModel, Field, field_validator, model_validator
12
13
  from typing_extensions import Self, TypeAlias
13
14
 
@@ -20,6 +21,10 @@ from data_designer.config.utils.constants import (
20
21
  MIN_TOP_P,
21
22
  )
22
23
  from data_designer.config.utils.io_helpers import smart_load_yaml
24
+ from data_designer.lazy_heavy_imports import np
25
+
26
+ if TYPE_CHECKING:
27
+ import numpy as np
23
28
 
24
29
  logger = logging.getLogger(__name__)
25
30
 
@@ -3,11 +3,16 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
- import pandas as pd
6
+ from typing import TYPE_CHECKING
7
7
 
8
8
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
9
9
  from data_designer.config.config_builder import DataDesignerConfigBuilder
10
+ from data_designer.config.dataset_metadata import DatasetMetadata
10
11
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
12
+ from data_designer.lazy_heavy_imports import pd
13
+
14
+ if TYPE_CHECKING:
15
+ import pandas as pd
11
16
 
12
17
 
13
18
  class PreviewResults(WithRecordSamplerMixin):
@@ -15,6 +20,7 @@ class PreviewResults(WithRecordSamplerMixin):
15
20
  self,
16
21
  *,
17
22
  config_builder: DataDesignerConfigBuilder,
23
+ dataset_metadata: DatasetMetadata | None = None,
18
24
  dataset: pd.DataFrame | None = None,
19
25
  analysis: DatasetProfilerResults | None = None,
20
26
  processor_artifacts: dict[str, list[str] | str] | None = None,
@@ -23,6 +29,7 @@ class PreviewResults(WithRecordSamplerMixin):
23
29
 
24
30
  Args:
25
31
  config_builder: Data Designer configuration builder.
32
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
26
33
  dataset: Dataset of the preview run.
27
34
  analysis: Analysis of the preview run.
28
35
  processor_artifacts: Artifacts generated by the processors.
@@ -30,4 +37,5 @@ class PreviewResults(WithRecordSamplerMixin):
30
37
  self.dataset: pd.DataFrame | None = dataset
31
38
  self.analysis: DatasetProfilerResults | None = analysis
32
39
  self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
40
+ self.dataset_metadata: DatasetMetadata | None = dataset_metadata
33
41
  self._config_builder = config_builder
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  from abc import ABC
6
8
  from enum import Enum
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from pydantic import Field, model_validator
5
7
  from typing_extensions import Self
6
8
 
@@ -14,21 +16,33 @@ class RunConfig(ConfigBase):
14
16
  part of the dataset configuration itself.
15
17
 
16
18
  Attributes:
17
- disable_early_shutdown: If True, disables early shutdown entirely. Generation
18
- will continue regardless of error rate. Default is False.
19
- shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown.
20
- When early shutdown is disabled, this value is normalized to 1.0. Default is 0.5.
19
+ disable_early_shutdown: If True, disables the executor's early-shutdown behavior entirely.
20
+ Generation will continue regardless of error rate, and the early-shutdown exception
21
+ will never be raised. Error counts and summaries are still collected. Default is False.
22
+ shutdown_error_rate: Error rate threshold (0.0-1.0) that triggers early shutdown when
23
+ early shutdown is enabled. Default is 0.5.
21
24
  shutdown_error_window: Minimum number of completed tasks before error rate
22
25
  monitoring begins. Must be >= 0. Default is 10.
26
+ buffer_size: Number of records to process in each batch during dataset generation.
27
+ A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
28
+ to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
29
+ max_conversation_restarts: Maximum number of full conversation restarts permitted when
30
+ generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
31
+ max_conversation_correction_steps: Maximum number of correction rounds permitted within a
32
+ single conversation when generation tasks call `ModelFacade.generate(...)`. Must be >= 0.
33
+ Default is 0.
23
34
  """
24
35
 
25
36
  disable_early_shutdown: bool = False
26
37
  shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
27
38
  shutdown_error_window: int = Field(default=10, ge=0)
39
+ buffer_size: int = Field(default=1000, gt=0)
40
+ max_conversation_restarts: int = Field(default=5, ge=0)
41
+ max_conversation_correction_steps: int = Field(default=0, ge=0)
28
42
 
29
43
  @model_validator(mode="after")
30
44
  def normalize_shutdown_settings(self) -> Self:
31
- """Set shutdown_error_rate to 1.0 when early shutdown is disabled."""
45
+ """Normalize shutdown settings for compatibility."""
32
46
  if self.disable_early_shutdown:
33
47
  self.shutdown_error_rate = 1.0
34
48
  return self
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from enum import Enum
6
8
 
@@ -1,10 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
- from typing import Literal
7
+ from typing import TYPE_CHECKING, Literal
6
8
 
7
- import pandas as pd
8
9
  from pydantic import Field, field_validator, model_validator
9
10
  from typing_extensions import Self, TypeAlias
10
11
 
@@ -16,6 +17,10 @@ from data_designer.config.utils.constants import (
16
17
  MAX_AGE,
17
18
  MIN_AGE,
18
19
  )
20
+ from data_designer.lazy_heavy_imports import pd
21
+
22
+ if TYPE_CHECKING:
23
+ import pandas as pd
19
24
 
20
25
 
21
26
  class SamplerType(str, Enum):
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
  from pydantic import Field, model_validator
@@ -1,11 +1,13 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC
5
- from typing import Literal
7
+ from typing import TYPE_CHECKING, Literal
6
8
 
7
- import pandas as pd
8
9
  from pydantic import BaseModel, ConfigDict, Field, field_validator
10
+ from pydantic.json_schema import SkipJsonSchema
9
11
  from typing_extensions import Self
10
12
 
11
13
  from data_designer.config.utils.io_helpers import (
@@ -13,6 +15,10 @@ from data_designer.config.utils.io_helpers import (
13
15
  validate_dataset_file_path,
14
16
  validate_path_contains_files_of_type,
15
17
  )
18
+ from data_designer.lazy_heavy_imports import pd
19
+
20
+ if TYPE_CHECKING:
21
+ import pandas as pd
16
22
 
17
23
 
18
24
  class SeedSource(BaseModel, ABC):
@@ -68,7 +74,7 @@ class DataFrameSeedSource(SeedSource):
68
74
 
69
75
  model_config = ConfigDict(arbitrary_types_allowed=True)
70
76
 
71
- df: pd.DataFrame = Field(
77
+ df: SkipJsonSchema[pd.DataFrame] = Field(
72
78
  ...,
73
79
  exclude=True,
74
80
  description=(
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Annotated
5
7
 
6
8
  from pydantic import Field
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import os
5
7
  from enum import Enum
6
8
  from pathlib import Path
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from enum import Enum
6
8
  from typing import Literal, TypeVar
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  import logging
6
8
  import os
@@ -8,13 +10,16 @@ from datetime import date, datetime, timedelta
8
10
  from decimal import Decimal
9
11
  from numbers import Number
10
12
  from pathlib import Path
11
- from typing import Any
13
+ from typing import TYPE_CHECKING, Any
12
14
 
13
- import numpy as np
14
- import pandas as pd
15
15
  import yaml
16
16
 
17
17
  from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError
18
+ from data_designer.lazy_heavy_imports import np, pd
19
+
20
+ if TYPE_CHECKING:
21
+ import numpy as np
22
+ import pandas as pd
18
23
 
19
24
  logger = logging.getLogger(__name__)
20
25
 
@@ -48,8 +48,8 @@ def can_run_data_designer_locally() -> bool:
48
48
  return True
49
49
 
50
50
 
51
- def get_prompt_template_keywords(template: str) -> set[str]:
52
- """Extract all keywords from a valid string template."""
51
+ def extract_keywords_from_jinja2_template(template: str) -> set[str]:
52
+ """Extract all keywords from a valid Jinja2 template."""
53
53
  with template_error_handler():
54
54
  ast = ImmutableSandboxedEnvironment().parse(template)
55
55
  keywords = set(meta.find_undeclared_variables(ast))
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import numbers
5
7
  from numbers import Number
6
8
  from typing import Any
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import inspect
5
7
  from enum import Enum
6
8
  from typing import Any, Literal, get_args, get_origin
@@ -10,8 +10,6 @@ from enum import Enum
10
10
  from functools import cached_property
11
11
  from typing import TYPE_CHECKING, Any
12
12
 
13
- import numpy as np
14
- import pandas as pd
15
13
  from rich.console import Console, Group
16
14
  from rich.padding import Padding
17
15
  from rich.panel import Panel
@@ -28,9 +26,14 @@ from data_designer.config.sampler_params import SamplerType
28
26
  from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
29
27
  from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
30
28
  from data_designer.config.utils.errors import DatasetSampleDisplayError
29
+ from data_designer.lazy_heavy_imports import np, pd
31
30
 
32
31
  if TYPE_CHECKING:
32
+ import numpy as np
33
+ import pandas as pd
34
+
33
35
  from data_designer.config.config_builder import DataDesignerConfigBuilder
36
+ from data_designer.config.dataset_metadata import DatasetMetadata
34
37
 
35
38
 
36
39
  console = Console()
@@ -57,6 +60,7 @@ class ColorPalette(str, Enum):
57
60
 
58
61
  class WithRecordSamplerMixin:
59
62
  _display_cycle_index: int = 0
63
+ dataset_metadata: DatasetMetadata | None
60
64
 
61
65
  @cached_property
62
66
  def _record_sampler_dataset(self) -> pd.DataFrame:
@@ -79,22 +83,22 @@ class WithRecordSamplerMixin:
79
83
  self,
80
84
  index: int | None = None,
81
85
  *,
82
- hide_seed_columns: bool = False,
83
86
  syntax_highlighting_theme: str = "dracula",
84
87
  background_color: str | None = None,
85
88
  processors_to_display: list[str] | None = None,
89
+ hide_seed_columns: bool = False,
86
90
  ) -> None:
87
91
  """Display a sample record from the Data Designer dataset preview.
88
92
 
89
93
  Args:
90
94
  index: Index of the record to display. If None, the next record will be displayed.
91
95
  This is useful for running the cell in a notebook multiple times.
92
- hide_seed_columns: If True, the columns from the seed dataset (if any) will not be displayed.
93
96
  syntax_highlighting_theme: Theme to use for syntax highlighting. See the `Syntax`
94
97
  documentation from `rich` for information about available themes.
95
98
  background_color: Background color to use for the record. See the `Syntax`
96
99
  documentation from `rich` for information about available background colors.
97
100
  processors_to_display: List of processors to display the artifacts for. If None, all processors will be displayed.
101
+ hide_seed_columns: If True, seed columns will not be displayed separately.
98
102
  """
99
103
  i = index or self._display_cycle_index
100
104
 
@@ -120,14 +124,18 @@ class WithRecordSamplerMixin:
120
124
  else:
121
125
  processor_data_to_display[processor] = self.processor_artifacts[processor]
122
126
 
127
+ seed_column_names = (
128
+ None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
129
+ )
130
+
123
131
  display_sample_record(
124
132
  record=record,
125
133
  processor_data_to_display=processor_data_to_display,
126
134
  config_builder=self._config_builder,
127
135
  background_color=background_color,
128
136
  syntax_highlighting_theme=syntax_highlighting_theme,
129
- hide_seed_columns=hide_seed_columns,
130
137
  record_index=i,
138
+ seed_column_names=seed_column_names,
131
139
  )
132
140
  if index is None:
133
141
  self._display_cycle_index = (self._display_cycle_index + 1) % num_records
@@ -160,7 +168,7 @@ def display_sample_record(
160
168
  background_color: str | None = None,
161
169
  syntax_highlighting_theme: str = "dracula",
162
170
  record_index: int | None = None,
163
- hide_seed_columns: bool = False,
171
+ seed_column_names: list[str] | None = None,
164
172
  ):
165
173
  if isinstance(record, (dict, pd.Series)):
166
174
  record = pd.DataFrame([record]).iloc[0]
@@ -179,14 +187,14 @@ def display_sample_record(
179
187
  render_list = []
180
188
  table_kws = dict(show_lines=True, expand=True)
181
189
 
182
- seed_columns = config_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)
183
- if not hide_seed_columns and len(seed_columns) > 0:
190
+ # Display seed columns if seed_column_names is provided and not empty
191
+ if seed_column_names:
184
192
  table = Table(title="Seed Columns", **table_kws)
185
193
  table.add_column("Name")
186
194
  table.add_column("Value")
187
- for col in seed_columns:
188
- if not col.drop:
189
- table.add_row(col.name, convert_to_row_element(record[col.name]))
195
+ for col_name in seed_column_names:
196
+ if col_name in record.index:
197
+ table.add_row(col_name, convert_to_row_element(record[col_name]))
190
198
  render_list.append(pad_console_element(table))
191
199
 
192
200
  non_code_columns = (
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
  from typing import Any
6
8
 
@@ -5,15 +5,19 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  from abc import ABC, abstractmethod
8
+ from typing import TYPE_CHECKING
8
9
 
9
- import pandas as pd
10
10
  from pydantic import BaseModel, model_validator
11
11
  from typing_extensions import Self
12
12
 
13
13
  from data_designer.config.base import ConfigBase
14
14
  from data_designer.config.column_configs import SingleColumnConfig
15
15
  from data_designer.config.column_types import DataDesignerColumnType
16
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, TaskConfigT
16
+ from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
17
+ from data_designer.lazy_heavy_imports import pd
18
+
19
+ if TYPE_CHECKING:
20
+ import pandas as pd
17
21
 
18
22
  logger = logging.getLogger(__name__)
19
23
 
@@ -32,17 +36,14 @@ class ColumnConfigWithDataFrame(ConfigBase):
32
36
  return (self.column_config, self.df)
33
37
 
34
38
 
35
- class ColumnProfilerMetadata(ConfigurableTaskMetadata):
36
- applicable_column_types: list[DataDesignerColumnType]
37
-
38
-
39
39
  class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
40
40
  @staticmethod
41
41
  @abstractmethod
42
- def metadata() -> ColumnProfilerMetadata: ...
42
+ def get_applicable_column_types() -> list[DataDesignerColumnType]:
43
+ """Returns a list of column types that this profiler can be applied to during dataset profiling."""
43
44
 
44
45
  @abstractmethod
45
46
  def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel: ...
46
47
 
47
48
  def _initialize(self) -> None:
48
- logger.info(f"💫 Initializing column profiler: '{self.metadata().name}'")
49
+ logger.info(f"💫 Initializing column profiler: '{self.name}'")