data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/column_configs.py +77 -7
  37. data_designer/config/column_types.py +33 -36
  38. data_designer/config/dataset_builders.py +2 -0
  39. data_designer/config/default_model_settings.py +1 -0
  40. data_designer/config/errors.py +2 -0
  41. data_designer/config/exports.py +2 -0
  42. data_designer/config/interface.py +3 -2
  43. data_designer/config/models.py +7 -2
  44. data_designer/config/preview_results.py +7 -3
  45. data_designer/config/processors.py +2 -0
  46. data_designer/config/run_config.py +2 -0
  47. data_designer/config/sampler_constraints.py +2 -0
  48. data_designer/config/sampler_params.py +7 -2
  49. data_designer/config/seed.py +2 -0
  50. data_designer/config/seed_source.py +7 -2
  51. data_designer/config/seed_source_types.py +2 -0
  52. data_designer/config/utils/constants.py +2 -0
  53. data_designer/config/utils/errors.py +2 -0
  54. data_designer/config/utils/info.py +2 -0
  55. data_designer/config/utils/io_helpers.py +8 -3
  56. data_designer/config/utils/misc.py +2 -2
  57. data_designer/config/utils/numerical_helpers.py +2 -0
  58. data_designer/config/utils/type_helpers.py +2 -0
  59. data_designer/config/utils/visualization.py +8 -4
  60. data_designer/config/validator_params.py +2 -0
  61. data_designer/engine/analysis/column_profilers/base.py +9 -8
  62. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  63. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  64. data_designer/engine/analysis/column_statistics.py +5 -2
  65. data_designer/engine/analysis/dataset_profiler.py +12 -9
  66. data_designer/engine/analysis/errors.py +2 -0
  67. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  68. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  69. data_designer/engine/column_generators/generators/base.py +26 -14
  70. data_designer/engine/column_generators/generators/embedding.py +4 -11
  71. data_designer/engine/column_generators/generators/expression.py +7 -16
  72. data_designer/engine/column_generators/generators/llm_completion.py +11 -37
  73. data_designer/engine/column_generators/generators/samplers.py +8 -14
  74. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  75. data_designer/engine/column_generators/generators/validation.py +8 -20
  76. data_designer/engine/column_generators/registry.py +2 -0
  77. data_designer/engine/column_generators/utils/errors.py +2 -0
  78. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  79. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  80. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  81. data_designer/engine/compiler.py +3 -6
  82. data_designer/engine/configurable_task.py +12 -13
  83. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  84. data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
  85. data_designer/engine/dataset_builders/errors.py +2 -0
  86. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  87. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  88. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  89. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
  90. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  91. data_designer/engine/errors.py +2 -0
  92. data_designer/engine/model_provider.py +2 -0
  93. data_designer/engine/models/errors.py +23 -31
  94. data_designer/engine/models/facade.py +12 -9
  95. data_designer/engine/models/factory.py +42 -0
  96. data_designer/engine/models/litellm_overrides.py +22 -11
  97. data_designer/engine/models/parsers/errors.py +2 -0
  98. data_designer/engine/models/parsers/parser.py +2 -2
  99. data_designer/engine/models/parsers/postprocessors.py +1 -0
  100. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  101. data_designer/engine/models/parsers/types.py +2 -0
  102. data_designer/engine/models/recipes/base.py +2 -0
  103. data_designer/engine/models/recipes/response_recipes.py +2 -0
  104. data_designer/engine/models/registry.py +11 -18
  105. data_designer/engine/models/telemetry.py +6 -2
  106. data_designer/engine/processing/ginja/ast.py +2 -0
  107. data_designer/engine/processing/ginja/environment.py +2 -0
  108. data_designer/engine/processing/ginja/exceptions.py +2 -0
  109. data_designer/engine/processing/ginja/record.py +2 -0
  110. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  111. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  112. data_designer/engine/processing/gsonschema/types.py +2 -0
  113. data_designer/engine/processing/gsonschema/validators.py +10 -6
  114. data_designer/engine/processing/processors/base.py +1 -5
  115. data_designer/engine/processing/processors/drop_columns.py +7 -10
  116. data_designer/engine/processing/processors/registry.py +2 -0
  117. data_designer/engine/processing/processors/schema_transform.py +7 -10
  118. data_designer/engine/processing/utils.py +7 -3
  119. data_designer/engine/registry/base.py +2 -0
  120. data_designer/engine/registry/data_designer_registry.py +2 -0
  121. data_designer/engine/registry/errors.py +2 -0
  122. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  123. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  124. data_designer/engine/resources/managed_storage.py +2 -0
  125. data_designer/engine/resources/resource_provider.py +8 -1
  126. data_designer/engine/resources/seed_reader.py +7 -2
  127. data_designer/engine/sampling_gen/column.py +2 -0
  128. data_designer/engine/sampling_gen/constraints.py +8 -2
  129. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  130. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  131. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  132. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  133. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  134. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  135. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  136. data_designer/engine/sampling_gen/entities/person.py +2 -0
  137. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  138. data_designer/engine/sampling_gen/errors.py +2 -0
  139. data_designer/engine/sampling_gen/generator.py +5 -4
  140. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  141. data_designer/engine/sampling_gen/people_gen.py +7 -7
  142. data_designer/engine/sampling_gen/person_constants.py +2 -0
  143. data_designer/engine/sampling_gen/schema.py +5 -1
  144. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  145. data_designer/engine/sampling_gen/utils.py +7 -1
  146. data_designer/engine/secret_resolver.py +2 -0
  147. data_designer/engine/validation.py +2 -2
  148. data_designer/engine/validators/__init__.py +2 -0
  149. data_designer/engine/validators/base.py +2 -0
  150. data_designer/engine/validators/local_callable.py +7 -2
  151. data_designer/engine/validators/python.py +7 -1
  152. data_designer/engine/validators/remote.py +7 -1
  153. data_designer/engine/validators/sql.py +8 -3
  154. data_designer/errors.py +2 -0
  155. data_designer/essentials/__init__.py +2 -0
  156. data_designer/interface/data_designer.py +23 -17
  157. data_designer/interface/errors.py +2 -0
  158. data_designer/interface/results.py +5 -2
  159. data_designer/lazy_heavy_imports.py +54 -0
  160. data_designer/logging.py +2 -0
  161. data_designer/plugins/__init__.py +2 -0
  162. data_designer/plugins/errors.py +2 -0
  163. data_designer/plugins/plugin.py +0 -1
  164. data_designer/plugins/registry.py +2 -0
  165. data_designer/plugins/testing/__init__.py +2 -0
  166. data_designer/plugins/testing/stubs.py +21 -43
  167. data_designer/plugins/testing/utils.py +2 -0
  168. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
  169. data_designer-0.3.6.dist-info/RECORD +196 -0
  170. data_designer-0.3.4.dist-info/RECORD +0 -194
  171. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
  172. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
  173. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  from typing_extensions import TypeAlias
6
7
 
@@ -15,7 +16,7 @@ from data_designer.config.column_configs import (
15
16
  SeedDatasetColumnConfig,
16
17
  ValidationColumnConfig,
17
18
  )
18
- from data_designer.config.errors import InvalidColumnTypeError, InvalidConfigError
19
+ from data_designer.config.errors import InvalidConfigError
19
20
  from data_designer.config.sampler_params import SamplerType
20
21
  from data_designer.config.utils.type_helpers import (
21
22
  SAMPLER_PARAMS,
@@ -45,22 +46,6 @@ DataDesignerColumnType = create_str_enum_from_discriminated_type_union(
45
46
  discriminator_field_name="column_type",
46
47
  )
47
48
 
48
- COLUMN_TYPE_EMOJI_MAP = {
49
- "general": "⚛️", # possible analysis column type
50
- DataDesignerColumnType.EXPRESSION: "🧩",
51
- DataDesignerColumnType.LLM_CODE: "💻",
52
- DataDesignerColumnType.LLM_JUDGE: "⚖️",
53
- DataDesignerColumnType.LLM_STRUCTURED: "🗂️",
54
- DataDesignerColumnType.LLM_TEXT: "📝",
55
- DataDesignerColumnType.SEED_DATASET: "🌱",
56
- DataDesignerColumnType.SAMPLER: "🎲",
57
- DataDesignerColumnType.VALIDATION: "🔍",
58
- DataDesignerColumnType.EMBEDDING: "🧬",
59
- }
60
- COLUMN_TYPE_EMOJI_MAP.update(
61
- {DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()}
62
- )
63
-
64
49
 
65
50
  def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType, **kwargs) -> ColumnConfigT:
66
51
  """Create a Data Designer column config object from kwargs.
@@ -74,27 +59,20 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType
74
59
  Data Designer column object of the appropriate type.
75
60
  """
76
61
  column_type = resolve_string_enum(column_type, DataDesignerColumnType)
77
- if column_type == DataDesignerColumnType.LLM_TEXT:
78
- return LLMTextColumnConfig(name=name, **kwargs)
79
- if column_type == DataDesignerColumnType.LLM_CODE:
80
- return LLMCodeColumnConfig(name=name, **kwargs)
81
- if column_type == DataDesignerColumnType.LLM_STRUCTURED:
82
- return LLMStructuredColumnConfig(name=name, **kwargs)
83
- if column_type == DataDesignerColumnType.LLM_JUDGE:
84
- return LLMJudgeColumnConfig(name=name, **kwargs)
85
- if column_type == DataDesignerColumnType.VALIDATION:
86
- return ValidationColumnConfig(name=name, **kwargs)
87
- if column_type == DataDesignerColumnType.EXPRESSION:
88
- return ExpressionColumnConfig(name=name, **kwargs)
62
+ config_cls = get_column_config_cls_from_type(column_type)
89
63
  if column_type == DataDesignerColumnType.SAMPLER:
90
- return SamplerColumnConfig(name=name, **_resolve_sampler_kwargs(name, kwargs))
91
- if column_type == DataDesignerColumnType.SEED_DATASET:
92
- return SeedDatasetColumnConfig(name=name, **kwargs)
93
- if column_type == DataDesignerColumnType.EMBEDDING:
94
- return EmbeddingColumnConfig(name=name, **kwargs)
64
+ kwargs = _resolve_sampler_kwargs(name, kwargs)
65
+ return config_cls(name=name, **kwargs)
66
+
67
+
68
+ def get_column_config_cls_from_type(column_type: DataDesignerColumnType) -> type[ColumnConfigT]:
69
+ """Get the column config class for a column type."""
70
+ column_type = resolve_string_enum(column_type, DataDesignerColumnType)
71
+ if column_type in _COLUMN_TYPE_CONFIG_CLS_MAP:
72
+ return _COLUMN_TYPE_CONFIG_CLS_MAP[column_type]
95
73
  if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value):
96
- return plugin.config_cls(name=name, **kwargs)
97
- raise InvalidColumnTypeError(f"🛑 {column_type} is not a valid column type.") # pragma: no cover
74
+ return plugin.config_cls
75
+ raise InvalidConfigError(f"🛑 {column_type} is not a valid column type.")
98
76
 
99
77
 
100
78
  def get_column_display_order() -> list[DataDesignerColumnType]:
@@ -114,6 +92,12 @@ def get_column_display_order() -> list[DataDesignerColumnType]:
114
92
  return display_order
115
93
 
116
94
 
95
+ def get_column_emoji_from_type(column_type: DataDesignerColumnType) -> str:
96
+ """Get the emoji for a column type."""
97
+ config_cls = get_column_config_cls_from_type(resolve_string_enum(column_type, DataDesignerColumnType))
98
+ return config_cls.get_column_emoji()
99
+
100
+
117
101
  def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
118
102
  if "sampler_type" not in kwargs:
119
103
  raise InvalidConfigError(f"🛑 `sampler_type` is required for sampler column '{name}'.")
@@ -142,3 +126,16 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict:
142
126
  "params": params,
143
127
  **{k: v for k, v in kwargs.items() if k not in ["sampler_type", "params"]},
144
128
  }
129
+
130
+
131
+ _COLUMN_TYPE_CONFIG_CLS_MAP = {
132
+ DataDesignerColumnType.LLM_TEXT: LLMTextColumnConfig,
133
+ DataDesignerColumnType.LLM_CODE: LLMCodeColumnConfig,
134
+ DataDesignerColumnType.LLM_STRUCTURED: LLMStructuredColumnConfig,
135
+ DataDesignerColumnType.LLM_JUDGE: LLMJudgeColumnConfig,
136
+ DataDesignerColumnType.VALIDATION: ValidationColumnConfig,
137
+ DataDesignerColumnType.EXPRESSION: ExpressionColumnConfig,
138
+ DataDesignerColumnType.SAMPLER: SamplerColumnConfig,
139
+ DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig,
140
+ DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig,
141
+ }
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
 
@@ -1,6 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  import logging
6
7
  import os
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig
5
7
  from data_designer.config.column_configs import (
6
8
  EmbeddingColumnConfig,
@@ -6,13 +6,14 @@ from __future__ import annotations
6
6
  from abc import ABC, abstractmethod
7
7
  from typing import TYPE_CHECKING, Generic, Protocol, TypeVar
8
8
 
9
- import pandas as pd
10
-
11
9
  from data_designer.config.models import ModelConfig, ModelProvider
12
10
  from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS
13
11
  from data_designer.config.utils.info import InterfaceInfo
12
+ from data_designer.lazy_heavy_imports import pd
14
13
 
15
14
  if TYPE_CHECKING:
15
+ import pandas as pd
16
+
16
17
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
17
18
  from data_designer.config.config_builder import DataDesignerConfigBuilder
18
19
  from data_designer.config.preview_results import PreviewResults
@@ -1,13 +1,14 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from abc import ABC, abstractmethod
6
8
  from enum import Enum
7
9
  from pathlib import Path
8
- from typing import Annotated, Any, Generic, Literal, TypeVar
10
+ from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar
9
11
 
10
- import numpy as np
11
12
  from pydantic import BaseModel, Field, field_validator, model_validator
12
13
  from typing_extensions import Self, TypeAlias
13
14
 
@@ -20,6 +21,10 @@ from data_designer.config.utils.constants import (
20
21
  MIN_TOP_P,
21
22
  )
22
23
  from data_designer.config.utils.io_helpers import smart_load_yaml
24
+ from data_designer.lazy_heavy_imports import np
25
+
26
+ if TYPE_CHECKING:
27
+ import numpy as np
23
28
 
24
29
  logger = logging.getLogger(__name__)
25
30
 
@@ -3,12 +3,16 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
- import pandas as pd
6
+ from typing import TYPE_CHECKING
7
7
 
8
8
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
9
9
  from data_designer.config.config_builder import DataDesignerConfigBuilder
10
10
  from data_designer.config.dataset_metadata import DatasetMetadata
11
11
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
12
+ from data_designer.lazy_heavy_imports import pd
13
+
14
+ if TYPE_CHECKING:
15
+ import pandas as pd
12
16
 
13
17
 
14
18
  class PreviewResults(WithRecordSamplerMixin):
@@ -16,7 +20,7 @@ class PreviewResults(WithRecordSamplerMixin):
16
20
  self,
17
21
  *,
18
22
  config_builder: DataDesignerConfigBuilder,
19
- dataset_metadata: DatasetMetadata,
23
+ dataset_metadata: DatasetMetadata | None = None,
20
24
  dataset: pd.DataFrame | None = None,
21
25
  analysis: DatasetProfilerResults | None = None,
22
26
  processor_artifacts: dict[str, list[str] | str] | None = None,
@@ -33,5 +37,5 @@ class PreviewResults(WithRecordSamplerMixin):
33
37
  self.dataset: pd.DataFrame | None = dataset
34
38
  self.analysis: DatasetProfilerResults | None = analysis
35
39
  self.processor_artifacts: dict[str, list[str] | str] | None = processor_artifacts
36
- self.dataset_metadata = dataset_metadata
40
+ self.dataset_metadata: DatasetMetadata | None = dataset_metadata
37
41
  self._config_builder = config_builder
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  from abc import ABC
6
8
  from enum import Enum
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from pydantic import Field, model_validator
5
7
  from typing_extensions import Self
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from enum import Enum
6
8
 
@@ -1,10 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
- from typing import Literal
7
+ from typing import TYPE_CHECKING, Literal
6
8
 
7
- import pandas as pd
8
9
  from pydantic import Field, field_validator, model_validator
9
10
  from typing_extensions import Self, TypeAlias
10
11
 
@@ -16,6 +17,10 @@ from data_designer.config.utils.constants import (
16
17
  MAX_AGE,
17
18
  MIN_AGE,
18
19
  )
20
+ from data_designer.lazy_heavy_imports import pd
21
+
22
+ if TYPE_CHECKING:
23
+ import pandas as pd
19
24
 
20
25
 
21
26
  class SamplerType(str, Enum):
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
 
6
8
  from pydantic import Field, model_validator
@@ -1,10 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC
5
- from typing import Literal
7
+ from typing import TYPE_CHECKING, Literal
6
8
 
7
- import pandas as pd
8
9
  from pydantic import BaseModel, ConfigDict, Field, field_validator
9
10
  from pydantic.json_schema import SkipJsonSchema
10
11
  from typing_extensions import Self
@@ -14,6 +15,10 @@ from data_designer.config.utils.io_helpers import (
14
15
  validate_dataset_file_path,
15
16
  validate_path_contains_files_of_type,
16
17
  )
18
+ from data_designer.lazy_heavy_imports import pd
19
+
20
+ if TYPE_CHECKING:
21
+ import pandas as pd
17
22
 
18
23
 
19
24
  class SeedSource(BaseModel, ABC):
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Annotated
5
7
 
6
8
  from pydantic import Field
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import os
5
7
  from enum import Enum
6
8
  from pathlib import Path
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from enum import Enum
6
8
  from typing import Literal, TypeVar
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  import logging
6
8
  import os
@@ -8,13 +10,16 @@ from datetime import date, datetime, timedelta
8
10
  from decimal import Decimal
9
11
  from numbers import Number
10
12
  from pathlib import Path
11
- from typing import Any
13
+ from typing import TYPE_CHECKING, Any
12
14
 
13
- import numpy as np
14
- import pandas as pd
15
15
  import yaml
16
16
 
17
17
  from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError
18
+ from data_designer.lazy_heavy_imports import np, pd
19
+
20
+ if TYPE_CHECKING:
21
+ import numpy as np
22
+ import pandas as pd
18
23
 
19
24
  logger = logging.getLogger(__name__)
20
25
 
@@ -48,8 +48,8 @@ def can_run_data_designer_locally() -> bool:
48
48
  return True
49
49
 
50
50
 
51
- def get_prompt_template_keywords(template: str) -> set[str]:
52
- """Extract all keywords from a valid string template."""
51
+ def extract_keywords_from_jinja2_template(template: str) -> set[str]:
52
+ """Extract all keywords from a valid Jinja2 template."""
53
53
  with template_error_handler():
54
54
  ast = ImmutableSandboxedEnvironment().parse(template)
55
55
  keywords = set(meta.find_undeclared_variables(ast))
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import numbers
5
7
  from numbers import Number
6
8
  from typing import Any
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import inspect
5
7
  from enum import Enum
6
8
  from typing import Any, Literal, get_args, get_origin
@@ -10,8 +10,6 @@ from enum import Enum
10
10
  from functools import cached_property
11
11
  from typing import TYPE_CHECKING, Any
12
12
 
13
- import numpy as np
14
- import pandas as pd
15
13
  from rich.console import Console, Group
16
14
  from rich.padding import Padding
17
15
  from rich.panel import Panel
@@ -28,8 +26,12 @@ from data_designer.config.sampler_params import SamplerType
28
26
  from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer
29
27
  from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME
30
28
  from data_designer.config.utils.errors import DatasetSampleDisplayError
29
+ from data_designer.lazy_heavy_imports import np, pd
31
30
 
32
31
  if TYPE_CHECKING:
32
+ import numpy as np
33
+ import pandas as pd
34
+
33
35
  from data_designer.config.config_builder import DataDesignerConfigBuilder
34
36
  from data_designer.config.dataset_metadata import DatasetMetadata
35
37
 
@@ -58,7 +60,7 @@ class ColorPalette(str, Enum):
58
60
 
59
61
  class WithRecordSamplerMixin:
60
62
  _display_cycle_index: int = 0
61
- dataset_metadata: DatasetMetadata
63
+ dataset_metadata: DatasetMetadata | None
62
64
 
63
65
  @cached_property
64
66
  def _record_sampler_dataset(self) -> pd.DataFrame:
@@ -122,7 +124,9 @@ class WithRecordSamplerMixin:
122
124
  else:
123
125
  processor_data_to_display[processor] = self.processor_artifacts[processor]
124
126
 
125
- seed_column_names = None if hide_seed_columns else self.dataset_metadata.seed_column_names
127
+ seed_column_names = (
128
+ None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names
129
+ )
126
130
 
127
131
  display_sample_record(
128
132
  record=record,
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from enum import Enum
5
7
  from typing import Any
6
8
 
@@ -5,15 +5,19 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  from abc import ABC, abstractmethod
8
+ from typing import TYPE_CHECKING
8
9
 
9
- import pandas as pd
10
10
  from pydantic import BaseModel, model_validator
11
11
  from typing_extensions import Self
12
12
 
13
13
  from data_designer.config.base import ConfigBase
14
14
  from data_designer.config.column_configs import SingleColumnConfig
15
15
  from data_designer.config.column_types import DataDesignerColumnType
16
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, TaskConfigT
16
+ from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT
17
+ from data_designer.lazy_heavy_imports import pd
18
+
19
+ if TYPE_CHECKING:
20
+ import pandas as pd
17
21
 
18
22
  logger = logging.getLogger(__name__)
19
23
 
@@ -32,17 +36,14 @@ class ColumnConfigWithDataFrame(ConfigBase):
32
36
  return (self.column_config, self.df)
33
37
 
34
38
 
35
- class ColumnProfilerMetadata(ConfigurableTaskMetadata):
36
- applicable_column_types: list[DataDesignerColumnType]
37
-
38
-
39
39
  class ColumnProfiler(ConfigurableTask[TaskConfigT], ABC):
40
40
  @staticmethod
41
41
  @abstractmethod
42
- def metadata() -> ColumnProfilerMetadata: ...
42
+ def get_applicable_column_types() -> list[DataDesignerColumnType]:
43
+ """Returns a list of column types that this profiler can be applied to during dataset profiling."""
43
44
 
44
45
  @abstractmethod
45
46
  def profile(self, column_config_with_df: ColumnConfigWithDataFrame) -> BaseModel: ...
46
47
 
47
48
  def _initialize(self) -> None:
48
- logger.info(f"💫 Initializing column profiler: '{self.metadata().name}'")
49
+ logger.info(f"💫 Initializing column profiler: '{self.name}'")
@@ -5,44 +5,41 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
  import random
8
+ from typing import TYPE_CHECKING
8
9
 
9
10
  from data_designer.config.analysis.column_profilers import (
10
11
  JudgeScoreProfilerConfig,
11
12
  JudgeScoreProfilerResults,
12
- JudgeScoreSample,
13
13
  JudgeScoreSummary,
14
14
  )
15
15
  from data_designer.config.analysis.column_statistics import (
16
- CategoricalDistribution,
17
- CategoricalHistogramData,
18
16
  ColumnDistributionType,
19
17
  MissingValue,
20
- NumericalDistribution,
21
- )
22
- from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP, DataDesignerColumnType
23
- from data_designer.engine.analysis.column_profilers.base import (
24
- ColumnConfigWithDataFrame,
25
- ColumnProfiler,
26
- ColumnProfilerMetadata,
27
18
  )
19
+ from data_designer.config.column_types import DataDesignerColumnType
20
+ from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler
28
21
  from data_designer.engine.analysis.utils.judge_score_processing import (
29
22
  extract_judge_score_distributions,
30
23
  sample_scores_and_reasoning,
31
24
  )
32
- from data_designer.engine.models.facade import ModelFacade
33
25
  from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe
34
26
 
27
+ if TYPE_CHECKING:
28
+ from data_designer.config.analysis.column_profilers import JudgeScoreSample
29
+ from data_designer.config.analysis.column_statistics import (
30
+ CategoricalDistribution,
31
+ CategoricalHistogramData,
32
+ NumericalDistribution,
33
+ )
34
+ from data_designer.engine.models.facade import ModelFacade
35
+
35
36
  logger = logging.getLogger(__name__)
36
37
 
37
38
 
38
39
  class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
39
40
  @staticmethod
40
- def metadata() -> ColumnProfilerMetadata:
41
- return ColumnProfilerMetadata(
42
- name="judge_score_profiler",
43
- description="Analyzes LLM-as-judge score distributions in a Data Designer dataset.",
44
- applicable_column_types=[DataDesignerColumnType.LLM_JUDGE],
45
- )
41
+ def get_applicable_column_types() -> list[DataDesignerColumnType]:
42
+ return [DataDesignerColumnType.LLM_JUDGE]
46
43
 
47
44
  def get_model(self, model_alias: str) -> ModelFacade:
48
45
  return self.resource_provider.model_registry.get_model(model_alias=model_alias)
@@ -51,8 +48,7 @@ class JudgeScoreProfiler(ColumnProfiler[JudgeScoreProfilerConfig]):
51
48
  column_config, df = column_config_with_df.as_tuple()
52
49
 
53
50
  logger.info(
54
- f"{COLUMN_TYPE_EMOJI_MAP[column_config.column_type]} Analyzing LLM-as-judge "
55
- f"scores for column: '{column_config.name}'"
51
+ f"{column_config.get_column_emoji()} Analyzing LLM-as-judge scores for column: '{column_config.name}'"
56
52
  )
57
53
 
58
54
  score_summaries = {}
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.analysis.column_profilers import ColumnProfilerType
5
7
  from data_designer.config.base import ConfigBase
6
8
  from data_designer.engine.analysis.column_profilers.base import ColumnProfiler
@@ -4,9 +4,8 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
- from typing import Any, TypeAlias
7
+ from typing import TYPE_CHECKING, Any, TypeAlias
8
8
 
9
- import pandas as pd
10
9
  from pydantic import BaseModel
11
10
  from typing_extensions import Self
12
11
 
@@ -25,6 +24,10 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import (
25
24
  calculate_token_stats,
26
25
  calculate_validation_column_info,
27
26
  )
27
+ from data_designer.lazy_heavy_imports import pd
28
+
29
+ if TYPE_CHECKING:
30
+ import pandas as pd
28
31
 
29
32
  logger = logging.getLogger(__name__)
30
33