data-designer 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/base.py +1 -0
  37. data_designer/config/column_configs.py +77 -7
  38. data_designer/config/column_types.py +33 -36
  39. data_designer/config/dataset_builders.py +2 -0
  40. data_designer/config/dataset_metadata.py +18 -0
  41. data_designer/config/default_model_settings.py +1 -0
  42. data_designer/config/errors.py +2 -0
  43. data_designer/config/exports.py +2 -0
  44. data_designer/config/interface.py +3 -2
  45. data_designer/config/models.py +7 -2
  46. data_designer/config/preview_results.py +9 -1
  47. data_designer/config/processors.py +2 -0
  48. data_designer/config/run_config.py +19 -5
  49. data_designer/config/sampler_constraints.py +2 -0
  50. data_designer/config/sampler_params.py +7 -2
  51. data_designer/config/seed.py +2 -0
  52. data_designer/config/seed_source.py +9 -3
  53. data_designer/config/seed_source_types.py +2 -0
  54. data_designer/config/utils/constants.py +2 -0
  55. data_designer/config/utils/errors.py +2 -0
  56. data_designer/config/utils/info.py +2 -0
  57. data_designer/config/utils/io_helpers.py +8 -3
  58. data_designer/config/utils/misc.py +2 -2
  59. data_designer/config/utils/numerical_helpers.py +2 -0
  60. data_designer/config/utils/type_helpers.py +2 -0
  61. data_designer/config/utils/visualization.py +19 -11
  62. data_designer/config/validator_params.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +9 -8
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  65. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  66. data_designer/engine/analysis/column_statistics.py +5 -2
  67. data_designer/engine/analysis/dataset_profiler.py +12 -9
  68. data_designer/engine/analysis/errors.py +2 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  70. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  71. data_designer/engine/column_generators/generators/base.py +26 -14
  72. data_designer/engine/column_generators/generators/embedding.py +4 -11
  73. data_designer/engine/column_generators/generators/expression.py +7 -16
  74. data_designer/engine/column_generators/generators/llm_completion.py +13 -47
  75. data_designer/engine/column_generators/generators/samplers.py +8 -14
  76. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  77. data_designer/engine/column_generators/generators/validation.py +9 -20
  78. data_designer/engine/column_generators/registry.py +2 -0
  79. data_designer/engine/column_generators/utils/errors.py +2 -0
  80. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  83. data_designer/engine/compiler.py +3 -6
  84. data_designer/engine/configurable_task.py +12 -13
  85. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  86. data_designer/engine/dataset_builders/column_wise_builder.py +34 -35
  87. data_designer/engine/dataset_builders/errors.py +2 -0
  88. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +13 -4
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +35 -25
  93. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  94. data_designer/engine/errors.py +2 -0
  95. data_designer/engine/model_provider.py +2 -0
  96. data_designer/engine/models/errors.py +23 -31
  97. data_designer/engine/models/facade.py +12 -9
  98. data_designer/engine/models/factory.py +42 -0
  99. data_designer/engine/models/litellm_overrides.py +16 -11
  100. data_designer/engine/models/parsers/errors.py +2 -0
  101. data_designer/engine/models/parsers/parser.py +2 -2
  102. data_designer/engine/models/parsers/postprocessors.py +1 -0
  103. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  104. data_designer/engine/models/parsers/types.py +2 -0
  105. data_designer/engine/models/recipes/base.py +2 -0
  106. data_designer/engine/models/recipes/response_recipes.py +2 -0
  107. data_designer/engine/models/registry.py +11 -18
  108. data_designer/engine/models/telemetry.py +6 -2
  109. data_designer/engine/processing/ginja/ast.py +2 -0
  110. data_designer/engine/processing/ginja/environment.py +2 -0
  111. data_designer/engine/processing/ginja/exceptions.py +2 -0
  112. data_designer/engine/processing/ginja/record.py +2 -0
  113. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  114. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  115. data_designer/engine/processing/gsonschema/types.py +2 -0
  116. data_designer/engine/processing/gsonschema/validators.py +10 -6
  117. data_designer/engine/processing/processors/base.py +1 -5
  118. data_designer/engine/processing/processors/drop_columns.py +7 -10
  119. data_designer/engine/processing/processors/registry.py +2 -0
  120. data_designer/engine/processing/processors/schema_transform.py +7 -10
  121. data_designer/engine/processing/utils.py +7 -3
  122. data_designer/engine/registry/base.py +2 -0
  123. data_designer/engine/registry/data_designer_registry.py +2 -0
  124. data_designer/engine/registry/errors.py +2 -0
  125. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  126. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  127. data_designer/engine/resources/managed_storage.py +2 -0
  128. data_designer/engine/resources/resource_provider.py +20 -1
  129. data_designer/engine/resources/seed_reader.py +7 -2
  130. data_designer/engine/sampling_gen/column.py +2 -0
  131. data_designer/engine/sampling_gen/constraints.py +8 -2
  132. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  133. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  134. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  135. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  136. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  137. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  138. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  139. data_designer/engine/sampling_gen/entities/person.py +2 -0
  140. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  141. data_designer/engine/sampling_gen/errors.py +2 -0
  142. data_designer/engine/sampling_gen/generator.py +5 -4
  143. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  144. data_designer/engine/sampling_gen/people_gen.py +7 -7
  145. data_designer/engine/sampling_gen/person_constants.py +2 -0
  146. data_designer/engine/sampling_gen/schema.py +5 -1
  147. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  148. data_designer/engine/sampling_gen/utils.py +7 -1
  149. data_designer/engine/secret_resolver.py +2 -0
  150. data_designer/engine/validation.py +2 -2
  151. data_designer/engine/validators/__init__.py +2 -0
  152. data_designer/engine/validators/base.py +2 -0
  153. data_designer/engine/validators/local_callable.py +7 -2
  154. data_designer/engine/validators/python.py +7 -1
  155. data_designer/engine/validators/remote.py +7 -1
  156. data_designer/engine/validators/sql.py +8 -3
  157. data_designer/errors.py +2 -0
  158. data_designer/essentials/__init__.py +2 -0
  159. data_designer/interface/data_designer.py +36 -39
  160. data_designer/interface/errors.py +2 -0
  161. data_designer/interface/results.py +9 -2
  162. data_designer/lazy_heavy_imports.py +54 -0
  163. data_designer/logging.py +2 -0
  164. data_designer/plugins/__init__.py +2 -0
  165. data_designer/plugins/errors.py +2 -0
  166. data_designer/plugins/plugin.py +0 -1
  167. data_designer/plugins/registry.py +2 -0
  168. data_designer/plugins/testing/__init__.py +2 -0
  169. data_designer/plugins/testing/stubs.py +21 -43
  170. data_designer/plugins/testing/utils.py +2 -0
  171. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/METADATA +19 -4
  172. data_designer-0.3.5.dist-info/RECORD +196 -0
  173. data_designer-0.3.3.dist-info/RECORD +0 -193
  174. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/WHEEL +0 -0
  175. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/entry_points.txt +0 -0
  176. {data_designer-0.3.3.dist-info → data_designer-0.3.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from copy import deepcopy
5
7
 
6
8
  from data_designer.config.column_configs import SamplerColumnConfig
@@ -1,9 +1,15 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import numbers
7
+ from typing import TYPE_CHECKING
8
+
9
+ from data_designer.lazy_heavy_imports import np
5
10
 
6
- import numpy as np
11
+ if TYPE_CHECKING:
12
+ import numpy as np
7
13
 
8
14
 
9
15
  def check_random_state(seed):
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  import logging
6
8
  import os
@@ -19,7 +19,7 @@ from data_designer.config.processors import ProcessorConfigT, ProcessorType
19
19
  from data_designer.config.utils.constants import RICH_CONSOLE_THEME
20
20
  from data_designer.config.utils.misc import (
21
21
  can_run_data_designer_locally,
22
- get_prompt_template_keywords,
22
+ extract_keywords_from_jinja2_template,
23
23
  )
24
24
  from data_designer.config.validator_params import ValidatorType
25
25
  from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated
@@ -302,7 +302,7 @@ def validate_schema_transform_processor(
302
302
  for processor_config in processor_configs:
303
303
  if processor_config.processor_type == ProcessorType.SCHEMA_TRANSFORM:
304
304
  for col, template in processor_config.template.items():
305
- template_keywords = get_prompt_template_keywords(template)
305
+ template_keywords = extract_keywords_from_jinja2_template(template)
306
306
  invalid_keywords = set(template_keywords) - all_column_names
307
307
  if len(invalid_keywords) > 0:
308
308
  invalid_keywords = ", ".join([f"'{k}'" for k in invalid_keywords])
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.engine.validators.base import BaseValidator, ValidationResult
5
7
  from data_designer.engine.validators.local_callable import LocalCallableValidator
6
8
  from data_designer.engine.validators.python import PythonValidator
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from typing import Iterator
6
8
 
@@ -1,14 +1,19 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- import logging
4
+ from __future__ import annotations
5
5
 
6
- import pandas as pd
6
+ import logging
7
+ from typing import TYPE_CHECKING
7
8
 
8
9
  from data_designer.config.validator_params import LocalCallableValidatorParams
9
10
  from data_designer.engine.errors import LocalCallableValidationError
10
11
  from data_designer.engine.processing.gsonschema.validators import validate
11
12
  from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
13
+ from data_designer.lazy_heavy_imports import pd
14
+
15
+ if TYPE_CHECKING:
16
+ import pandas as pd
12
17
 
13
18
  logger = logging.getLogger(__name__)
14
19
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import ast
5
7
  import json
6
8
  import logging
@@ -8,14 +10,18 @@ import subprocess
8
10
  import tempfile
9
11
  from collections import defaultdict
10
12
  from pathlib import Path
13
+ from typing import TYPE_CHECKING
11
14
  from uuid import uuid4
12
15
 
13
- import pandas as pd
14
16
  from pydantic import BaseModel
15
17
  from ruff.__main__ import find_ruff_bin
16
18
 
17
19
  from data_designer.config.validator_params import CodeValidatorParams
18
20
  from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
21
+ from data_designer.lazy_heavy_imports import pd
22
+
23
+ if TYPE_CHECKING:
24
+ import pandas as pd
19
25
 
20
26
  logger = logging.getLogger(__name__)
21
27
 
@@ -1,9 +1,11 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
7
+ from typing import TYPE_CHECKING
5
8
 
6
- import httpx
7
9
  from httpx_retries import Retry, RetryTransport
8
10
 
9
11
  from data_designer.config.validator_params import RemoteValidatorParams
@@ -11,6 +13,10 @@ from data_designer.engine.errors import RemoteValidationSchemaError
11
13
  from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
12
14
  from data_designer.engine.processing.gsonschema.validators import validate
13
15
  from data_designer.engine.validators.base import BaseValidator, ValidationResult
16
+ from data_designer.lazy_heavy_imports import httpx
17
+
18
+ if TYPE_CHECKING:
19
+ import httpx
14
20
 
15
21
  logger = logging.getLogger(__name__)
16
22
 
@@ -1,15 +1,20 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  import re
6
-
7
- import pandas as pd
8
- import sqlfluff
8
+ from typing import TYPE_CHECKING
9
9
 
10
10
  from data_designer.config.utils.code_lang import CodeLang
11
11
  from data_designer.config.validator_params import CodeValidatorParams
12
12
  from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult
13
+ from data_designer.lazy_heavy_imports import pd, sqlfluff
14
+
15
+ if TYPE_CHECKING:
16
+ import pandas as pd
17
+ import sqlfluff
13
18
 
14
19
  sqlfluff_logger = logging.getLogger("sqlfluff")
15
20
  sqlfluff_logger.setLevel(logging.WARNING)
data_designer/errors.py CHANGED
@@ -1,5 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
 
5
7
  class DataDesignerError(Exception): ...
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.default_model_settings import resolve_seed_default_model_settings
5
7
  from data_designer.config.exports import * # noqa: F403
6
8
  from data_designer.config.run_config import RunConfig
@@ -1,13 +1,15 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from pathlib import Path
6
-
7
- import pandas as pd
8
+ from typing import TYPE_CHECKING
8
9
 
9
10
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
10
11
  from data_designer.config.config_builder import DataDesignerConfigBuilder
12
+ from data_designer.config.data_designer_config import DataDesignerConfig
11
13
  from data_designer.config.default_model_settings import (
12
14
  get_default_model_configs,
13
15
  get_default_model_providers_missing_api_keys,
@@ -29,14 +31,10 @@ from data_designer.config.utils.constants import (
29
31
  PREDEFINED_PROVIDERS,
30
32
  )
31
33
  from data_designer.config.utils.info import InfoType, InterfaceInfo
32
- from data_designer.engine.analysis.dataset_profiler import (
33
- DataDesignerDatasetProfiler,
34
- DatasetProfilerConfig,
35
- )
34
+ from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
36
35
  from data_designer.engine.compiler import compile_data_designer_config
37
36
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
38
37
  from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder
39
- from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
40
38
  from data_designer.engine.model_provider import resolve_model_provider_registry
41
39
  from data_designer.engine.resources.managed_storage import init_managed_blob_storage
42
40
  from data_designer.engine.resources.resource_provider import ResourceProvider, create_resource_provider
@@ -56,14 +54,18 @@ from data_designer.engine.secret_resolver import (
56
54
  from data_designer.interface.errors import (
57
55
  DataDesignerGenerationError,
58
56
  DataDesignerProfilingError,
59
- InvalidBufferValueError,
60
57
  )
61
58
  from data_designer.interface.results import DatasetCreationResults
59
+ from data_designer.lazy_heavy_imports import pd
62
60
  from data_designer.logging import RandomEmoji
63
61
  from data_designer.plugins.plugin import PluginType
64
62
  from data_designer.plugins.registry import PluginRegistry
65
63
 
66
- DEFAULT_BUFFER_SIZE = 1000
64
+ if TYPE_CHECKING:
65
+ import pandas as pd
66
+
67
+ logger = logging.getLogger(__name__)
68
+
67
69
 
68
70
  DEFAULT_SECRET_RESOLVER = CompositeResolver([EnvironmentResolver(), PlaintextResolver()])
69
71
 
@@ -75,8 +77,6 @@ DEFAULT_SEED_READERS = [
75
77
  for plugin in PluginRegistry().get_plugins(PluginType.SEED_READER):
76
78
  DEFAULT_SEED_READERS.append(plugin.impl_cls())
77
79
 
78
- logger = logging.getLogger(__name__)
79
-
80
80
 
81
81
  class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
82
82
  """Main interface for creating datasets with Data Designer.
@@ -112,7 +112,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
112
112
  ):
113
113
  self._secret_resolver = secret_resolver or DEFAULT_SECRET_RESOLVER
114
114
  self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
115
- self._buffer_size = DEFAULT_BUFFER_SIZE
116
115
  self._run_config = RunConfig()
117
116
  self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
118
117
  self._model_providers = self._resolve_model_providers(model_providers)
@@ -166,10 +165,10 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
166
165
 
167
166
  resource_provider = self._create_resource_provider(dataset_name, config_builder)
168
167
 
169
- builder = self._create_dataset_builder(config_builder, resource_provider)
168
+ builder = self._create_dataset_builder(config_builder.build(), resource_provider)
170
169
 
171
170
  try:
172
- builder.build(num_records=num_records, buffer_size=self._buffer_size)
171
+ builder.build(num_records=num_records)
173
172
  except Exception as e:
174
173
  raise DataDesignerGenerationError(f"🛑 Error generating dataset: {e}")
175
174
 
@@ -182,10 +181,19 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
182
181
  except Exception as e:
183
182
  raise DataDesignerProfilingError(f"🛑 Error profiling dataset: {e}")
184
183
 
184
+ dataset_metadata = resource_provider.get_dataset_metadata()
185
+
186
+ # Update metadata with column statistics from analysis
187
+ if analysis:
188
+ builder.artifact_storage.update_metadata(
189
+ {"column_statistics": [stat.model_dump(mode="json") for stat in analysis.column_statistics]}
190
+ )
191
+
185
192
  return DatasetCreationResults(
186
193
  artifact_storage=builder.artifact_storage,
187
194
  analysis=analysis,
188
195
  config_builder=config_builder,
196
+ dataset_metadata=dataset_metadata,
189
197
  )
190
198
 
191
199
  def preview(
@@ -211,7 +219,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
211
219
  logger.info(f"{RandomEmoji.previewing()} Preview generation in progress")
212
220
 
213
221
  resource_provider = self._create_resource_provider("preview-dataset", config_builder)
214
- builder = self._create_dataset_builder(config_builder, resource_provider)
222
+ builder = self._create_dataset_builder(config_builder.build(), resource_provider)
215
223
 
216
224
  try:
217
225
  raw_dataset = builder.build_preview(num_records=num_records)
@@ -249,11 +257,15 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
249
257
  ):
250
258
  logger.info(f"{RandomEmoji.success()} Preview complete!")
251
259
 
260
+ # Create dataset metadata from the resource provider
261
+ dataset_metadata = resource_provider.get_dataset_metadata()
262
+
252
263
  return PreviewResults(
253
264
  dataset=processed_dataset,
254
265
  analysis=analysis,
255
266
  processor_artifacts=processor_artifacts,
256
267
  config_builder=config_builder,
268
+ dataset_metadata=dataset_metadata,
257
269
  )
258
270
 
259
271
  def validate(self, config_builder: DataDesignerConfigBuilder) -> None:
@@ -271,7 +283,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
271
283
  InvalidConfigError: If the configuration is invalid.
272
284
  """
273
285
  resource_provider = self._create_resource_provider("validate-configuration", config_builder)
274
- compile_data_designer_config(config_builder, resource_provider)
286
+ compile_data_designer_config(config_builder.build(), resource_provider)
275
287
 
276
288
  def get_default_model_configs(self) -> list[ModelConfig]:
277
289
  """Get the default model configurations.
@@ -300,34 +312,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
300
312
  """
301
313
  return self._secret_resolver
302
314
 
303
- def set_buffer_size(self, buffer_size: int) -> None:
304
- """Set the buffer size for dataset generation.
305
-
306
- The buffer size controls how many records are processed in memory at once
307
- during dataset generation using the `create` method. The default value is
308
- set to the constant `DEFAULT_BUFFER_SIZE` defined in the data_designer module.
309
-
310
- Args:
311
- buffer_size: Number of records to process in each buffer.
312
-
313
- Raises:
314
- InvalidBufferValueError: If buffer size is less than or equal to 0.
315
- """
316
- if buffer_size <= 0:
317
- raise InvalidBufferValueError("Buffer size must be greater than 0.")
318
- self._buffer_size = buffer_size
319
-
320
315
  def set_run_config(self, run_config: RunConfig) -> None:
321
316
  """Set the runtime configuration for dataset generation.
322
317
 
323
318
  Args:
324
319
  run_config: A RunConfig instance containing runtime settings such as
325
- early shutdown behavior. Import RunConfig from data_designer.essentials.
320
+ early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
321
+ data_designer.essentials.
326
322
 
327
323
  Example:
328
324
  >>> from data_designer.essentials import DataDesigner, RunConfig
329
325
  >>> dd = DataDesigner()
330
326
  >>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
327
+
328
+ Notes:
329
+ When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
330
+ due to error-rate thresholds. Errors are still tracked for reporting.
331
331
  """
332
332
  self._run_config = run_config
333
333
 
@@ -348,14 +348,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
348
348
 
349
349
  def _create_dataset_builder(
350
350
  self,
351
- config_builder: DataDesignerConfigBuilder,
351
+ data_designer_config: DataDesignerConfig,
352
352
  resource_provider: ResourceProvider,
353
353
  ) -> ColumnWiseDatasetBuilder:
354
- config = compile_data_designer_config(config_builder, resource_provider)
355
-
356
354
  return ColumnWiseDatasetBuilder(
357
- column_configs=compile_dataset_builder_column_configs(config),
358
- processor_configs=config.processors or [],
355
+ data_designer_config=data_designer_config,
359
356
  resource_provider=resource_provider,
360
357
  )
361
358
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -4,14 +4,18 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  from pathlib import Path
7
-
8
- import pandas as pd
7
+ from typing import TYPE_CHECKING
9
8
 
10
9
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
11
10
  from data_designer.config.config_builder import DataDesignerConfigBuilder
11
+ from data_designer.config.dataset_metadata import DatasetMetadata
12
12
  from data_designer.config.utils.visualization import WithRecordSamplerMixin
13
13
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
14
14
  from data_designer.engine.dataset_builders.errors import ArtifactStorageError
15
+ from data_designer.lazy_heavy_imports import pd
16
+
17
+ if TYPE_CHECKING:
18
+ import pandas as pd
15
19
 
16
20
 
17
21
  class DatasetCreationResults(WithRecordSamplerMixin):
@@ -28,6 +32,7 @@ class DatasetCreationResults(WithRecordSamplerMixin):
28
32
  artifact_storage: ArtifactStorage,
29
33
  analysis: DatasetProfilerResults,
30
34
  config_builder: DataDesignerConfigBuilder,
35
+ dataset_metadata: DatasetMetadata,
31
36
  ):
32
37
  """Creates a new instance with results based on a dataset creation run.
33
38
 
@@ -35,10 +40,12 @@ class DatasetCreationResults(WithRecordSamplerMixin):
35
40
  artifact_storage: Storage manager for accessing generated artifacts.
36
41
  analysis: Profiling results for the generated dataset.
37
42
  config_builder: Configuration builder used to create the dataset.
43
+ dataset_metadata: Metadata about the generated dataset (e.g., seed column names).
38
44
  """
39
45
  self.artifact_storage = artifact_storage
40
46
  self._analysis = analysis
41
47
  self._config_builder = config_builder
48
+ self.dataset_metadata = dataset_metadata
42
49
 
43
50
  def load_analysis(self) -> DatasetProfilerResults:
44
51
  """Load the profiling analysis results for the generated dataset.
@@ -0,0 +1,54 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Lazy imports facade for heavy third-party dependencies.
6
+
7
+ This module provides a centralized facade that lazily imports heavy dependencies
8
+ only when accessed, significantly improving import performance.
9
+
10
+ Usage:
11
+ from data_designer.lazy_heavy_imports import pd, np, faker, litellm
12
+
13
+ df = pd.DataFrame(...)
14
+ arr = np.array([1, 2, 3])
15
+ fake = faker.Faker()
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import importlib
21
+
22
+ # Mapping of lazy import names to their actual module paths
23
+ _LAZY_IMPORTS = {
24
+ "pd": "pandas",
25
+ "np": "numpy",
26
+ "pq": "pyarrow.parquet",
27
+ "pa": "pyarrow",
28
+ "faker": "faker",
29
+ "litellm": "litellm",
30
+ "sqlfluff": "sqlfluff",
31
+ "httpx": "httpx",
32
+ "duckdb": "duckdb",
33
+ "nx": "networkx",
34
+ "scipy": "scipy",
35
+ "jsonschema": "jsonschema",
36
+ }
37
+
38
+
39
+ def __getattr__(name: str) -> object:
40
+ """Lazily import heavy third-party dependencies when accessed.
41
+
42
+ This allows fast imports of data_designer while deferring loading of heavy
43
+ libraries until they're actually needed.
44
+ """
45
+ if name in _LAZY_IMPORTS:
46
+ module_name = _LAZY_IMPORTS[name]
47
+ return importlib.import_module(module_name)
48
+
49
+ raise AttributeError(f"module 'data_designer.lazy_heavy_imports' has no attribute {name!r}")
50
+
51
+
52
+ def __dir__() -> list[str]:
53
+ """Return list of available lazy imports."""
54
+ return list(_LAZY_IMPORTS.keys())
data_designer/logging.py CHANGED
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  import random
6
8
  import sys
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.plugins.plugin import Plugin, PluginType
5
7
 
6
8
  __all__ = ["Plugin", "PluginType"]
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.errors import DataDesignerError
5
7
 
6
8
 
@@ -70,7 +70,6 @@ class Plugin(BaseModel):
70
70
  ..., description="The fully-qualified name o the config class object, e.g. 'my_plugin.config.MyConfig'"
71
71
  )
72
72
  plugin_type: PluginType = Field(..., description="The type of plugin")
73
- emoji: str = Field(default="🔌", description="The emoji to use in logs related to the plugin")
74
73
 
75
74
  @property
76
75
  def config_type_as_class_name(self) -> str:
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  import os
6
8
  import threading
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.plugins.testing.utils import assert_valid_plugin
5
7
 
6
8
  __all__ = [
@@ -1,11 +1,13 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Literal
5
7
 
6
8
  from data_designer.config.base import ConfigBase
7
9
  from data_designer.config.column_configs import SingleColumnConfig
8
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata
10
+ from data_designer.engine.column_generators.generators.base import ColumnGeneratorCellByCell
9
11
  from data_designer.plugins.plugin import Plugin, PluginType
10
12
 
11
13
  MODULE_NAME = __name__
@@ -18,15 +20,11 @@ class ValidTestConfig(SingleColumnConfig):
18
20
  name: str
19
21
 
20
22
 
21
- class ValidTestTask(ConfigurableTask[ValidTestConfig]):
23
+ class ValidTestTask(ColumnGeneratorCellByCell[ValidTestConfig]):
22
24
  """Valid task for testing plugin creation."""
23
25
 
24
- @staticmethod
25
- def metadata() -> ConfigurableTaskMetadata:
26
- return ConfigurableTaskMetadata(
27
- name="test_generator",
28
- description="Test generator",
29
- )
26
+ def generate(self, data: dict) -> dict:
27
+ return data
30
28
 
31
29
 
32
30
  class ConfigWithoutDiscriminator(ConfigBase):
@@ -53,22 +51,14 @@ class StubPluginConfigB(SingleColumnConfig):
53
51
  column_type: Literal["test-plugin-b"] = "test-plugin-b"
54
52
 
55
53
 
56
- class StubPluginTaskA(ConfigurableTask[StubPluginConfigA]):
57
- @staticmethod
58
- def metadata() -> ConfigurableTaskMetadata:
59
- return ConfigurableTaskMetadata(
60
- name="test_plugin_a",
61
- description="Test plugin A",
62
- )
54
+ class StubPluginTaskA(ColumnGeneratorCellByCell[StubPluginConfigA]):
55
+ def generate(self, data: dict) -> dict:
56
+ return data
63
57
 
64
58
 
65
- class StubPluginTaskB(ConfigurableTask[StubPluginConfigB]):
66
- @staticmethod
67
- def metadata() -> ConfigurableTaskMetadata:
68
- return ConfigurableTaskMetadata(
69
- name="test_plugin_b",
70
- description="Test plugin B",
71
- )
59
+ class StubPluginTaskB(ColumnGeneratorCellByCell[StubPluginConfigB]):
60
+ def generate(self, data: dict) -> dict:
61
+ return data
72
62
 
73
63
 
74
64
  # Stub plugins requiring different combinations of resources
@@ -86,31 +76,19 @@ class StubPluginConfigBlobsAndSeeds(SingleColumnConfig):
86
76
  column_type: Literal["test-plugin-blobs-and-seeds"] = "test-plugin-blobs-and-seeds"
87
77
 
88
78
 
89
- class StubPluginTaskModels(ConfigurableTask[StubPluginConfigModels]):
90
- @staticmethod
91
- def metadata() -> ConfigurableTaskMetadata:
92
- return ConfigurableTaskMetadata(
93
- name="test_plugin_models",
94
- description="Test plugin requiring models",
95
- )
79
+ class StubPluginTaskModels(ColumnGeneratorCellByCell[StubPluginConfigModels]):
80
+ def generate(self, data: dict) -> dict:
81
+ return data
96
82
 
97
83
 
98
- class StubPluginTaskModelsAndBlobs(ConfigurableTask[StubPluginConfigModelsAndBlobs]):
99
- @staticmethod
100
- def metadata() -> ConfigurableTaskMetadata:
101
- return ConfigurableTaskMetadata(
102
- name="test_plugin_models_and_blobs",
103
- description="Test plugin requiring models and blobs",
104
- )
84
+ class StubPluginTaskModelsAndBlobs(ColumnGeneratorCellByCell[StubPluginConfigModelsAndBlobs]):
85
+ def generate(self, data: dict) -> dict:
86
+ return data
105
87
 
106
88
 
107
- class StubPluginTaskBlobsAndSeeds(ConfigurableTask[StubPluginConfigBlobsAndSeeds]):
108
- @staticmethod
109
- def metadata() -> ConfigurableTaskMetadata:
110
- return ConfigurableTaskMetadata(
111
- name="test_plugin_blobs_and_seeds",
112
- description="Test plugin requiring blobs and seeds",
113
- )
89
+ class StubPluginTaskBlobsAndSeeds(ColumnGeneratorCellByCell[StubPluginConfigBlobsAndSeeds]):
90
+ def generate(self, data: dict) -> dict:
91
+ return data
114
92
 
115
93
 
116
94
  plugin_none = Plugin(
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.base import ConfigBase
5
7
  from data_designer.engine.configurable_task import ConfigurableTask
6
8
  from data_designer.engine.resources.seed_reader import SeedReader