data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. data_designer/__init__.py +2 -0
  2. data_designer/_version.py +2 -2
  3. data_designer/cli/__init__.py +2 -0
  4. data_designer/cli/commands/download.py +2 -0
  5. data_designer/cli/commands/list.py +2 -0
  6. data_designer/cli/commands/models.py +2 -0
  7. data_designer/cli/commands/providers.py +2 -0
  8. data_designer/cli/commands/reset.py +2 -0
  9. data_designer/cli/controllers/__init__.py +2 -0
  10. data_designer/cli/controllers/download_controller.py +2 -0
  11. data_designer/cli/controllers/model_controller.py +6 -1
  12. data_designer/cli/controllers/provider_controller.py +6 -1
  13. data_designer/cli/forms/__init__.py +2 -0
  14. data_designer/cli/forms/builder.py +2 -0
  15. data_designer/cli/forms/field.py +2 -0
  16. data_designer/cli/forms/form.py +2 -0
  17. data_designer/cli/forms/model_builder.py +2 -0
  18. data_designer/cli/forms/provider_builder.py +2 -0
  19. data_designer/cli/main.py +2 -0
  20. data_designer/cli/repositories/__init__.py +2 -0
  21. data_designer/cli/repositories/base.py +2 -0
  22. data_designer/cli/repositories/model_repository.py +2 -0
  23. data_designer/cli/repositories/persona_repository.py +2 -0
  24. data_designer/cli/repositories/provider_repository.py +2 -0
  25. data_designer/cli/services/__init__.py +2 -0
  26. data_designer/cli/services/download_service.py +2 -0
  27. data_designer/cli/services/model_service.py +2 -0
  28. data_designer/cli/services/provider_service.py +2 -0
  29. data_designer/cli/ui.py +2 -0
  30. data_designer/cli/utils.py +2 -0
  31. data_designer/config/analysis/column_profilers.py +2 -0
  32. data_designer/config/analysis/column_statistics.py +8 -5
  33. data_designer/config/analysis/dataset_profiler.py +9 -3
  34. data_designer/config/analysis/utils/errors.py +2 -0
  35. data_designer/config/analysis/utils/reporting.py +7 -3
  36. data_designer/config/column_configs.py +77 -7
  37. data_designer/config/column_types.py +33 -36
  38. data_designer/config/dataset_builders.py +2 -0
  39. data_designer/config/default_model_settings.py +1 -0
  40. data_designer/config/errors.py +2 -0
  41. data_designer/config/exports.py +2 -0
  42. data_designer/config/interface.py +3 -2
  43. data_designer/config/models.py +7 -2
  44. data_designer/config/preview_results.py +7 -3
  45. data_designer/config/processors.py +2 -0
  46. data_designer/config/run_config.py +2 -0
  47. data_designer/config/sampler_constraints.py +2 -0
  48. data_designer/config/sampler_params.py +7 -2
  49. data_designer/config/seed.py +2 -0
  50. data_designer/config/seed_source.py +7 -2
  51. data_designer/config/seed_source_types.py +2 -0
  52. data_designer/config/utils/constants.py +2 -0
  53. data_designer/config/utils/errors.py +2 -0
  54. data_designer/config/utils/info.py +2 -0
  55. data_designer/config/utils/io_helpers.py +8 -3
  56. data_designer/config/utils/misc.py +2 -2
  57. data_designer/config/utils/numerical_helpers.py +2 -0
  58. data_designer/config/utils/type_helpers.py +2 -0
  59. data_designer/config/utils/visualization.py +8 -4
  60. data_designer/config/validator_params.py +2 -0
  61. data_designer/engine/analysis/column_profilers/base.py +9 -8
  62. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
  63. data_designer/engine/analysis/column_profilers/registry.py +2 -0
  64. data_designer/engine/analysis/column_statistics.py +5 -2
  65. data_designer/engine/analysis/dataset_profiler.py +12 -9
  66. data_designer/engine/analysis/errors.py +2 -0
  67. data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
  68. data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
  69. data_designer/engine/column_generators/generators/base.py +26 -14
  70. data_designer/engine/column_generators/generators/embedding.py +4 -11
  71. data_designer/engine/column_generators/generators/expression.py +7 -16
  72. data_designer/engine/column_generators/generators/llm_completion.py +11 -37
  73. data_designer/engine/column_generators/generators/samplers.py +8 -14
  74. data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
  75. data_designer/engine/column_generators/generators/validation.py +8 -20
  76. data_designer/engine/column_generators/registry.py +2 -0
  77. data_designer/engine/column_generators/utils/errors.py +2 -0
  78. data_designer/engine/column_generators/utils/generator_classification.py +2 -0
  79. data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
  80. data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
  81. data_designer/engine/compiler.py +3 -6
  82. data_designer/engine/configurable_task.py +12 -13
  83. data_designer/engine/dataset_builders/artifact_storage.py +87 -8
  84. data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
  85. data_designer/engine/dataset_builders/errors.py +2 -0
  86. data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
  87. data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
  88. data_designer/engine/dataset_builders/utils/dag.py +7 -2
  89. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
  90. data_designer/engine/dataset_builders/utils/errors.py +2 -0
  91. data_designer/engine/errors.py +2 -0
  92. data_designer/engine/model_provider.py +2 -0
  93. data_designer/engine/models/errors.py +23 -31
  94. data_designer/engine/models/facade.py +12 -9
  95. data_designer/engine/models/factory.py +42 -0
  96. data_designer/engine/models/litellm_overrides.py +22 -11
  97. data_designer/engine/models/parsers/errors.py +2 -0
  98. data_designer/engine/models/parsers/parser.py +2 -2
  99. data_designer/engine/models/parsers/postprocessors.py +1 -0
  100. data_designer/engine/models/parsers/tag_parsers.py +2 -0
  101. data_designer/engine/models/parsers/types.py +2 -0
  102. data_designer/engine/models/recipes/base.py +2 -0
  103. data_designer/engine/models/recipes/response_recipes.py +2 -0
  104. data_designer/engine/models/registry.py +11 -18
  105. data_designer/engine/models/telemetry.py +6 -2
  106. data_designer/engine/processing/ginja/ast.py +2 -0
  107. data_designer/engine/processing/ginja/environment.py +2 -0
  108. data_designer/engine/processing/ginja/exceptions.py +2 -0
  109. data_designer/engine/processing/ginja/record.py +2 -0
  110. data_designer/engine/processing/gsonschema/exceptions.py +9 -2
  111. data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
  112. data_designer/engine/processing/gsonschema/types.py +2 -0
  113. data_designer/engine/processing/gsonschema/validators.py +10 -6
  114. data_designer/engine/processing/processors/base.py +1 -5
  115. data_designer/engine/processing/processors/drop_columns.py +7 -10
  116. data_designer/engine/processing/processors/registry.py +2 -0
  117. data_designer/engine/processing/processors/schema_transform.py +7 -10
  118. data_designer/engine/processing/utils.py +7 -3
  119. data_designer/engine/registry/base.py +2 -0
  120. data_designer/engine/registry/data_designer_registry.py +2 -0
  121. data_designer/engine/registry/errors.py +2 -0
  122. data_designer/engine/resources/managed_dataset_generator.py +6 -2
  123. data_designer/engine/resources/managed_dataset_repository.py +8 -5
  124. data_designer/engine/resources/managed_storage.py +2 -0
  125. data_designer/engine/resources/resource_provider.py +8 -1
  126. data_designer/engine/resources/seed_reader.py +7 -2
  127. data_designer/engine/sampling_gen/column.py +2 -0
  128. data_designer/engine/sampling_gen/constraints.py +8 -2
  129. data_designer/engine/sampling_gen/data_sources/base.py +10 -7
  130. data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
  131. data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
  132. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
  133. data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
  134. data_designer/engine/sampling_gen/entities/errors.py +2 -0
  135. data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
  136. data_designer/engine/sampling_gen/entities/person.py +2 -0
  137. data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
  138. data_designer/engine/sampling_gen/errors.py +2 -0
  139. data_designer/engine/sampling_gen/generator.py +5 -4
  140. data_designer/engine/sampling_gen/jinja_utils.py +7 -3
  141. data_designer/engine/sampling_gen/people_gen.py +7 -7
  142. data_designer/engine/sampling_gen/person_constants.py +2 -0
  143. data_designer/engine/sampling_gen/schema.py +5 -1
  144. data_designer/engine/sampling_gen/schema_builder.py +2 -0
  145. data_designer/engine/sampling_gen/utils.py +7 -1
  146. data_designer/engine/secret_resolver.py +2 -0
  147. data_designer/engine/validation.py +2 -2
  148. data_designer/engine/validators/__init__.py +2 -0
  149. data_designer/engine/validators/base.py +2 -0
  150. data_designer/engine/validators/local_callable.py +7 -2
  151. data_designer/engine/validators/python.py +7 -1
  152. data_designer/engine/validators/remote.py +7 -1
  153. data_designer/engine/validators/sql.py +8 -3
  154. data_designer/errors.py +2 -0
  155. data_designer/essentials/__init__.py +2 -0
  156. data_designer/interface/data_designer.py +23 -17
  157. data_designer/interface/errors.py +2 -0
  158. data_designer/interface/results.py +5 -2
  159. data_designer/lazy_heavy_imports.py +54 -0
  160. data_designer/logging.py +2 -0
  161. data_designer/plugins/__init__.py +2 -0
  162. data_designer/plugins/errors.py +2 -0
  163. data_designer/plugins/plugin.py +0 -1
  164. data_designer/plugins/registry.py +2 -0
  165. data_designer/plugins/testing/__init__.py +2 -0
  166. data_designer/plugins/testing/stubs.py +21 -43
  167. data_designer/plugins/testing/utils.py +2 -0
  168. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
  169. data_designer-0.3.6.dist-info/RECORD +196 -0
  170. data_designer-0.3.4.dist-info/RECORD +0 -194
  171. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
  172. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
  173. {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
@@ -5,21 +5,32 @@ from __future__ import annotations
5
5
 
6
6
  import random
7
7
  import threading
8
+ from typing import TYPE_CHECKING
8
9
 
9
- import httpx
10
- import litellm
11
- from litellm import RetryPolicy
12
- from litellm.caching.in_memory_cache import InMemoryCache
13
- from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
14
- from litellm.router import Router
10
+ # Import specific litellm submodules needed for class inheritance
11
+ # Note: Class inheritance requires base classes at definition time, so we import these directly.
12
+ # Runtime litellm usage below still benefits from lazy loading via the litellm alias.
13
+ import litellm.caching.in_memory_cache as _litellm_cache
14
+ import litellm.router as _litellm_router
15
15
  from pydantic import BaseModel, Field
16
16
  from typing_extensions import override
17
17
 
18
+ # Use lazy loading for runtime litellm usage (RetryPolicy, utils, etc.)
19
+ from data_designer.lazy_heavy_imports import httpx, litellm
18
20
  from data_designer.logging import quiet_noisy_logger
19
21
 
22
+ if TYPE_CHECKING:
23
+ import httpx
24
+ import litellm
25
+
20
26
  DEFAULT_MAX_CALLBACKS = 1000
21
27
 
22
28
 
29
+ def _get_logging_callback_manager():
30
+ """Lazy accessor for LoggingCallbackManager to avoid loading litellm at import time."""
31
+ return litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager
32
+
33
+
23
34
  class LiteLLMRouterDefaultKwargs(BaseModel):
24
35
  ## Number of seconds to wait initially after a connection
25
36
  ## failure.
@@ -35,15 +46,15 @@ class LiteLLMRouterDefaultKwargs(BaseModel):
35
46
 
36
47
  ## Sets the default retry policy, including the number
37
48
  ## of retries to use in particular scenarios.
38
- retry_policy: RetryPolicy = Field(
39
- default_factory=lambda: RetryPolicy(
49
+ retry_policy: litellm.RetryPolicy = Field(
50
+ default_factory=lambda: litellm.RetryPolicy(
40
51
  RateLimitErrorRetries=3,
41
52
  TimeoutErrorRetries=3,
42
53
  )
43
54
  )
44
55
 
45
56
 
46
- class ThreadSafeCache(InMemoryCache):
57
+ class ThreadSafeCache(_litellm_cache.InMemoryCache):
47
58
  def __init__(self, *args, **kwargs):
48
59
  super().__init__(*args, **kwargs)
49
60
 
@@ -78,7 +89,7 @@ class ThreadSafeCache(InMemoryCache):
78
89
  super().flush_cache()
79
90
 
80
91
 
81
- class CustomRouter(Router):
92
+ class CustomRouter(_litellm_router.Router):
82
93
  def __init__(
83
94
  self,
84
95
  *args,
@@ -155,7 +166,7 @@ def apply_litellm_patches():
155
166
  litellm.in_memory_llm_clients_cache = ThreadSafeCache()
156
167
 
157
168
  # Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
158
- LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
169
+ _get_logging_callback_manager().MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
159
170
 
160
171
  quiet_noisy_logger("httpx")
161
172
  quiet_noisy_logger("LiteLLM")
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
 
5
7
  class ParserException(Exception):
6
8
  """Identifies errors resulting from generic parser errors.
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from functools import reduce
5
7
 
6
8
  import marko
@@ -80,13 +82,11 @@ class LLMResponseParser:
80
82
  code: str
81
83
  syntax: Optional[str] = None
82
84
 
83
-
84
85
  class CodeBlockParser:
85
86
  def __call__(self, element: _Element) -> CodeBlock:
86
87
  # Implementation details...
87
88
  return CodeBlock(code=element.text, syntax=element.get("class"))
88
89
 
89
-
90
90
  parser = LLMResponseParser(
91
91
  tag_parsers={
92
92
  "pre.code": CodeBlockParser(),
@@ -1,6 +1,7 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
4
5
 
5
6
  import json_repair
6
7
  from pydantic import BaseModel, ValidationError
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from lxml.etree import _Element
5
7
 
6
8
  from data_designer.engine.models.parsers.types import CodeBlock, TextBlock
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Any, Protocol, runtime_checkable
5
7
 
6
8
  from lxml.etree import _Element
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import abc
5
7
  from collections.abc import Callable
6
8
  from typing import Generic, TypeVar
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  from collections.abc import Callable
6
8
 
@@ -4,14 +4,17 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import logging
7
+ from collections.abc import Callable
8
+ from typing import TYPE_CHECKING
7
9
 
8
10
  from data_designer.config.models import GenerationType, ModelConfig
9
11
  from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry
10
- from data_designer.engine.models.facade import ModelFacade
11
- from data_designer.engine.models.litellm_overrides import apply_litellm_patches
12
12
  from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats
13
13
  from data_designer.engine.secret_resolver import SecretResolver
14
14
 
15
+ if TYPE_CHECKING:
16
+ from data_designer.engine.models.facade import ModelFacade
17
+
15
18
  logger = logging.getLogger(__name__)
16
19
 
17
20
 
@@ -22,10 +25,12 @@ class ModelRegistry:
22
25
  secret_resolver: SecretResolver,
23
26
  model_provider_registry: ModelProviderRegistry,
24
27
  model_configs: list[ModelConfig] | None = None,
28
+ model_facade_factory: Callable[[ModelConfig, SecretResolver, ModelProviderRegistry], ModelFacade] | None = None,
25
29
  ):
26
30
  self._secret_resolver = secret_resolver
27
31
  self._model_provider_registry = model_provider_registry
28
- self._model_configs = {}
32
+ self._model_facade_factory = model_facade_factory
33
+ self._model_configs: dict[str, ModelConfig] = {}
29
34
  self._models: dict[str, ModelFacade] = {}
30
35
  self._set_model_configs(model_configs)
31
36
 
@@ -136,18 +141,6 @@ class ModelRegistry:
136
141
  # Models are now lazily initialized in get_model() when first requested
137
142
 
138
143
  def _get_model(self, model_config: ModelConfig) -> ModelFacade:
139
- return ModelFacade(model_config, self._secret_resolver, self._model_provider_registry)
140
-
141
-
142
- def create_model_registry(
143
- *,
144
- model_configs: list[ModelConfig] | None = None,
145
- secret_resolver: SecretResolver,
146
- model_provider_registry: ModelProviderRegistry,
147
- ) -> ModelRegistry:
148
- apply_litellm_patches()
149
- return ModelRegistry(
150
- model_configs=model_configs,
151
- secret_resolver=secret_resolver,
152
- model_provider_registry=model_provider_registry,
153
- )
144
+ if self._model_facade_factory is None:
145
+ raise RuntimeError("ModelRegistry was not initialized with a model_facade_factory")
146
+ return self._model_facade_factory(model_config, self._secret_resolver, self._model_provider_registry)
@@ -18,11 +18,15 @@ import platform
18
18
  from dataclasses import dataclass
19
19
  from datetime import datetime, timezone
20
20
  from enum import Enum
21
- from typing import Any, ClassVar
21
+ from typing import TYPE_CHECKING, Any, ClassVar
22
22
 
23
- import httpx
24
23
  from pydantic import BaseModel, Field
25
24
 
25
+ from data_designer.lazy_heavy_imports import httpx
26
+
27
+ if TYPE_CHECKING:
28
+ import httpx
29
+
26
30
  TELEMETRY_ENABLED = os.getenv("NEMO_TELEMETRY_ENABLED", "true").lower() in ("1", "true", "yes")
27
31
  CLIENT_ID = "184482118588404"
28
32
  NEMO_TELEMETRY_VERSION = "nemo-telemetry/1.0"
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from collections import deque
5
7
 
6
8
  from jinja2 import nodes as j_nodes
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import re
5
7
  from collections.abc import Callable
6
8
  from functools import partial, wraps
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import re
5
7
 
6
8
  from jinja2 import TemplateAssertionError
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
 
6
8
  from data_designer.config.utils.io_helpers import serialize_data
@@ -1,8 +1,15 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- from jsonschema import ValidationError
4
+ from __future__ import annotations
5
5
 
6
+ from typing import TYPE_CHECKING
6
7
 
7
- class JSONSchemaValidationError(ValidationError):
8
+ from data_designer.lazy_heavy_imports import jsonschema
9
+
10
+ if TYPE_CHECKING:
11
+ import jsonschema
12
+
13
+
14
+ class JSONSchemaValidationError(jsonschema.ValidationError):
8
15
  """Alias of ValidationError to ease imports."""
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from copy import deepcopy
5
7
  from typing import Any
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Any, TypeVar
5
7
 
6
8
  T_primitive = TypeVar("T_primitive", str, int, float, bool)
@@ -1,19 +1,23 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  import re
6
8
  from copy import deepcopy
7
9
  from decimal import ROUND_HALF_UP, Decimal
8
- from typing import Any, overload
9
-
10
- from jsonschema import Draft202012Validator, ValidationError, validators
10
+ from typing import TYPE_CHECKING, Any, overload
11
11
 
12
12
  from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
13
13
  from data_designer.engine.processing.gsonschema.schema_transformers import forbid_additional_properties
14
14
  from data_designer.engine.processing.gsonschema.types import DataObjectT, JSONSchemaT, T_primitive
15
+ from data_designer.lazy_heavy_imports import jsonschema
16
+
17
+ if TYPE_CHECKING:
18
+ import jsonschema
15
19
 
16
- DEFAULT_JSONSCHEMA_VALIDATOR = Draft202012Validator
20
+ DEFAULT_JSONSCHEMA_VALIDATOR = jsonschema.Draft202012Validator
17
21
 
18
22
  logger = logging.getLogger(__name__)
19
23
 
@@ -69,7 +73,7 @@ def extend_jsonschema_validator_with_pruning(validator):
69
73
  Type[jsonschema.Validator]: A validator class that will
70
74
  prune extra fields.
71
75
  """
72
- return validators.extend(validator, {"additionalProperties": prune_additional_properties})
76
+ return jsonschema.validators.extend(validator, {"additionalProperties": prune_additional_properties})
73
77
 
74
78
 
75
79
  def _get_decimal_info_from_anyof(schema: dict) -> tuple[bool, int | None]:
@@ -190,7 +194,7 @@ def validate(
190
194
 
191
195
  try:
192
196
  validator(schema).validate(final_object)
193
- except ValidationError as exc:
197
+ except jsonschema.ValidationError as exc:
194
198
  raise JSONSchemaValidationError(str(exc)) from exc
195
199
 
196
200
  final_object = normalize_decimal_fields(final_object, schema)
@@ -5,13 +5,9 @@ from __future__ import annotations
5
5
 
6
6
  from abc import ABC, abstractmethod
7
7
 
8
- from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
8
+ from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
9
9
 
10
10
 
11
11
  class Processor(ConfigurableTask[TaskConfigT], ABC):
12
- @staticmethod
13
- @abstractmethod
14
- def metadata() -> ConfigurableTaskMetadata: ...
15
-
16
12
  @abstractmethod
17
13
  def process(self, data: DataT, *, current_batch_number: int | None = None) -> DataT: ...
@@ -1,26 +1,23 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- import logging
4
+ from __future__ import annotations
5
5
 
6
- import pandas as pd
6
+ import logging
7
+ from typing import TYPE_CHECKING
7
8
 
8
9
  from data_designer.config.processors import DropColumnsProcessorConfig
9
- from data_designer.engine.configurable_task import ConfigurableTaskMetadata
10
10
  from data_designer.engine.dataset_builders.artifact_storage import BatchStage
11
11
  from data_designer.engine.processing.processors.base import Processor
12
+ from data_designer.lazy_heavy_imports import pd
13
+
14
+ if TYPE_CHECKING:
15
+ import pandas as pd
12
16
 
13
17
  logger = logging.getLogger(__name__)
14
18
 
15
19
 
16
20
  class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]):
17
- @staticmethod
18
- def metadata() -> ConfigurableTaskMetadata:
19
- return ConfigurableTaskMetadata(
20
- name="drop_columns_processor",
21
- description="Drop columns from the input dataset.",
22
- )
23
-
24
21
  def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
25
22
  logger.info(f"🙈 Dropping columns: {self.config.column_names}")
26
23
  if current_batch_number is not None: # not in preview mode
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.base import ConfigBase
5
7
  from data_designer.config.processors import (
6
8
  DropColumnsProcessorConfig,
@@ -1,29 +1,26 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import json
5
7
  import logging
6
-
7
- import pandas as pd
8
+ from typing import TYPE_CHECKING
8
9
 
9
10
  from data_designer.config.processors import SchemaTransformProcessorConfig
10
- from data_designer.engine.configurable_task import ConfigurableTaskMetadata
11
11
  from data_designer.engine.dataset_builders.artifact_storage import BatchStage
12
12
  from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
13
13
  from data_designer.engine.processing.processors.base import Processor
14
14
  from data_designer.engine.processing.utils import deserialize_json_values
15
+ from data_designer.lazy_heavy_imports import pd
16
+
17
+ if TYPE_CHECKING:
18
+ import pandas as pd
15
19
 
16
20
  logger = logging.getLogger(__name__)
17
21
 
18
22
 
19
23
  class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
20
- @staticmethod
21
- def metadata() -> ConfigurableTaskMetadata:
22
- return ConfigurableTaskMetadata(
23
- name="schema_transform_processor",
24
- description="Generate dataset with transformed schema using a Jinja2 template.",
25
- )
26
-
27
24
  @property
28
25
  def template_as_str(self) -> str:
29
26
  return json.dumps(self.config.template)
@@ -1,13 +1,18 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import ast
5
7
  import json
6
8
  import logging
7
9
  import re
8
- from typing import Any, TypeVar, overload
10
+ from typing import TYPE_CHECKING, Any, TypeVar, overload
11
+
12
+ from data_designer.lazy_heavy_imports import pd
9
13
 
10
- import pandas as pd
14
+ if TYPE_CHECKING:
15
+ import pandas as pd
11
16
 
12
17
  logger = logging.getLogger(__name__)
13
18
 
@@ -52,7 +57,6 @@ def deserialize_json_values(data):
52
57
  - Dictionary (potentially with nested JSON strings to deserialize)
53
58
  - Some other object that can't be deserialized.
54
59
 
55
-
56
60
  Returns:
57
61
  Deserialized data in the corresponding format:
58
62
  - Dictionary (when input is a single string)
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import threading
5
7
  from typing import Any, Generic, TypeVar
6
8
 
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.engine.analysis.column_profilers.registry import (
5
7
  ColumnProfilerRegistry,
6
8
  create_default_column_profiler_registry,
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.engine.errors import DataDesignerError
5
7
 
6
8
 
@@ -1,11 +1,15 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
- from typing import Any
4
+ from __future__ import annotations
5
5
 
6
- import pandas as pd
6
+ from typing import TYPE_CHECKING, Any
7
7
 
8
8
  from data_designer.engine.resources.managed_dataset_repository import ManagedDatasetRepository
9
+ from data_designer.lazy_heavy_imports import pd
10
+
11
+ if TYPE_CHECKING:
12
+ import pandas as pd
9
13
 
10
14
 
11
15
  class ManagedDatasetGenerator:
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  import tempfile
6
8
  import threading
@@ -9,13 +11,15 @@ from abc import ABC, abstractmethod
9
11
  from dataclasses import dataclass
10
12
  from functools import cached_property
11
13
  from pathlib import Path
12
- from typing import Any
13
-
14
- import duckdb
15
- import pandas as pd
14
+ from typing import TYPE_CHECKING, Any
16
15
 
17
16
  from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
18
17
  from data_designer.engine.resources.managed_storage import LocalBlobStorageProvider, ManagedBlobStorage
18
+ from data_designer.lazy_heavy_imports import duckdb, pd
19
+
20
+ if TYPE_CHECKING:
21
+ import duckdb
22
+ import pandas as pd
19
23
 
20
24
  logger = logging.getLogger(__name__)
21
25
 
@@ -52,7 +56,6 @@ class Table:
52
56
 
53
57
  DataCatalog = list[Table]
54
58
 
55
-
56
59
  # For now we hardcode the remote data catalog in code. This make it easier
57
60
  # initialize the data catalog. Eventually we can make this work more
58
61
  # dynamically once this data catalog pattern becomes more widely adopted.
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import logging
5
7
  from abc import ABC, abstractmethod
6
8
  from collections.abc import Iterator
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from data_designer.config.base import ConfigBase
5
7
  from data_designer.config.dataset_metadata import DatasetMetadata
6
8
  from data_designer.config.models import ModelConfig
@@ -9,7 +11,8 @@ from data_designer.config.seed_source import SeedSource
9
11
  from data_designer.config.utils.type_helpers import StrEnum
10
12
  from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
11
13
  from data_designer.engine.model_provider import ModelProviderRegistry
12
- from data_designer.engine.models.registry import ModelRegistry, create_model_registry
14
+ from data_designer.engine.models.factory import create_model_registry
15
+ from data_designer.engine.models.registry import ModelRegistry
13
16
  from data_designer.engine.resources.managed_storage import ManagedBlobStorage, init_managed_blob_storage
14
17
  from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderRegistry
15
18
  from data_designer.engine.secret_resolver import SecretResolver
@@ -51,12 +54,16 @@ def create_resource_provider(
51
54
  seed_dataset_source: SeedSource | None = None,
52
55
  run_config: RunConfig | None = None,
53
56
  ) -> ResourceProvider:
57
+ """Factory function for creating a ResourceProvider instance.
58
+ This function triggers lazy loading of heavy dependencies like litellm.
59
+ """
54
60
  seed_reader = None
55
61
  if seed_dataset_source:
56
62
  seed_reader = seed_reader_registry.get_reader(
57
63
  seed_dataset_source,
58
64
  secret_resolver,
59
65
  )
66
+
60
67
  return ResourceProvider(
61
68
  artifact_storage=artifact_storage,
62
69
  model_registry=create_model_registry(
@@ -1,11 +1,12 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from abc import ABC, abstractmethod
5
7
  from collections.abc import Sequence
6
- from typing import Generic, TypeVar, get_args, get_origin
8
+ from typing import TYPE_CHECKING, Generic, TypeVar, get_args, get_origin
7
9
 
8
- import duckdb
9
10
  from huggingface_hub import HfFileSystem
10
11
  from typing_extensions import Self
11
12
 
@@ -17,6 +18,10 @@ from data_designer.config.seed_source import (
17
18
  )
18
19
  from data_designer.engine.secret_resolver import SecretResolver
19
20
  from data_designer.errors import DataDesignerError
21
+ from data_designer.lazy_heavy_imports import duckdb
22
+
23
+ if TYPE_CHECKING:
24
+ import duckdb
20
25
 
21
26
 
22
27
  class SeedReaderError(DataDesignerError): ...
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  from typing import Any
5
7
 
6
8
  from pydantic import field_serializer, model_validator