data-designer 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/__init__.py +2 -0
- data_designer/_version.py +2 -2
- data_designer/cli/__init__.py +2 -0
- data_designer/cli/commands/download.py +2 -0
- data_designer/cli/commands/list.py +2 -0
- data_designer/cli/commands/models.py +2 -0
- data_designer/cli/commands/providers.py +2 -0
- data_designer/cli/commands/reset.py +2 -0
- data_designer/cli/controllers/__init__.py +2 -0
- data_designer/cli/controllers/download_controller.py +2 -0
- data_designer/cli/controllers/model_controller.py +6 -1
- data_designer/cli/controllers/provider_controller.py +6 -1
- data_designer/cli/forms/__init__.py +2 -0
- data_designer/cli/forms/builder.py +2 -0
- data_designer/cli/forms/field.py +2 -0
- data_designer/cli/forms/form.py +2 -0
- data_designer/cli/forms/model_builder.py +2 -0
- data_designer/cli/forms/provider_builder.py +2 -0
- data_designer/cli/main.py +2 -0
- data_designer/cli/repositories/__init__.py +2 -0
- data_designer/cli/repositories/base.py +2 -0
- data_designer/cli/repositories/model_repository.py +2 -0
- data_designer/cli/repositories/persona_repository.py +2 -0
- data_designer/cli/repositories/provider_repository.py +2 -0
- data_designer/cli/services/__init__.py +2 -0
- data_designer/cli/services/download_service.py +2 -0
- data_designer/cli/services/model_service.py +2 -0
- data_designer/cli/services/provider_service.py +2 -0
- data_designer/cli/ui.py +2 -0
- data_designer/cli/utils.py +2 -0
- data_designer/config/analysis/column_profilers.py +2 -0
- data_designer/config/analysis/column_statistics.py +8 -5
- data_designer/config/analysis/dataset_profiler.py +9 -3
- data_designer/config/analysis/utils/errors.py +2 -0
- data_designer/config/analysis/utils/reporting.py +7 -3
- data_designer/config/column_configs.py +77 -7
- data_designer/config/column_types.py +33 -36
- data_designer/config/dataset_builders.py +2 -0
- data_designer/config/default_model_settings.py +1 -0
- data_designer/config/errors.py +2 -0
- data_designer/config/exports.py +2 -0
- data_designer/config/interface.py +3 -2
- data_designer/config/models.py +7 -2
- data_designer/config/preview_results.py +7 -3
- data_designer/config/processors.py +2 -0
- data_designer/config/run_config.py +2 -0
- data_designer/config/sampler_constraints.py +2 -0
- data_designer/config/sampler_params.py +7 -2
- data_designer/config/seed.py +2 -0
- data_designer/config/seed_source.py +7 -2
- data_designer/config/seed_source_types.py +2 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/config/utils/errors.py +2 -0
- data_designer/config/utils/info.py +2 -0
- data_designer/config/utils/io_helpers.py +8 -3
- data_designer/config/utils/misc.py +2 -2
- data_designer/config/utils/numerical_helpers.py +2 -0
- data_designer/config/utils/type_helpers.py +2 -0
- data_designer/config/utils/visualization.py +8 -4
- data_designer/config/validator_params.py +2 -0
- data_designer/engine/analysis/column_profilers/base.py +9 -8
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +15 -19
- data_designer/engine/analysis/column_profilers/registry.py +2 -0
- data_designer/engine/analysis/column_statistics.py +5 -2
- data_designer/engine/analysis/dataset_profiler.py +12 -9
- data_designer/engine/analysis/errors.py +2 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +7 -4
- data_designer/engine/analysis/utils/judge_score_processing.py +7 -3
- data_designer/engine/column_generators/generators/base.py +26 -14
- data_designer/engine/column_generators/generators/embedding.py +4 -11
- data_designer/engine/column_generators/generators/expression.py +7 -16
- data_designer/engine/column_generators/generators/llm_completion.py +11 -37
- data_designer/engine/column_generators/generators/samplers.py +8 -14
- data_designer/engine/column_generators/generators/seed_dataset.py +9 -15
- data_designer/engine/column_generators/generators/validation.py +8 -20
- data_designer/engine/column_generators/registry.py +2 -0
- data_designer/engine/column_generators/utils/errors.py +2 -0
- data_designer/engine/column_generators/utils/generator_classification.py +2 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +2 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +4 -2
- data_designer/engine/compiler.py +3 -6
- data_designer/engine/configurable_task.py +12 -13
- data_designer/engine/dataset_builders/artifact_storage.py +87 -8
- data_designer/engine/dataset_builders/column_wise_builder.py +32 -34
- data_designer/engine/dataset_builders/errors.py +2 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +2 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +2 -0
- data_designer/engine/dataset_builders/utils/dag.py +7 -2
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +9 -6
- data_designer/engine/dataset_builders/utils/errors.py +2 -0
- data_designer/engine/errors.py +2 -0
- data_designer/engine/model_provider.py +2 -0
- data_designer/engine/models/errors.py +23 -31
- data_designer/engine/models/facade.py +12 -9
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +22 -11
- data_designer/engine/models/parsers/errors.py +2 -0
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/parsers/postprocessors.py +1 -0
- data_designer/engine/models/parsers/tag_parsers.py +2 -0
- data_designer/engine/models/parsers/types.py +2 -0
- data_designer/engine/models/recipes/base.py +2 -0
- data_designer/engine/models/recipes/response_recipes.py +2 -0
- data_designer/engine/models/registry.py +11 -18
- data_designer/engine/models/telemetry.py +6 -2
- data_designer/engine/processing/ginja/ast.py +2 -0
- data_designer/engine/processing/ginja/environment.py +2 -0
- data_designer/engine/processing/ginja/exceptions.py +2 -0
- data_designer/engine/processing/ginja/record.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +9 -2
- data_designer/engine/processing/gsonschema/schema_transformers.py +2 -0
- data_designer/engine/processing/gsonschema/types.py +2 -0
- data_designer/engine/processing/gsonschema/validators.py +10 -6
- data_designer/engine/processing/processors/base.py +1 -5
- data_designer/engine/processing/processors/drop_columns.py +7 -10
- data_designer/engine/processing/processors/registry.py +2 -0
- data_designer/engine/processing/processors/schema_transform.py +7 -10
- data_designer/engine/processing/utils.py +7 -3
- data_designer/engine/registry/base.py +2 -0
- data_designer/engine/registry/data_designer_registry.py +2 -0
- data_designer/engine/registry/errors.py +2 -0
- data_designer/engine/resources/managed_dataset_generator.py +6 -2
- data_designer/engine/resources/managed_dataset_repository.py +8 -5
- data_designer/engine/resources/managed_storage.py +2 -0
- data_designer/engine/resources/resource_provider.py +8 -1
- data_designer/engine/resources/seed_reader.py +7 -2
- data_designer/engine/sampling_gen/column.py +2 -0
- data_designer/engine/sampling_gen/constraints.py +8 -2
- data_designer/engine/sampling_gen/data_sources/base.py +10 -7
- data_designer/engine/sampling_gen/data_sources/errors.py +2 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +27 -22
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +2 -2
- data_designer/engine/sampling_gen/entities/email_address_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/errors.py +2 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +2 -0
- data_designer/engine/sampling_gen/entities/person.py +2 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +8 -1
- data_designer/engine/sampling_gen/errors.py +2 -0
- data_designer/engine/sampling_gen/generator.py +5 -4
- data_designer/engine/sampling_gen/jinja_utils.py +7 -3
- data_designer/engine/sampling_gen/people_gen.py +7 -7
- data_designer/engine/sampling_gen/person_constants.py +2 -0
- data_designer/engine/sampling_gen/schema.py +5 -1
- data_designer/engine/sampling_gen/schema_builder.py +2 -0
- data_designer/engine/sampling_gen/utils.py +7 -1
- data_designer/engine/secret_resolver.py +2 -0
- data_designer/engine/validation.py +2 -2
- data_designer/engine/validators/__init__.py +2 -0
- data_designer/engine/validators/base.py +2 -0
- data_designer/engine/validators/local_callable.py +7 -2
- data_designer/engine/validators/python.py +7 -1
- data_designer/engine/validators/remote.py +7 -1
- data_designer/engine/validators/sql.py +8 -3
- data_designer/errors.py +2 -0
- data_designer/essentials/__init__.py +2 -0
- data_designer/interface/data_designer.py +23 -17
- data_designer/interface/errors.py +2 -0
- data_designer/interface/results.py +5 -2
- data_designer/lazy_heavy_imports.py +54 -0
- data_designer/logging.py +2 -0
- data_designer/plugins/__init__.py +2 -0
- data_designer/plugins/errors.py +2 -0
- data_designer/plugins/plugin.py +0 -1
- data_designer/plugins/registry.py +2 -0
- data_designer/plugins/testing/__init__.py +2 -0
- data_designer/plugins/testing/stubs.py +21 -43
- data_designer/plugins/testing/utils.py +2 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/METADATA +12 -5
- data_designer-0.3.6.dist-info/RECORD +196 -0
- data_designer-0.3.4.dist-info/RECORD +0 -194
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/WHEEL +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.4.dist-info → data_designer-0.3.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,21 +5,32 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import random
|
|
7
7
|
import threading
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
import
|
|
11
|
-
from litellm
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
from litellm.router import Router
|
|
10
|
+
# Import specific litellm submodules needed for class inheritance
|
|
11
|
+
# Note: Class inheritance requires base classes at definition time, so we import these directly.
|
|
12
|
+
# Runtime litellm usage below still benefits from lazy loading via the litellm alias.
|
|
13
|
+
import litellm.caching.in_memory_cache as _litellm_cache
|
|
14
|
+
import litellm.router as _litellm_router
|
|
15
15
|
from pydantic import BaseModel, Field
|
|
16
16
|
from typing_extensions import override
|
|
17
17
|
|
|
18
|
+
# Use lazy loading for runtime litellm usage (RetryPolicy, utils, etc.)
|
|
19
|
+
from data_designer.lazy_heavy_imports import httpx, litellm
|
|
18
20
|
from data_designer.logging import quiet_noisy_logger
|
|
19
21
|
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
import httpx
|
|
24
|
+
import litellm
|
|
25
|
+
|
|
20
26
|
DEFAULT_MAX_CALLBACKS = 1000
|
|
21
27
|
|
|
22
28
|
|
|
29
|
+
def _get_logging_callback_manager():
|
|
30
|
+
"""Lazy accessor for LoggingCallbackManager to avoid loading litellm at import time."""
|
|
31
|
+
return litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager
|
|
32
|
+
|
|
33
|
+
|
|
23
34
|
class LiteLLMRouterDefaultKwargs(BaseModel):
|
|
24
35
|
## Number of seconds to wait initially after a connection
|
|
25
36
|
## failure.
|
|
@@ -35,15 +46,15 @@ class LiteLLMRouterDefaultKwargs(BaseModel):
|
|
|
35
46
|
|
|
36
47
|
## Sets the default retry policy, including the number
|
|
37
48
|
## of retries to use in particular scenarios.
|
|
38
|
-
retry_policy: RetryPolicy = Field(
|
|
39
|
-
default_factory=lambda: RetryPolicy(
|
|
49
|
+
retry_policy: litellm.RetryPolicy = Field(
|
|
50
|
+
default_factory=lambda: litellm.RetryPolicy(
|
|
40
51
|
RateLimitErrorRetries=3,
|
|
41
52
|
TimeoutErrorRetries=3,
|
|
42
53
|
)
|
|
43
54
|
)
|
|
44
55
|
|
|
45
56
|
|
|
46
|
-
class ThreadSafeCache(InMemoryCache):
|
|
57
|
+
class ThreadSafeCache(_litellm_cache.InMemoryCache):
|
|
47
58
|
def __init__(self, *args, **kwargs):
|
|
48
59
|
super().__init__(*args, **kwargs)
|
|
49
60
|
|
|
@@ -78,7 +89,7 @@ class ThreadSafeCache(InMemoryCache):
|
|
|
78
89
|
super().flush_cache()
|
|
79
90
|
|
|
80
91
|
|
|
81
|
-
class CustomRouter(Router):
|
|
92
|
+
class CustomRouter(_litellm_router.Router):
|
|
82
93
|
def __init__(
|
|
83
94
|
self,
|
|
84
95
|
*args,
|
|
@@ -155,7 +166,7 @@ def apply_litellm_patches():
|
|
|
155
166
|
litellm.in_memory_llm_clients_cache = ThreadSafeCache()
|
|
156
167
|
|
|
157
168
|
# Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
|
|
158
|
-
|
|
169
|
+
_get_logging_callback_manager().MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
|
|
159
170
|
|
|
160
171
|
quiet_noisy_logger("httpx")
|
|
161
172
|
quiet_noisy_logger("LiteLLM")
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
|
|
5
7
|
class ParserException(Exception):
|
|
6
8
|
"""Identifies errors resulting from generic parser errors.
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from functools import reduce
|
|
5
7
|
|
|
6
8
|
import marko
|
|
@@ -80,13 +82,11 @@ class LLMResponseParser:
|
|
|
80
82
|
code: str
|
|
81
83
|
syntax: Optional[str] = None
|
|
82
84
|
|
|
83
|
-
|
|
84
85
|
class CodeBlockParser:
|
|
85
86
|
def __call__(self, element: _Element) -> CodeBlock:
|
|
86
87
|
# Implementation details...
|
|
87
88
|
return CodeBlock(code=element.text, syntax=element.get("class"))
|
|
88
89
|
|
|
89
|
-
|
|
90
90
|
parser = LLMResponseParser(
|
|
91
91
|
tag_parsers={
|
|
92
92
|
"pre.code": CodeBlockParser(),
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from lxml.etree import _Element
|
|
5
7
|
|
|
6
8
|
from data_designer.engine.models.parsers.types import CodeBlock, TextBlock
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Any, Protocol, runtime_checkable
|
|
5
7
|
|
|
6
8
|
from lxml.etree import _Element
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import abc
|
|
5
7
|
from collections.abc import Callable
|
|
6
8
|
from typing import Generic, TypeVar
|
|
@@ -4,14 +4,17 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
7
9
|
|
|
8
10
|
from data_designer.config.models import GenerationType, ModelConfig
|
|
9
11
|
from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry
|
|
10
|
-
from data_designer.engine.models.facade import ModelFacade
|
|
11
|
-
from data_designer.engine.models.litellm_overrides import apply_litellm_patches
|
|
12
12
|
from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats
|
|
13
13
|
from data_designer.engine.secret_resolver import SecretResolver
|
|
14
14
|
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from data_designer.engine.models.facade import ModelFacade
|
|
17
|
+
|
|
15
18
|
logger = logging.getLogger(__name__)
|
|
16
19
|
|
|
17
20
|
|
|
@@ -22,10 +25,12 @@ class ModelRegistry:
|
|
|
22
25
|
secret_resolver: SecretResolver,
|
|
23
26
|
model_provider_registry: ModelProviderRegistry,
|
|
24
27
|
model_configs: list[ModelConfig] | None = None,
|
|
28
|
+
model_facade_factory: Callable[[ModelConfig, SecretResolver, ModelProviderRegistry], ModelFacade] | None = None,
|
|
25
29
|
):
|
|
26
30
|
self._secret_resolver = secret_resolver
|
|
27
31
|
self._model_provider_registry = model_provider_registry
|
|
28
|
-
self.
|
|
32
|
+
self._model_facade_factory = model_facade_factory
|
|
33
|
+
self._model_configs: dict[str, ModelConfig] = {}
|
|
29
34
|
self._models: dict[str, ModelFacade] = {}
|
|
30
35
|
self._set_model_configs(model_configs)
|
|
31
36
|
|
|
@@ -136,18 +141,6 @@ class ModelRegistry:
|
|
|
136
141
|
# Models are now lazily initialized in get_model() when first requested
|
|
137
142
|
|
|
138
143
|
def _get_model(self, model_config: ModelConfig) -> ModelFacade:
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def create_model_registry(
|
|
143
|
-
*,
|
|
144
|
-
model_configs: list[ModelConfig] | None = None,
|
|
145
|
-
secret_resolver: SecretResolver,
|
|
146
|
-
model_provider_registry: ModelProviderRegistry,
|
|
147
|
-
) -> ModelRegistry:
|
|
148
|
-
apply_litellm_patches()
|
|
149
|
-
return ModelRegistry(
|
|
150
|
-
model_configs=model_configs,
|
|
151
|
-
secret_resolver=secret_resolver,
|
|
152
|
-
model_provider_registry=model_provider_registry,
|
|
153
|
-
)
|
|
144
|
+
if self._model_facade_factory is None:
|
|
145
|
+
raise RuntimeError("ModelRegistry was not initialized with a model_facade_factory")
|
|
146
|
+
return self._model_facade_factory(model_config, self._secret_resolver, self._model_provider_registry)
|
|
@@ -18,11 +18,15 @@ import platform
|
|
|
18
18
|
from dataclasses import dataclass
|
|
19
19
|
from datetime import datetime, timezone
|
|
20
20
|
from enum import Enum
|
|
21
|
-
from typing import Any, ClassVar
|
|
21
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
22
22
|
|
|
23
|
-
import httpx
|
|
24
23
|
from pydantic import BaseModel, Field
|
|
25
24
|
|
|
25
|
+
from data_designer.lazy_heavy_imports import httpx
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
import httpx
|
|
29
|
+
|
|
26
30
|
TELEMETRY_ENABLED = os.getenv("NEMO_TELEMETRY_ENABLED", "true").lower() in ("1", "true", "yes")
|
|
27
31
|
CLIENT_ID = "184482118588404"
|
|
28
32
|
NEMO_TELEMETRY_VERSION = "nemo-telemetry/1.0"
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import re
|
|
5
7
|
from collections.abc import Callable
|
|
6
8
|
from functools import partial, wraps
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import json
|
|
5
7
|
|
|
6
8
|
from data_designer.config.utils.io_helpers import serialize_data
|
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
6
7
|
|
|
7
|
-
|
|
8
|
+
from data_designer.lazy_heavy_imports import jsonschema
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
import jsonschema
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JSONSchemaValidationError(jsonschema.ValidationError):
|
|
8
15
|
"""Alias of ValidationError to ease imports."""
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Any, TypeVar
|
|
5
7
|
|
|
6
8
|
T_primitive = TypeVar("T_primitive", str, int, float, bool)
|
|
@@ -1,19 +1,23 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
import re
|
|
6
8
|
from copy import deepcopy
|
|
7
9
|
from decimal import ROUND_HALF_UP, Decimal
|
|
8
|
-
from typing import Any, overload
|
|
9
|
-
|
|
10
|
-
from jsonschema import Draft202012Validator, ValidationError, validators
|
|
10
|
+
from typing import TYPE_CHECKING, Any, overload
|
|
11
11
|
|
|
12
12
|
from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError
|
|
13
13
|
from data_designer.engine.processing.gsonschema.schema_transformers import forbid_additional_properties
|
|
14
14
|
from data_designer.engine.processing.gsonschema.types import DataObjectT, JSONSchemaT, T_primitive
|
|
15
|
+
from data_designer.lazy_heavy_imports import jsonschema
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import jsonschema
|
|
15
19
|
|
|
16
|
-
DEFAULT_JSONSCHEMA_VALIDATOR = Draft202012Validator
|
|
20
|
+
DEFAULT_JSONSCHEMA_VALIDATOR = jsonschema.Draft202012Validator
|
|
17
21
|
|
|
18
22
|
logger = logging.getLogger(__name__)
|
|
19
23
|
|
|
@@ -69,7 +73,7 @@ def extend_jsonschema_validator_with_pruning(validator):
|
|
|
69
73
|
Type[jsonschema.Validator]: A validator class that will
|
|
70
74
|
prune extra fields.
|
|
71
75
|
"""
|
|
72
|
-
return validators.extend(validator, {"additionalProperties": prune_additional_properties})
|
|
76
|
+
return jsonschema.validators.extend(validator, {"additionalProperties": prune_additional_properties})
|
|
73
77
|
|
|
74
78
|
|
|
75
79
|
def _get_decimal_info_from_anyof(schema: dict) -> tuple[bool, int | None]:
|
|
@@ -190,7 +194,7 @@ def validate(
|
|
|
190
194
|
|
|
191
195
|
try:
|
|
192
196
|
validator(schema).validate(final_object)
|
|
193
|
-
except ValidationError as exc:
|
|
197
|
+
except jsonschema.ValidationError as exc:
|
|
194
198
|
raise JSONSchemaValidationError(str(exc)) from exc
|
|
195
199
|
|
|
196
200
|
final_object = normalize_decimal_fields(final_object, schema)
|
|
@@ -5,13 +5,9 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
from abc import ABC, abstractmethod
|
|
7
7
|
|
|
8
|
-
from data_designer.engine.configurable_task import ConfigurableTask,
|
|
8
|
+
from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class Processor(ConfigurableTask[TaskConfigT], ABC):
|
|
12
|
-
@staticmethod
|
|
13
|
-
@abstractmethod
|
|
14
|
-
def metadata() -> ConfigurableTaskMetadata: ...
|
|
15
|
-
|
|
16
12
|
@abstractmethod
|
|
17
13
|
def process(self, data: DataT, *, current_batch_number: int | None = None) -> DataT: ...
|
|
@@ -1,26 +1,23 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
import
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
import logging
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
9
|
from data_designer.config.processors import DropColumnsProcessorConfig
|
|
9
|
-
from data_designer.engine.configurable_task import ConfigurableTaskMetadata
|
|
10
10
|
from data_designer.engine.dataset_builders.artifact_storage import BatchStage
|
|
11
11
|
from data_designer.engine.processing.processors.base import Processor
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
12
16
|
|
|
13
17
|
logger = logging.getLogger(__name__)
|
|
14
18
|
|
|
15
19
|
|
|
16
20
|
class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]):
|
|
17
|
-
@staticmethod
|
|
18
|
-
def metadata() -> ConfigurableTaskMetadata:
|
|
19
|
-
return ConfigurableTaskMetadata(
|
|
20
|
-
name="drop_columns_processor",
|
|
21
|
-
description="Drop columns from the input dataset.",
|
|
22
|
-
)
|
|
23
|
-
|
|
24
21
|
def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame:
|
|
25
22
|
logger.info(f"🙈 Dropping columns: {self.config.column_names}")
|
|
26
23
|
if current_batch_number is not None: # not in preview mode
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.base import ConfigBase
|
|
5
7
|
from data_designer.config.processors import (
|
|
6
8
|
DropColumnsProcessorConfig,
|
|
@@ -1,29 +1,26 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
|
-
|
|
7
|
-
import pandas as pd
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
8
9
|
|
|
9
10
|
from data_designer.config.processors import SchemaTransformProcessorConfig
|
|
10
|
-
from data_designer.engine.configurable_task import ConfigurableTaskMetadata
|
|
11
11
|
from data_designer.engine.dataset_builders.artifact_storage import BatchStage
|
|
12
12
|
from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering
|
|
13
13
|
from data_designer.engine.processing.processors.base import Processor
|
|
14
14
|
from data_designer.engine.processing.utils import deserialize_json_values
|
|
15
|
+
from data_designer.lazy_heavy_imports import pd
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import pandas as pd
|
|
15
19
|
|
|
16
20
|
logger = logging.getLogger(__name__)
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]):
|
|
20
|
-
@staticmethod
|
|
21
|
-
def metadata() -> ConfigurableTaskMetadata:
|
|
22
|
-
return ConfigurableTaskMetadata(
|
|
23
|
-
name="schema_transform_processor",
|
|
24
|
-
description="Generate dataset with transformed schema using a Jinja2 template.",
|
|
25
|
-
)
|
|
26
|
-
|
|
27
24
|
@property
|
|
28
25
|
def template_as_str(self) -> str:
|
|
29
26
|
return json.dumps(self.config.template)
|
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import ast
|
|
5
7
|
import json
|
|
6
8
|
import logging
|
|
7
9
|
import re
|
|
8
|
-
from typing import Any, TypeVar, overload
|
|
10
|
+
from typing import TYPE_CHECKING, Any, TypeVar, overload
|
|
11
|
+
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
9
13
|
|
|
10
|
-
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
11
16
|
|
|
12
17
|
logger = logging.getLogger(__name__)
|
|
13
18
|
|
|
@@ -52,7 +57,6 @@ def deserialize_json_values(data):
|
|
|
52
57
|
- Dictionary (potentially with nested JSON strings to deserialize)
|
|
53
58
|
- Some other object that can't be deserialized.
|
|
54
59
|
|
|
55
|
-
|
|
56
60
|
Returns:
|
|
57
61
|
Deserialized data in the corresponding format:
|
|
58
62
|
- Dictionary (when input is a single string)
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.engine.analysis.column_profilers.registry import (
|
|
5
7
|
ColumnProfilerRegistry,
|
|
6
8
|
create_default_column_profiler_registry,
|
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from
|
|
4
|
+
from __future__ import annotations
|
|
5
5
|
|
|
6
|
-
import
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
7
7
|
|
|
8
8
|
from data_designer.engine.resources.managed_dataset_repository import ManagedDatasetRepository
|
|
9
|
+
from data_designer.lazy_heavy_imports import pd
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
import pandas as pd
|
|
9
13
|
|
|
10
14
|
|
|
11
15
|
class ManagedDatasetGenerator:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
import tempfile
|
|
6
8
|
import threading
|
|
@@ -9,13 +11,15 @@ from abc import ABC, abstractmethod
|
|
|
9
11
|
from dataclasses import dataclass
|
|
10
12
|
from functools import cached_property
|
|
11
13
|
from pathlib import Path
|
|
12
|
-
from typing import Any
|
|
13
|
-
|
|
14
|
-
import duckdb
|
|
15
|
-
import pandas as pd
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
16
15
|
|
|
17
16
|
from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
|
|
18
17
|
from data_designer.engine.resources.managed_storage import LocalBlobStorageProvider, ManagedBlobStorage
|
|
18
|
+
from data_designer.lazy_heavy_imports import duckdb, pd
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
import duckdb
|
|
22
|
+
import pandas as pd
|
|
19
23
|
|
|
20
24
|
logger = logging.getLogger(__name__)
|
|
21
25
|
|
|
@@ -52,7 +56,6 @@ class Table:
|
|
|
52
56
|
|
|
53
57
|
DataCatalog = list[Table]
|
|
54
58
|
|
|
55
|
-
|
|
56
59
|
# For now we hardcode the remote data catalog in code. This make it easier
|
|
57
60
|
# initialize the data catalog. Eventually we can make this work more
|
|
58
61
|
# dynamically once this data catalog pattern becomes more widely adopted.
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
import logging
|
|
5
7
|
from abc import ABC, abstractmethod
|
|
6
8
|
from collections.abc import Iterator
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from data_designer.config.base import ConfigBase
|
|
5
7
|
from data_designer.config.dataset_metadata import DatasetMetadata
|
|
6
8
|
from data_designer.config.models import ModelConfig
|
|
@@ -9,7 +11,8 @@ from data_designer.config.seed_source import SeedSource
|
|
|
9
11
|
from data_designer.config.utils.type_helpers import StrEnum
|
|
10
12
|
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
|
|
11
13
|
from data_designer.engine.model_provider import ModelProviderRegistry
|
|
12
|
-
from data_designer.engine.models.
|
|
14
|
+
from data_designer.engine.models.factory import create_model_registry
|
|
15
|
+
from data_designer.engine.models.registry import ModelRegistry
|
|
13
16
|
from data_designer.engine.resources.managed_storage import ManagedBlobStorage, init_managed_blob_storage
|
|
14
17
|
from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderRegistry
|
|
15
18
|
from data_designer.engine.secret_resolver import SecretResolver
|
|
@@ -51,12 +54,16 @@ def create_resource_provider(
|
|
|
51
54
|
seed_dataset_source: SeedSource | None = None,
|
|
52
55
|
run_config: RunConfig | None = None,
|
|
53
56
|
) -> ResourceProvider:
|
|
57
|
+
"""Factory function for creating a ResourceProvider instance.
|
|
58
|
+
This function triggers lazy loading of heavy dependencies like litellm.
|
|
59
|
+
"""
|
|
54
60
|
seed_reader = None
|
|
55
61
|
if seed_dataset_source:
|
|
56
62
|
seed_reader = seed_reader_registry.get_reader(
|
|
57
63
|
seed_dataset_source,
|
|
58
64
|
secret_resolver,
|
|
59
65
|
)
|
|
66
|
+
|
|
60
67
|
return ResourceProvider(
|
|
61
68
|
artifact_storage=artifact_storage,
|
|
62
69
|
model_registry=create_model_registry(
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from abc import ABC, abstractmethod
|
|
5
7
|
from collections.abc import Sequence
|
|
6
|
-
from typing import Generic, TypeVar, get_args, get_origin
|
|
8
|
+
from typing import TYPE_CHECKING, Generic, TypeVar, get_args, get_origin
|
|
7
9
|
|
|
8
|
-
import duckdb
|
|
9
10
|
from huggingface_hub import HfFileSystem
|
|
10
11
|
from typing_extensions import Self
|
|
11
12
|
|
|
@@ -17,6 +18,10 @@ from data_designer.config.seed_source import (
|
|
|
17
18
|
)
|
|
18
19
|
from data_designer.engine.secret_resolver import SecretResolver
|
|
19
20
|
from data_designer.errors import DataDesignerError
|
|
21
|
+
from data_designer.lazy_heavy_imports import duckdb
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
import duckdb
|
|
20
25
|
|
|
21
26
|
|
|
22
27
|
class SeedReaderError(DataDesignerError): ...
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
4
6
|
from typing import Any
|
|
5
7
|
|
|
6
8
|
from pydantic import field_serializer, model_validator
|