data-designer 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/analysis/column_profilers.py +4 -4
- data_designer/config/analysis/column_statistics.py +5 -5
- data_designer/config/analysis/dataset_profiler.py +6 -6
- data_designer/config/analysis/utils/errors.py +1 -1
- data_designer/config/analysis/utils/reporting.py +5 -5
- data_designer/config/base.py +2 -2
- data_designer/config/column_configs.py +8 -8
- data_designer/config/column_types.py +9 -5
- data_designer/config/config_builder.py +32 -27
- data_designer/config/data_designer_config.py +7 -7
- data_designer/config/datastore.py +4 -4
- data_designer/config/default_model_settings.py +4 -4
- data_designer/config/errors.py +1 -1
- data_designer/config/exports.py +133 -0
- data_designer/config/interface.py +6 -6
- data_designer/config/models.py +109 -5
- data_designer/config/preview_results.py +9 -6
- data_designer/config/processors.py +48 -4
- data_designer/config/sampler_constraints.py +1 -1
- data_designer/config/sampler_params.py +2 -2
- data_designer/config/seed.py +3 -3
- data_designer/config/utils/constants.py +1 -1
- data_designer/config/utils/errors.py +1 -1
- data_designer/config/utils/info.py +8 -4
- data_designer/config/utils/io_helpers.py +5 -5
- data_designer/config/utils/misc.py +3 -3
- data_designer/config/utils/numerical_helpers.py +1 -1
- data_designer/config/utils/type_helpers.py +7 -3
- data_designer/config/utils/validation.py +37 -6
- data_designer/config/utils/visualization.py +42 -10
- data_designer/config/validator_params.py +2 -2
- data_designer/engine/analysis/column_profilers/base.py +1 -1
- data_designer/engine/analysis/dataset_profiler.py +1 -1
- data_designer/engine/analysis/utils/judge_score_processing.py +1 -1
- data_designer/engine/column_generators/generators/samplers.py +1 -1
- data_designer/engine/dataset_builders/artifact_storage.py +16 -6
- data_designer/engine/dataset_builders/column_wise_builder.py +4 -1
- data_designer/engine/dataset_builders/utils/concurrency.py +1 -1
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +1 -1
- data_designer/engine/errors.py +1 -1
- data_designer/engine/models/errors.py +1 -1
- data_designer/engine/models/facade.py +1 -1
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/recipes/response_recipes.py +1 -1
- data_designer/engine/processing/ginja/environment.py +1 -1
- data_designer/engine/processing/gsonschema/validators.py +1 -1
- data_designer/engine/processing/processors/drop_columns.py +1 -1
- data_designer/engine/processing/processors/registry.py +3 -0
- data_designer/engine/processing/processors/schema_transform.py +53 -0
- data_designer/engine/resources/managed_dataset_repository.py +4 -4
- data_designer/engine/resources/managed_storage.py +1 -1
- data_designer/engine/sampling_gen/constraints.py +1 -1
- data_designer/engine/sampling_gen/data_sources/base.py +1 -1
- data_designer/engine/sampling_gen/entities/email_address_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/national_id_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/person.py +1 -1
- data_designer/engine/sampling_gen/entities/phone_number.py +1 -1
- data_designer/engine/sampling_gen/people_gen.py +3 -3
- data_designer/engine/secret_resolver.py +1 -1
- data_designer/engine/validators/python.py +2 -2
- data_designer/essentials/__init__.py +20 -128
- data_designer/interface/data_designer.py +23 -19
- data_designer/interface/results.py +36 -0
- data_designer/logging.py +2 -2
- data_designer/plugin_manager.py +14 -26
- data_designer/plugins/registry.py +1 -1
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/METADATA +9 -9
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/RECORD +72 -70
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/WHEEL +0 -0
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.3.dist-info → data_designer-0.1.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,137 +1,29 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
3
|
+
|
|
4
|
+
from data_designer.config.default_model_settings import resolve_seed_default_model_settings
|
|
5
|
+
from data_designer.config.exports import * # noqa: F403
|
|
6
|
+
from data_designer.config.validator_params import LocalCallableValidatorParams
|
|
7
|
+
from data_designer.interface.data_designer import DataDesigner
|
|
8
|
+
from data_designer.logging import LoggingConfig, configure_logging
|
|
4
9
|
|
|
5
10
|
configure_logging(LoggingConfig.default())
|
|
6
11
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
LLMCodeColumnConfig,
|
|
11
|
-
LLMJudgeColumnConfig,
|
|
12
|
-
LLMStructuredColumnConfig,
|
|
13
|
-
LLMTextColumnConfig,
|
|
14
|
-
SamplerColumnConfig,
|
|
15
|
-
Score,
|
|
16
|
-
SeedDatasetColumnConfig,
|
|
17
|
-
ValidationColumnConfig,
|
|
18
|
-
)
|
|
19
|
-
from ..config.column_types import DataDesignerColumnType
|
|
20
|
-
from ..config.config_builder import DataDesignerConfigBuilder
|
|
21
|
-
from ..config.data_designer_config import DataDesignerConfig
|
|
22
|
-
from ..config.dataset_builders import BuildStage
|
|
23
|
-
from ..config.datastore import DatastoreSettings
|
|
24
|
-
from ..config.models import (
|
|
25
|
-
ImageContext,
|
|
26
|
-
ImageFormat,
|
|
27
|
-
InferenceParameters,
|
|
28
|
-
ManualDistribution,
|
|
29
|
-
ManualDistributionParams,
|
|
30
|
-
Modality,
|
|
31
|
-
ModalityContext,
|
|
32
|
-
ModalityDataType,
|
|
33
|
-
ModelConfig,
|
|
34
|
-
UniformDistribution,
|
|
35
|
-
UniformDistributionParams,
|
|
36
|
-
)
|
|
37
|
-
from ..config.processors import DropColumnsProcessorConfig, ProcessorType
|
|
38
|
-
from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
|
|
39
|
-
from ..config.sampler_params import (
|
|
40
|
-
BernoulliMixtureSamplerParams,
|
|
41
|
-
BernoulliSamplerParams,
|
|
42
|
-
BinomialSamplerParams,
|
|
43
|
-
CategorySamplerParams,
|
|
44
|
-
DatetimeSamplerParams,
|
|
45
|
-
GaussianSamplerParams,
|
|
46
|
-
PersonFromFakerSamplerParams,
|
|
47
|
-
PersonSamplerParams,
|
|
48
|
-
PoissonSamplerParams,
|
|
49
|
-
SamplerType,
|
|
50
|
-
ScipySamplerParams,
|
|
51
|
-
SubcategorySamplerParams,
|
|
52
|
-
TimeDeltaSamplerParams,
|
|
53
|
-
UniformSamplerParams,
|
|
54
|
-
UUIDSamplerParams,
|
|
55
|
-
)
|
|
56
|
-
from ..config.seed import DatastoreSeedDatasetReference, IndexRange, PartitionBlock, SamplingStrategy, SeedConfig
|
|
57
|
-
from ..config.utils.code_lang import CodeLang
|
|
58
|
-
from ..config.utils.info import InfoType
|
|
59
|
-
from ..config.utils.misc import can_run_data_designer_locally
|
|
60
|
-
from ..config.validator_params import (
|
|
61
|
-
CodeValidatorParams,
|
|
62
|
-
RemoteValidatorParams,
|
|
63
|
-
ValidatorType,
|
|
64
|
-
)
|
|
12
|
+
# Resolve default model settings on import to ensure they are available when the library is used.
|
|
13
|
+
resolve_seed_default_model_settings()
|
|
14
|
+
|
|
65
15
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
16
|
+
def get_essentials_exports() -> list[str]:
|
|
17
|
+
logging = [
|
|
18
|
+
configure_logging.__name__,
|
|
19
|
+
LoggingConfig.__name__,
|
|
20
|
+
]
|
|
21
|
+
local = [
|
|
22
|
+
DataDesigner.__name__,
|
|
23
|
+
LocalCallableValidatorParams.__name__,
|
|
24
|
+
]
|
|
72
25
|
|
|
73
|
-
|
|
74
|
-
except ModuleNotFoundError:
|
|
75
|
-
pass
|
|
26
|
+
return logging + local + get_config_exports() # noqa: F405
|
|
76
27
|
|
|
77
|
-
__all__ = [
|
|
78
|
-
"BernoulliMixtureSamplerParams",
|
|
79
|
-
"BernoulliSamplerParams",
|
|
80
|
-
"BinomialSamplerParams",
|
|
81
|
-
"CategorySamplerParams",
|
|
82
|
-
"CodeLang",
|
|
83
|
-
"CodeValidatorParams",
|
|
84
|
-
"ColumnInequalityConstraint",
|
|
85
|
-
"configure_logging",
|
|
86
|
-
"DataDesignerColumnType",
|
|
87
|
-
"DataDesignerConfig",
|
|
88
|
-
"DataDesignerConfigBuilder",
|
|
89
|
-
"BuildStage",
|
|
90
|
-
"DatastoreSeedDatasetReference",
|
|
91
|
-
"DatastoreSettings",
|
|
92
|
-
"DatetimeSamplerParams",
|
|
93
|
-
"DropColumnsProcessorConfig",
|
|
94
|
-
"ExpressionColumnConfig",
|
|
95
|
-
"GaussianSamplerParams",
|
|
96
|
-
"IndexRange",
|
|
97
|
-
"InfoType",
|
|
98
|
-
"ImageContext",
|
|
99
|
-
"ImageFormat",
|
|
100
|
-
"InferenceParameters",
|
|
101
|
-
"JudgeScoreProfilerConfig",
|
|
102
|
-
"LLMCodeColumnConfig",
|
|
103
|
-
"LLMJudgeColumnConfig",
|
|
104
|
-
"LLMStructuredColumnConfig",
|
|
105
|
-
"LLMTextColumnConfig",
|
|
106
|
-
"LoggingConfig",
|
|
107
|
-
"ManualDistribution",
|
|
108
|
-
"ManualDistributionParams",
|
|
109
|
-
"Modality",
|
|
110
|
-
"ModalityContext",
|
|
111
|
-
"ModalityDataType",
|
|
112
|
-
"ModelConfig",
|
|
113
|
-
"PartitionBlock",
|
|
114
|
-
"PersonSamplerParams",
|
|
115
|
-
"PersonFromFakerSamplerParams",
|
|
116
|
-
"PoissonSamplerParams",
|
|
117
|
-
"ProcessorType",
|
|
118
|
-
"RemoteValidatorParams",
|
|
119
|
-
"SamplerColumnConfig",
|
|
120
|
-
"SamplerType",
|
|
121
|
-
"SamplingStrategy",
|
|
122
|
-
"ScalarInequalityConstraint",
|
|
123
|
-
"ScipySamplerParams",
|
|
124
|
-
"Score",
|
|
125
|
-
"SeedConfig",
|
|
126
|
-
"SeedDatasetColumnConfig",
|
|
127
|
-
"SubcategorySamplerParams",
|
|
128
|
-
"TimeDeltaSamplerParams",
|
|
129
|
-
"UniformDistribution",
|
|
130
|
-
"UniformDistributionParams",
|
|
131
|
-
"UniformSamplerParams",
|
|
132
|
-
"UUIDSamplerParams",
|
|
133
|
-
"ValidationColumnConfig",
|
|
134
|
-
"ValidatorType",
|
|
135
|
-
]
|
|
136
28
|
|
|
137
|
-
__all__
|
|
29
|
+
__all__ = get_essentials_exports()
|
|
@@ -13,7 +13,6 @@ from data_designer.config.default_model_settings import (
|
|
|
13
13
|
get_default_model_providers_missing_api_keys,
|
|
14
14
|
get_default_provider_name,
|
|
15
15
|
get_default_providers,
|
|
16
|
-
resolve_seed_default_model_settings,
|
|
17
16
|
)
|
|
18
17
|
from data_designer.config.interface import DataDesignerInterface
|
|
19
18
|
from data_designer.config.models import (
|
|
@@ -31,7 +30,6 @@ from data_designer.config.utils.constants import (
|
|
|
31
30
|
)
|
|
32
31
|
from data_designer.config.utils.info import InfoType, InterfaceInfo
|
|
33
32
|
from data_designer.config.utils.io_helpers import write_seed_dataset
|
|
34
|
-
from data_designer.config.utils.misc import can_run_data_designer_locally
|
|
35
33
|
from data_designer.engine.analysis.dataset_profiler import (
|
|
36
34
|
DataDesignerDatasetProfiler,
|
|
37
35
|
DatasetProfilerConfig,
|
|
@@ -66,11 +64,6 @@ DEFAULT_BUFFER_SIZE = 1000
|
|
|
66
64
|
logger = logging.getLogger(__name__)
|
|
67
65
|
|
|
68
66
|
|
|
69
|
-
# Resolve default model settings on import to ensure they are available when the library is used.
|
|
70
|
-
if can_run_data_designer_locally():
|
|
71
|
-
resolve_seed_default_model_settings()
|
|
72
|
-
|
|
73
|
-
|
|
74
67
|
class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
75
68
|
"""Main interface for creating datasets with Data Designer.
|
|
76
69
|
|
|
@@ -256,6 +249,17 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
256
249
|
except Exception as e:
|
|
257
250
|
raise DataDesignerProfilingError(f"🛑 Error profiling preview dataset: {e}")
|
|
258
251
|
|
|
252
|
+
if builder.artifact_storage.processors_outputs_path.exists():
|
|
253
|
+
processor_artifacts = {
|
|
254
|
+
processor_config.name: pd.read_parquet(
|
|
255
|
+
builder.artifact_storage.processors_outputs_path / f"{processor_config.name}.parquet",
|
|
256
|
+
dtype_backend="pyarrow",
|
|
257
|
+
).to_dict(orient="records")
|
|
258
|
+
for processor_config in config_builder.get_processor_configs()
|
|
259
|
+
}
|
|
260
|
+
else:
|
|
261
|
+
processor_artifacts = {}
|
|
262
|
+
|
|
259
263
|
if (
|
|
260
264
|
len(processed_dataset) > 0
|
|
261
265
|
and isinstance(analysis, DatasetProfilerResults)
|
|
@@ -266,6 +270,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
266
270
|
return PreviewResults(
|
|
267
271
|
dataset=processed_dataset,
|
|
268
272
|
analysis=analysis,
|
|
273
|
+
processor_artifacts=processor_artifacts,
|
|
269
274
|
config_builder=config_builder,
|
|
270
275
|
)
|
|
271
276
|
|
|
@@ -315,18 +320,17 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
315
320
|
|
|
316
321
|
def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
|
|
317
322
|
if model_providers is None:
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
return model_providers
|
|
323
|
+
model_providers = get_default_providers()
|
|
324
|
+
missing_api_keys = get_default_model_providers_missing_api_keys()
|
|
325
|
+
if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
|
|
326
|
+
logger.warning(
|
|
327
|
+
"🚨 You are trying to use a default model provider but your API keys are missing."
|
|
328
|
+
"\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
|
|
329
|
+
"\n\t\t\tAlternatively, you can provide your own model providers during Data Designer object initialization."
|
|
330
|
+
"\n\t\t\tSee https://nvidia-nemo.github.io/DataDesigner/concepts/models/model-providers/ for more information."
|
|
331
|
+
)
|
|
332
|
+
self._get_interface_info(model_providers).display(InfoType.MODEL_PROVIDERS)
|
|
333
|
+
return model_providers
|
|
330
334
|
return model_providers or []
|
|
331
335
|
|
|
332
336
|
def _create_dataset_builder(
|
|
@@ -3,12 +3,15 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
6
8
|
import pandas as pd
|
|
7
9
|
|
|
8
10
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
9
11
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
10
12
|
from data_designer.config.utils.visualization import WithRecordSamplerMixin
|
|
11
13
|
from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
|
|
14
|
+
from data_designer.engine.dataset_builders.errors import ArtifactStorageError
|
|
12
15
|
|
|
13
16
|
|
|
14
17
|
class DatasetCreationResults(WithRecordSamplerMixin):
|
|
@@ -53,3 +56,36 @@ class DatasetCreationResults(WithRecordSamplerMixin):
|
|
|
53
56
|
A pandas DataFrame containing the full generated dataset.
|
|
54
57
|
"""
|
|
55
58
|
return self.artifact_storage.load_dataset()
|
|
59
|
+
|
|
60
|
+
def load_processor_dataset(self, processor_name: str) -> pd.DataFrame:
|
|
61
|
+
"""Load the dataset generated by a processor.
|
|
62
|
+
|
|
63
|
+
This only works for processors that write their artifacts in Parquet format.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
processor_name: The name of the processor to load the dataset from.
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
A pandas DataFrame containing the dataset generated by the processor.
|
|
70
|
+
"""
|
|
71
|
+
try:
|
|
72
|
+
dataset = self.artifact_storage.read_parquet_files(
|
|
73
|
+
self.artifact_storage.processors_outputs_path / processor_name
|
|
74
|
+
)
|
|
75
|
+
except Exception as e:
|
|
76
|
+
raise ArtifactStorageError(f"Failed to load dataset for processor {processor_name}: {e}")
|
|
77
|
+
|
|
78
|
+
return dataset
|
|
79
|
+
|
|
80
|
+
def get_path_to_processor_artifacts(self, processor_name: str) -> Path:
|
|
81
|
+
"""Get the path to the artifacts generated by a processor.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
processor_name: The name of the processor to load the artifact from.
|
|
85
|
+
|
|
86
|
+
Returns:
|
|
87
|
+
The path to the artifacts.
|
|
88
|
+
"""
|
|
89
|
+
if not self.artifact_storage.processors_outputs_path.exists():
|
|
90
|
+
raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.")
|
|
91
|
+
return self.artifact_storage.processors_outputs_path / processor_name
|
data_designer/logging.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from dataclasses import dataclass, field
|
|
5
4
|
import logging
|
|
6
|
-
from pathlib import Path
|
|
7
5
|
import random
|
|
8
6
|
import sys
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
9
|
from typing import TextIO, Union
|
|
10
10
|
|
|
11
11
|
from pythonjsonlogger import jsonlogger
|
data_designer/plugin_manager.py
CHANGED
|
@@ -6,25 +6,16 @@ from __future__ import annotations
|
|
|
6
6
|
from enum import Enum
|
|
7
7
|
from typing import TYPE_CHECKING, Type, TypeAlias
|
|
8
8
|
|
|
9
|
-
from .config.utils.misc import can_run_data_designer_locally
|
|
9
|
+
from data_designer.plugins.plugin import PluginType
|
|
10
|
+
from data_designer.plugins.registry import PluginRegistry
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
from data_designer.plugins.plugin import Plugin
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
if can_run_data_designer_locally():
|
|
16
|
-
from data_designer.plugins.plugin import PluginType
|
|
17
|
-
from data_designer.plugins.registry import PluginRegistry
|
|
18
|
-
|
|
19
|
-
|
|
20
16
|
class PluginManager:
|
|
21
17
|
def __init__(self):
|
|
22
|
-
|
|
23
|
-
self._plugins_supported = True
|
|
24
|
-
self._plugin_registry = PluginRegistry()
|
|
25
|
-
else:
|
|
26
|
-
self._plugins_supported = False
|
|
27
|
-
self._plugin_registry = None
|
|
18
|
+
self._plugin_registry = PluginRegistry()
|
|
28
19
|
|
|
29
20
|
def get_column_generator_plugins(self) -> list[Plugin]:
|
|
30
21
|
"""Get all column generator plugins.
|
|
@@ -32,7 +23,7 @@ class PluginManager:
|
|
|
32
23
|
Returns:
|
|
33
24
|
A list of all column generator plugins.
|
|
34
25
|
"""
|
|
35
|
-
return self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR)
|
|
26
|
+
return self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR)
|
|
36
27
|
|
|
37
28
|
def get_column_generator_plugin_if_exists(self, plugin_name: str) -> Plugin | None:
|
|
38
29
|
"""Get a column generator plugin by name if it exists.
|
|
@@ -43,9 +34,8 @@ class PluginManager:
|
|
|
43
34
|
Returns:
|
|
44
35
|
The plugin if found, otherwise None.
|
|
45
36
|
"""
|
|
46
|
-
if self._plugins_supported and self._plugin_registry.plugin_exists(plugin_name):
|
|
37
|
+
if self._plugin_registry.plugin_exists(plugin_name):
|
|
47
38
|
return self._plugin_registry.get_plugin(plugin_name)
|
|
48
|
-
return None
|
|
49
39
|
|
|
50
40
|
def get_plugin_column_types(self, enum_type: Type[Enum], required_resources: list[str] | None = None) -> list[Enum]:
|
|
51
41
|
"""Get a list of plugin column types.
|
|
@@ -58,13 +48,12 @@ class PluginManager:
|
|
|
58
48
|
A list of plugin column types.
|
|
59
49
|
"""
|
|
60
50
|
type_list = []
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
type_list.append(enum_type(plugin.name))
|
|
51
|
+
for plugin in self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR):
|
|
52
|
+
if required_resources:
|
|
53
|
+
task_required_resources = plugin.task_cls.metadata().required_resources or []
|
|
54
|
+
if not all(resource in task_required_resources for resource in required_resources):
|
|
55
|
+
continue
|
|
56
|
+
type_list.append(enum_type(plugin.name))
|
|
68
57
|
return type_list
|
|
69
58
|
|
|
70
59
|
def inject_into_column_config_type_union(self, column_config_type: Type[TypeAlias]) -> Type[TypeAlias]:
|
|
@@ -76,8 +65,7 @@ class PluginManager:
|
|
|
76
65
|
Returns:
|
|
77
66
|
The column config type with plugins injected.
|
|
78
67
|
"""
|
|
79
|
-
|
|
80
|
-
column_config_type
|
|
81
|
-
|
|
82
|
-
)
|
|
68
|
+
column_config_type = self._plugin_registry.add_plugin_types_to_union(
|
|
69
|
+
column_config_type, PluginType.COLUMN_GENERATOR
|
|
70
|
+
)
|
|
83
71
|
return column_config_type
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from importlib.metadata import entry_points
|
|
5
4
|
import logging
|
|
6
5
|
import os
|
|
7
6
|
import threading
|
|
7
|
+
from importlib.metadata import entry_points
|
|
8
8
|
from typing import Type, TypeAlias
|
|
9
9
|
|
|
10
10
|
from typing_extensions import Self
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: General framework for synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
@@ -144,12 +144,12 @@ preview.display_sample_record()
|
|
|
144
144
|
|
|
145
145
|
### 📚 Learn more
|
|
146
146
|
|
|
147
|
-
- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
|
|
148
|
-
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
|
|
149
|
-
- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
|
|
150
|
-
- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
|
|
151
|
-
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
|
|
152
|
-
- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
|
|
147
|
+
- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/latest/quick-start/)** – Detailed walkthrough with more examples
|
|
148
|
+
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/)** – Step-by-step interactive tutorials
|
|
149
|
+
- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
|
|
150
|
+
- **[Validators](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
|
|
151
|
+
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/model-configs/)** – Configure custom models and providers
|
|
152
|
+
- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
|
|
153
153
|
|
|
154
154
|
### 🔧 Configure models via CLI
|
|
155
155
|
|
|
@@ -161,7 +161,7 @@ data-designer config list # View current settings
|
|
|
161
161
|
|
|
162
162
|
### 🤝 Get involved
|
|
163
163
|
|
|
164
|
-
- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING)** – Help improve Data Designer
|
|
164
|
+
- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/latest/CONTRIBUTING)** – Help improve Data Designer
|
|
165
165
|
- **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or make a feature request
|
|
166
166
|
|
|
167
167
|
---
|
|
@@ -178,7 +178,7 @@ If you use NeMo Data Designer in your research, please cite it using the followi
|
|
|
178
178
|
|
|
179
179
|
```bibtex
|
|
180
180
|
@misc{nemo-data-designer,
|
|
181
|
-
author = {The NeMo Data Designer Team},
|
|
181
|
+
author = {The NeMo Data Designer Team, NVIDIA},
|
|
182
182
|
title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data},
|
|
183
183
|
howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}},
|
|
184
184
|
year = {2025},
|