data-designer 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/analysis/column_profilers.py +4 -4
- data_designer/config/analysis/column_statistics.py +5 -5
- data_designer/config/analysis/dataset_profiler.py +6 -6
- data_designer/config/analysis/utils/errors.py +1 -1
- data_designer/config/analysis/utils/reporting.py +5 -5
- data_designer/config/base.py +2 -2
- data_designer/config/column_configs.py +8 -8
- data_designer/config/column_types.py +9 -5
- data_designer/config/config_builder.py +32 -27
- data_designer/config/data_designer_config.py +7 -7
- data_designer/config/datastore.py +4 -4
- data_designer/config/default_model_settings.py +4 -4
- data_designer/config/errors.py +1 -1
- data_designer/config/exports.py +128 -0
- data_designer/config/interface.py +6 -6
- data_designer/config/models.py +109 -5
- data_designer/config/preview_results.py +3 -3
- data_designer/config/processors.py +2 -2
- data_designer/config/sampler_constraints.py +1 -1
- data_designer/config/sampler_params.py +2 -5
- data_designer/config/seed.py +3 -3
- data_designer/config/utils/constants.py +1 -1
- data_designer/config/utils/errors.py +1 -1
- data_designer/config/utils/info.py +8 -4
- data_designer/config/utils/io_helpers.py +5 -5
- data_designer/config/utils/misc.py +3 -3
- data_designer/config/utils/numerical_helpers.py +1 -1
- data_designer/config/utils/type_helpers.py +7 -3
- data_designer/config/utils/validation.py +5 -5
- data_designer/config/utils/visualization.py +10 -10
- data_designer/config/validator_params.py +2 -2
- data_designer/engine/analysis/column_profilers/base.py +1 -1
- data_designer/engine/analysis/dataset_profiler.py +1 -1
- data_designer/engine/analysis/utils/judge_score_processing.py +1 -1
- data_designer/engine/column_generators/generators/samplers.py +1 -1
- data_designer/engine/dataset_builders/artifact_storage.py +16 -2
- data_designer/engine/dataset_builders/column_wise_builder.py +3 -3
- data_designer/engine/dataset_builders/utils/concurrency.py +1 -1
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +1 -1
- data_designer/engine/errors.py +1 -1
- data_designer/engine/models/errors.py +1 -1
- data_designer/engine/models/facade.py +1 -1
- data_designer/engine/models/parsers/parser.py +2 -2
- data_designer/engine/models/recipes/response_recipes.py +1 -1
- data_designer/engine/processing/ginja/environment.py +1 -1
- data_designer/engine/processing/gsonschema/validators.py +1 -1
- data_designer/engine/resources/managed_dataset_repository.py +4 -4
- data_designer/engine/resources/managed_storage.py +1 -1
- data_designer/engine/sampling_gen/constraints.py +1 -1
- data_designer/engine/sampling_gen/data_sources/base.py +1 -1
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +31 -9
- data_designer/engine/sampling_gen/entities/email_address_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/national_id_utils.py +1 -1
- data_designer/engine/sampling_gen/entities/person.py +1 -1
- data_designer/engine/sampling_gen/entities/phone_number.py +1 -1
- data_designer/engine/sampling_gen/people_gen.py +3 -3
- data_designer/engine/secret_resolver.py +1 -1
- data_designer/engine/validators/python.py +2 -2
- data_designer/essentials/__init__.py +20 -128
- data_designer/interface/data_designer.py +16 -20
- data_designer/logging.py +2 -2
- data_designer/plugin_manager.py +14 -26
- data_designer/plugins/registry.py +1 -1
- {data_designer-0.1.2.dist-info → data_designer-0.1.4.dist-info}/METADATA +2 -2
- {data_designer-0.1.2.dist-info → data_designer-0.1.4.dist-info}/RECORD +69 -68
- {data_designer-0.1.2.dist-info → data_designer-0.1.4.dist-info}/WHEEL +1 -1
- {data_designer-0.1.2.dist-info → data_designer-0.1.4.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.2.dist-info → data_designer-0.1.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,12 +4,12 @@
|
|
|
4
4
|
from functools import reduce
|
|
5
5
|
from typing import Optional
|
|
6
6
|
|
|
7
|
+
import marko
|
|
7
8
|
from lxml import etree
|
|
8
9
|
from lxml.etree import _Element
|
|
9
|
-
import marko
|
|
10
10
|
|
|
11
|
-
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
12
11
|
import data_designer.engine.models.parsers.tag_parsers as tp
|
|
12
|
+
from data_designer.engine.models.parsers.postprocessors import merge_text_blocks
|
|
13
13
|
from data_designer.engine.models.parsers.types import (
|
|
14
14
|
LLMStructuredResponse,
|
|
15
15
|
PostProcessor,
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from collections.abc import Callable
|
|
5
4
|
import json
|
|
5
|
+
from collections.abc import Callable
|
|
6
6
|
|
|
7
7
|
from pydantic import BaseModel
|
|
8
8
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
import re
|
|
4
5
|
from collections.abc import Callable
|
|
5
6
|
from functools import partial, wraps
|
|
6
|
-
import re
|
|
7
7
|
from typing import Any
|
|
8
8
|
|
|
9
9
|
from jinja2 import meta
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from copy import deepcopy
|
|
5
4
|
import logging
|
|
5
|
+
from copy import deepcopy
|
|
6
6
|
from typing import Any, overload
|
|
7
7
|
|
|
8
8
|
from jsonschema import Draft202012Validator, ValidationError, validators
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from abc import ABC, abstractmethod
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from functools import cached_property
|
|
7
4
|
import logging
|
|
8
|
-
from pathlib import Path
|
|
9
5
|
import tempfile
|
|
10
6
|
import threading
|
|
11
7
|
import time
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from functools import cached_property
|
|
11
|
+
from pathlib import Path
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
14
|
import duckdb
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
import logging
|
|
4
5
|
from abc import ABC, abstractmethod
|
|
5
6
|
from collections.abc import Iterator
|
|
6
7
|
from contextlib import contextmanager
|
|
7
|
-
import logging
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import IO
|
|
10
10
|
|
|
@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
from typing import Type
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
from numpy.typing import NDArray
|
|
9
8
|
import pandas as pd
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
10
|
|
|
11
11
|
from data_designer.config.base import ConfigBase
|
|
12
12
|
from data_designer.config.sampler_constraints import (
|
|
@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
|
|
|
5
5
|
from typing import Any, Generic, Optional, Type, TypeVar, Union
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
|
-
from numpy.typing import NDArray
|
|
9
8
|
import pandas as pd
|
|
9
|
+
from numpy.typing import NDArray
|
|
10
10
|
from scipy import stats
|
|
11
11
|
|
|
12
12
|
from data_designer.config.sampler_params import SamplerParamsT
|
|
@@ -14,6 +14,7 @@ REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"}
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
PII_FIELDS = [
|
|
17
|
+
# Core demographic fields
|
|
17
18
|
"uuid",
|
|
18
19
|
"first_name",
|
|
19
20
|
"middle_name",
|
|
@@ -22,25 +23,38 @@ PII_FIELDS = [
|
|
|
22
23
|
"age",
|
|
23
24
|
"birth_date",
|
|
24
25
|
"marital_status",
|
|
25
|
-
"street_name",
|
|
26
|
-
"street_number",
|
|
27
|
-
"unit",
|
|
28
26
|
"postcode",
|
|
29
|
-
"region",
|
|
30
27
|
"city",
|
|
31
|
-
"
|
|
28
|
+
"region",
|
|
32
29
|
"country",
|
|
33
|
-
"
|
|
34
|
-
"zone",
|
|
30
|
+
"locale",
|
|
35
31
|
"bachelors_field",
|
|
36
|
-
"education_degree",
|
|
37
32
|
"education_level",
|
|
38
33
|
"occupation",
|
|
39
|
-
"
|
|
34
|
+
"national_id",
|
|
35
|
+
# US-specific fields
|
|
36
|
+
"street_name",
|
|
37
|
+
"street_number",
|
|
38
|
+
"unit",
|
|
39
|
+
"state",
|
|
40
|
+
"email_address",
|
|
41
|
+
"phone_number",
|
|
42
|
+
# Japan-specific fields
|
|
43
|
+
"area",
|
|
44
|
+
"prefecture",
|
|
45
|
+
"zone",
|
|
46
|
+
# India-specific fields
|
|
47
|
+
"district",
|
|
48
|
+
"religion",
|
|
49
|
+
"education_degree",
|
|
50
|
+
"first_language",
|
|
51
|
+
"second_language",
|
|
52
|
+
"third_language",
|
|
40
53
|
]
|
|
41
54
|
|
|
42
55
|
|
|
43
56
|
PERSONA_FIELDS = [
|
|
57
|
+
# Core persona fields
|
|
44
58
|
"persona",
|
|
45
59
|
"career_goals_and_ambitions",
|
|
46
60
|
"arts_persona",
|
|
@@ -61,4 +75,12 @@ PERSONA_FIELDS = [
|
|
|
61
75
|
"extraversion",
|
|
62
76
|
"agreeableness",
|
|
63
77
|
"neuroticism",
|
|
78
|
+
# Japan-specific persona fields
|
|
79
|
+
"aspects",
|
|
80
|
+
"digital_skills",
|
|
81
|
+
# India-specific persona fields
|
|
82
|
+
"linguistic_persona",
|
|
83
|
+
"religious_persona",
|
|
84
|
+
"linguistic_background",
|
|
85
|
+
"religious_background",
|
|
64
86
|
]
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from datetime import date, timedelta
|
|
5
4
|
import random
|
|
5
|
+
from datetime import date, timedelta
|
|
6
6
|
from typing import Any, Literal, TypeAlias
|
|
7
7
|
|
|
8
8
|
from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from pathlib import Path
|
|
5
4
|
import random
|
|
5
|
+
from pathlib import Path
|
|
6
6
|
from typing import Optional
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
@@ -3,15 +3,15 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
import random
|
|
7
|
+
import uuid
|
|
6
8
|
from abc import ABC, abstractmethod
|
|
7
9
|
from collections.abc import Callable
|
|
8
10
|
from copy import deepcopy
|
|
9
|
-
import random
|
|
10
11
|
from typing import TYPE_CHECKING, Any, Union
|
|
11
|
-
import uuid
|
|
12
12
|
|
|
13
|
-
from faker import Faker
|
|
14
13
|
import pandas as pd
|
|
14
|
+
from faker import Faker
|
|
15
15
|
|
|
16
16
|
from data_designer.config.utils.constants import AVAILABLE_LOCALES, DEFAULT_AGE_RANGE
|
|
17
17
|
from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from collections.abc import Sequence
|
|
5
4
|
import json
|
|
6
5
|
import logging
|
|
7
6
|
import os
|
|
7
|
+
from collections.abc import Sequence
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Protocol
|
|
10
10
|
|
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
4
|
import ast
|
|
5
|
-
from collections import defaultdict
|
|
6
5
|
import logging
|
|
7
|
-
from pathlib import Path
|
|
8
6
|
import re
|
|
9
7
|
import subprocess
|
|
10
8
|
import tempfile
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from pathlib import Path
|
|
11
11
|
from uuid import uuid4
|
|
12
12
|
|
|
13
13
|
import pandas as pd
|
|
@@ -1,137 +1,29 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
3
|
+
|
|
4
|
+
from data_designer.config.default_model_settings import resolve_seed_default_model_settings
|
|
5
|
+
from data_designer.config.exports import * # noqa: F403
|
|
6
|
+
from data_designer.config.validator_params import LocalCallableValidatorParams
|
|
7
|
+
from data_designer.interface.data_designer import DataDesigner
|
|
8
|
+
from data_designer.logging import LoggingConfig, configure_logging
|
|
4
9
|
|
|
5
10
|
configure_logging(LoggingConfig.default())
|
|
6
11
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
LLMCodeColumnConfig,
|
|
11
|
-
LLMJudgeColumnConfig,
|
|
12
|
-
LLMStructuredColumnConfig,
|
|
13
|
-
LLMTextColumnConfig,
|
|
14
|
-
SamplerColumnConfig,
|
|
15
|
-
Score,
|
|
16
|
-
SeedDatasetColumnConfig,
|
|
17
|
-
ValidationColumnConfig,
|
|
18
|
-
)
|
|
19
|
-
from ..config.column_types import DataDesignerColumnType
|
|
20
|
-
from ..config.config_builder import DataDesignerConfigBuilder
|
|
21
|
-
from ..config.data_designer_config import DataDesignerConfig
|
|
22
|
-
from ..config.dataset_builders import BuildStage
|
|
23
|
-
from ..config.datastore import DatastoreSettings
|
|
24
|
-
from ..config.models import (
|
|
25
|
-
ImageContext,
|
|
26
|
-
ImageFormat,
|
|
27
|
-
InferenceParameters,
|
|
28
|
-
ManualDistribution,
|
|
29
|
-
ManualDistributionParams,
|
|
30
|
-
Modality,
|
|
31
|
-
ModalityContext,
|
|
32
|
-
ModalityDataType,
|
|
33
|
-
ModelConfig,
|
|
34
|
-
UniformDistribution,
|
|
35
|
-
UniformDistributionParams,
|
|
36
|
-
)
|
|
37
|
-
from ..config.processors import DropColumnsProcessorConfig, ProcessorType
|
|
38
|
-
from ..config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint
|
|
39
|
-
from ..config.sampler_params import (
|
|
40
|
-
BernoulliMixtureSamplerParams,
|
|
41
|
-
BernoulliSamplerParams,
|
|
42
|
-
BinomialSamplerParams,
|
|
43
|
-
CategorySamplerParams,
|
|
44
|
-
DatetimeSamplerParams,
|
|
45
|
-
GaussianSamplerParams,
|
|
46
|
-
PersonFromFakerSamplerParams,
|
|
47
|
-
PersonSamplerParams,
|
|
48
|
-
PoissonSamplerParams,
|
|
49
|
-
SamplerType,
|
|
50
|
-
ScipySamplerParams,
|
|
51
|
-
SubcategorySamplerParams,
|
|
52
|
-
TimeDeltaSamplerParams,
|
|
53
|
-
UniformSamplerParams,
|
|
54
|
-
UUIDSamplerParams,
|
|
55
|
-
)
|
|
56
|
-
from ..config.seed import DatastoreSeedDatasetReference, IndexRange, PartitionBlock, SamplingStrategy, SeedConfig
|
|
57
|
-
from ..config.utils.code_lang import CodeLang
|
|
58
|
-
from ..config.utils.info import InfoType
|
|
59
|
-
from ..config.utils.misc import can_run_data_designer_locally
|
|
60
|
-
from ..config.validator_params import (
|
|
61
|
-
CodeValidatorParams,
|
|
62
|
-
RemoteValidatorParams,
|
|
63
|
-
ValidatorType,
|
|
64
|
-
)
|
|
12
|
+
# Resolve default model settings on import to ensure they are available when the library is used.
|
|
13
|
+
resolve_seed_default_model_settings()
|
|
14
|
+
|
|
65
15
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
16
|
+
def get_essentials_exports() -> list[str]:
|
|
17
|
+
logging = [
|
|
18
|
+
configure_logging.__name__,
|
|
19
|
+
LoggingConfig.__name__,
|
|
20
|
+
]
|
|
21
|
+
local = [
|
|
22
|
+
DataDesigner.__name__,
|
|
23
|
+
LocalCallableValidatorParams.__name__,
|
|
24
|
+
]
|
|
72
25
|
|
|
73
|
-
|
|
74
|
-
except ModuleNotFoundError:
|
|
75
|
-
pass
|
|
26
|
+
return logging + local + get_config_exports() # noqa: F405
|
|
76
27
|
|
|
77
|
-
__all__ = [
|
|
78
|
-
"BernoulliMixtureSamplerParams",
|
|
79
|
-
"BernoulliSamplerParams",
|
|
80
|
-
"BinomialSamplerParams",
|
|
81
|
-
"CategorySamplerParams",
|
|
82
|
-
"CodeLang",
|
|
83
|
-
"CodeValidatorParams",
|
|
84
|
-
"ColumnInequalityConstraint",
|
|
85
|
-
"configure_logging",
|
|
86
|
-
"DataDesignerColumnType",
|
|
87
|
-
"DataDesignerConfig",
|
|
88
|
-
"DataDesignerConfigBuilder",
|
|
89
|
-
"BuildStage",
|
|
90
|
-
"DatastoreSeedDatasetReference",
|
|
91
|
-
"DatastoreSettings",
|
|
92
|
-
"DatetimeSamplerParams",
|
|
93
|
-
"DropColumnsProcessorConfig",
|
|
94
|
-
"ExpressionColumnConfig",
|
|
95
|
-
"GaussianSamplerParams",
|
|
96
|
-
"IndexRange",
|
|
97
|
-
"InfoType",
|
|
98
|
-
"ImageContext",
|
|
99
|
-
"ImageFormat",
|
|
100
|
-
"InferenceParameters",
|
|
101
|
-
"JudgeScoreProfilerConfig",
|
|
102
|
-
"LLMCodeColumnConfig",
|
|
103
|
-
"LLMJudgeColumnConfig",
|
|
104
|
-
"LLMStructuredColumnConfig",
|
|
105
|
-
"LLMTextColumnConfig",
|
|
106
|
-
"LoggingConfig",
|
|
107
|
-
"ManualDistribution",
|
|
108
|
-
"ManualDistributionParams",
|
|
109
|
-
"Modality",
|
|
110
|
-
"ModalityContext",
|
|
111
|
-
"ModalityDataType",
|
|
112
|
-
"ModelConfig",
|
|
113
|
-
"PartitionBlock",
|
|
114
|
-
"PersonSamplerParams",
|
|
115
|
-
"PersonFromFakerSamplerParams",
|
|
116
|
-
"PoissonSamplerParams",
|
|
117
|
-
"ProcessorType",
|
|
118
|
-
"RemoteValidatorParams",
|
|
119
|
-
"SamplerColumnConfig",
|
|
120
|
-
"SamplerType",
|
|
121
|
-
"SamplingStrategy",
|
|
122
|
-
"ScalarInequalityConstraint",
|
|
123
|
-
"ScipySamplerParams",
|
|
124
|
-
"Score",
|
|
125
|
-
"SeedConfig",
|
|
126
|
-
"SeedDatasetColumnConfig",
|
|
127
|
-
"SubcategorySamplerParams",
|
|
128
|
-
"TimeDeltaSamplerParams",
|
|
129
|
-
"UniformDistribution",
|
|
130
|
-
"UniformDistributionParams",
|
|
131
|
-
"UniformSamplerParams",
|
|
132
|
-
"UUIDSamplerParams",
|
|
133
|
-
"ValidationColumnConfig",
|
|
134
|
-
"ValidatorType",
|
|
135
|
-
]
|
|
136
28
|
|
|
137
|
-
__all__
|
|
29
|
+
__all__ = get_essentials_exports()
|
|
@@ -13,7 +13,6 @@ from data_designer.config.default_model_settings import (
|
|
|
13
13
|
get_default_model_providers_missing_api_keys,
|
|
14
14
|
get_default_provider_name,
|
|
15
15
|
get_default_providers,
|
|
16
|
-
resolve_seed_default_model_settings,
|
|
17
16
|
)
|
|
18
17
|
from data_designer.config.interface import DataDesignerInterface
|
|
19
18
|
from data_designer.config.models import (
|
|
@@ -31,7 +30,6 @@ from data_designer.config.utils.constants import (
|
|
|
31
30
|
)
|
|
32
31
|
from data_designer.config.utils.info import InfoType, InterfaceInfo
|
|
33
32
|
from data_designer.config.utils.io_helpers import write_seed_dataset
|
|
34
|
-
from data_designer.config.utils.misc import can_run_data_designer_locally
|
|
35
33
|
from data_designer.engine.analysis.dataset_profiler import (
|
|
36
34
|
DataDesignerDatasetProfiler,
|
|
37
35
|
DatasetProfilerConfig,
|
|
@@ -66,11 +64,6 @@ DEFAULT_BUFFER_SIZE = 1000
|
|
|
66
64
|
logger = logging.getLogger(__name__)
|
|
67
65
|
|
|
68
66
|
|
|
69
|
-
# Resolve default model settings on import to ensure they are available when the library is used.
|
|
70
|
-
if can_run_data_designer_locally():
|
|
71
|
-
resolve_seed_default_model_settings()
|
|
72
|
-
|
|
73
|
-
|
|
74
67
|
class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
75
68
|
"""Main interface for creating datasets with Data Designer.
|
|
76
69
|
|
|
@@ -173,7 +166,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
173
166
|
configuration (columns, constraints, seed data, etc.).
|
|
174
167
|
num_records: Number of records to generate.
|
|
175
168
|
dataset_name: Name of the dataset. This name will be used as the dataset
|
|
176
|
-
folder name in the artifact path directory.
|
|
169
|
+
folder name in the artifact path directory. If a non-empty directory with the
|
|
170
|
+
same name already exists, dataset will be saved to a new directory with
|
|
171
|
+
a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory
|
|
172
|
+
with the same name already exists, the dataset will be saved to a new directory
|
|
173
|
+
with the name "awesome_dataset_2025-01-01_12-00-00".
|
|
177
174
|
|
|
178
175
|
Returns:
|
|
179
176
|
DatasetCreationResults object with methods for loading the generated dataset,
|
|
@@ -311,18 +308,17 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
311
308
|
|
|
312
309
|
def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
|
|
313
310
|
if model_providers is None:
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
return model_providers
|
|
311
|
+
model_providers = get_default_providers()
|
|
312
|
+
missing_api_keys = get_default_model_providers_missing_api_keys()
|
|
313
|
+
if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
|
|
314
|
+
logger.warning(
|
|
315
|
+
"🚨 You are trying to use a default model provider but your API keys are missing."
|
|
316
|
+
"\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
|
|
317
|
+
"\n\t\t\tAlternatively, you can provide your own model providers during Data Designer object initialization."
|
|
318
|
+
"\n\t\t\tSee https://nvidia-nemo.github.io/DataDesigner/concepts/models/model-providers/ for more information."
|
|
319
|
+
)
|
|
320
|
+
self._get_interface_info(model_providers).display(InfoType.MODEL_PROVIDERS)
|
|
321
|
+
return model_providers
|
|
326
322
|
return model_providers or []
|
|
327
323
|
|
|
328
324
|
def _create_dataset_builder(
|
data_designer/logging.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from dataclasses import dataclass, field
|
|
5
4
|
import logging
|
|
6
|
-
from pathlib import Path
|
|
7
5
|
import random
|
|
8
6
|
import sys
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
9
|
from typing import TextIO, Union
|
|
10
10
|
|
|
11
11
|
from pythonjsonlogger import jsonlogger
|
data_designer/plugin_manager.py
CHANGED
|
@@ -6,25 +6,16 @@ from __future__ import annotations
|
|
|
6
6
|
from enum import Enum
|
|
7
7
|
from typing import TYPE_CHECKING, Type, TypeAlias
|
|
8
8
|
|
|
9
|
-
from .
|
|
9
|
+
from data_designer.plugins.plugin import PluginType
|
|
10
|
+
from data_designer.plugins.registry import PluginRegistry
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
13
|
from data_designer.plugins.plugin import Plugin
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
if can_run_data_designer_locally():
|
|
16
|
-
from data_designer.plugins.plugin import PluginType
|
|
17
|
-
from data_designer.plugins.registry import PluginRegistry
|
|
18
|
-
|
|
19
|
-
|
|
20
16
|
class PluginManager:
|
|
21
17
|
def __init__(self):
|
|
22
|
-
|
|
23
|
-
self._plugins_supported = True
|
|
24
|
-
self._plugin_registry = PluginRegistry()
|
|
25
|
-
else:
|
|
26
|
-
self._plugins_supported = False
|
|
27
|
-
self._plugin_registry = None
|
|
18
|
+
self._plugin_registry = PluginRegistry()
|
|
28
19
|
|
|
29
20
|
def get_column_generator_plugins(self) -> list[Plugin]:
|
|
30
21
|
"""Get all column generator plugins.
|
|
@@ -32,7 +23,7 @@ class PluginManager:
|
|
|
32
23
|
Returns:
|
|
33
24
|
A list of all column generator plugins.
|
|
34
25
|
"""
|
|
35
|
-
return self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR)
|
|
26
|
+
return self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR)
|
|
36
27
|
|
|
37
28
|
def get_column_generator_plugin_if_exists(self, plugin_name: str) -> Plugin | None:
|
|
38
29
|
"""Get a column generator plugin by name if it exists.
|
|
@@ -43,9 +34,8 @@ class PluginManager:
|
|
|
43
34
|
Returns:
|
|
44
35
|
The plugin if found, otherwise None.
|
|
45
36
|
"""
|
|
46
|
-
if self.
|
|
37
|
+
if self._plugin_registry.plugin_exists(plugin_name):
|
|
47
38
|
return self._plugin_registry.get_plugin(plugin_name)
|
|
48
|
-
return None
|
|
49
39
|
|
|
50
40
|
def get_plugin_column_types(self, enum_type: Type[Enum], required_resources: list[str] | None = None) -> list[Enum]:
|
|
51
41
|
"""Get a list of plugin column types.
|
|
@@ -58,13 +48,12 @@ class PluginManager:
|
|
|
58
48
|
A list of plugin column types.
|
|
59
49
|
"""
|
|
60
50
|
type_list = []
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
type_list.append(enum_type(plugin.name))
|
|
51
|
+
for plugin in self._plugin_registry.get_plugins(PluginType.COLUMN_GENERATOR):
|
|
52
|
+
if required_resources:
|
|
53
|
+
task_required_resources = plugin.task_cls.metadata().required_resources or []
|
|
54
|
+
if not all(resource in task_required_resources for resource in required_resources):
|
|
55
|
+
continue
|
|
56
|
+
type_list.append(enum_type(plugin.name))
|
|
68
57
|
return type_list
|
|
69
58
|
|
|
70
59
|
def inject_into_column_config_type_union(self, column_config_type: Type[TypeAlias]) -> Type[TypeAlias]:
|
|
@@ -76,8 +65,7 @@ class PluginManager:
|
|
|
76
65
|
Returns:
|
|
77
66
|
The column config type with plugins injected.
|
|
78
67
|
"""
|
|
79
|
-
|
|
80
|
-
column_config_type
|
|
81
|
-
|
|
82
|
-
)
|
|
68
|
+
column_config_type = self._plugin_registry.add_plugin_types_to_union(
|
|
69
|
+
column_config_type, PluginType.COLUMN_GENERATOR
|
|
70
|
+
)
|
|
83
71
|
return column_config_type
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
-
from importlib.metadata import entry_points
|
|
5
4
|
import logging
|
|
6
5
|
import os
|
|
7
6
|
import threading
|
|
7
|
+
from importlib.metadata import entry_points
|
|
8
8
|
from typing import Type, TypeAlias
|
|
9
9
|
|
|
10
10
|
from typing_extensions import Self
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: General framework for synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
@@ -148,7 +148,7 @@ preview.display_sample_record()
|
|
|
148
148
|
- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
|
|
149
149
|
- **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
|
|
150
150
|
- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
|
|
151
|
-
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
|
|
151
|
+
- **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/concepts/models/model-configs/)** – Configure custom models and providers
|
|
152
152
|
- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
|
|
153
153
|
|
|
154
154
|
### 🔧 Configure models via CLI
|