data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/cli/commands/__init__.py +1 -1
- data_designer/interface/__init__.py +21 -1
- data_designer/{_version.py → interface/_version.py} +2 -2
- data_designer/interface/data_designer.py +8 -11
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
- data_designer-0.4.0.dist-info/RECORD +39 -0
- data_designer/__init__.py +0 -17
- data_designer/config/__init__.py +0 -2
- data_designer/config/analysis/__init__.py +0 -2
- data_designer/config/analysis/column_profilers.py +0 -159
- data_designer/config/analysis/column_statistics.py +0 -421
- data_designer/config/analysis/dataset_profiler.py +0 -84
- data_designer/config/analysis/utils/errors.py +0 -10
- data_designer/config/analysis/utils/reporting.py +0 -192
- data_designer/config/base.py +0 -69
- data_designer/config/column_configs.py +0 -470
- data_designer/config/column_types.py +0 -141
- data_designer/config/config_builder.py +0 -595
- data_designer/config/data_designer_config.py +0 -40
- data_designer/config/dataset_builders.py +0 -13
- data_designer/config/dataset_metadata.py +0 -18
- data_designer/config/default_model_settings.py +0 -121
- data_designer/config/errors.py +0 -24
- data_designer/config/exports.py +0 -145
- data_designer/config/interface.py +0 -55
- data_designer/config/models.py +0 -455
- data_designer/config/preview_results.py +0 -41
- data_designer/config/processors.py +0 -148
- data_designer/config/run_config.py +0 -48
- data_designer/config/sampler_constraints.py +0 -52
- data_designer/config/sampler_params.py +0 -639
- data_designer/config/seed.py +0 -116
- data_designer/config/seed_source.py +0 -84
- data_designer/config/seed_source_types.py +0 -19
- data_designer/config/utils/code_lang.py +0 -82
- data_designer/config/utils/constants.py +0 -363
- data_designer/config/utils/errors.py +0 -21
- data_designer/config/utils/info.py +0 -94
- data_designer/config/utils/io_helpers.py +0 -258
- data_designer/config/utils/misc.py +0 -78
- data_designer/config/utils/numerical_helpers.py +0 -30
- data_designer/config/utils/type_helpers.py +0 -106
- data_designer/config/utils/visualization.py +0 -482
- data_designer/config/validator_params.py +0 -94
- data_designer/engine/__init__.py +0 -2
- data_designer/engine/analysis/column_profilers/base.py +0 -49
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
- data_designer/engine/analysis/column_profilers/registry.py +0 -22
- data_designer/engine/analysis/column_statistics.py +0 -145
- data_designer/engine/analysis/dataset_profiler.py +0 -149
- data_designer/engine/analysis/errors.py +0 -9
- data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
- data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
- data_designer/engine/column_generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/__init__.py +0 -2
- data_designer/engine/column_generators/generators/base.py +0 -122
- data_designer/engine/column_generators/generators/embedding.py +0 -35
- data_designer/engine/column_generators/generators/expression.py +0 -55
- data_designer/engine/column_generators/generators/llm_completion.py +0 -113
- data_designer/engine/column_generators/generators/samplers.py +0 -69
- data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
- data_designer/engine/column_generators/generators/validation.py +0 -140
- data_designer/engine/column_generators/registry.py +0 -60
- data_designer/engine/column_generators/utils/errors.py +0 -15
- data_designer/engine/column_generators/utils/generator_classification.py +0 -43
- data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
- data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
- data_designer/engine/compiler.py +0 -97
- data_designer/engine/configurable_task.py +0 -71
- data_designer/engine/dataset_builders/artifact_storage.py +0 -283
- data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
- data_designer/engine/dataset_builders/errors.py +0 -15
- data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
- data_designer/engine/dataset_builders/utils/__init__.py +0 -2
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
- data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
- data_designer/engine/dataset_builders/utils/dag.py +0 -62
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
- data_designer/engine/dataset_builders/utils/errors.py +0 -15
- data_designer/engine/errors.py +0 -51
- data_designer/engine/model_provider.py +0 -77
- data_designer/engine/models/__init__.py +0 -2
- data_designer/engine/models/errors.py +0 -300
- data_designer/engine/models/facade.py +0 -287
- data_designer/engine/models/factory.py +0 -42
- data_designer/engine/models/litellm_overrides.py +0 -179
- data_designer/engine/models/parsers/__init__.py +0 -2
- data_designer/engine/models/parsers/errors.py +0 -34
- data_designer/engine/models/parsers/parser.py +0 -235
- data_designer/engine/models/parsers/postprocessors.py +0 -93
- data_designer/engine/models/parsers/tag_parsers.py +0 -62
- data_designer/engine/models/parsers/types.py +0 -84
- data_designer/engine/models/recipes/base.py +0 -81
- data_designer/engine/models/recipes/response_recipes.py +0 -293
- data_designer/engine/models/registry.py +0 -146
- data_designer/engine/models/telemetry.py +0 -359
- data_designer/engine/models/usage.py +0 -73
- data_designer/engine/models/utils.py +0 -38
- data_designer/engine/processing/ginja/__init__.py +0 -2
- data_designer/engine/processing/ginja/ast.py +0 -65
- data_designer/engine/processing/ginja/environment.py +0 -463
- data_designer/engine/processing/ginja/exceptions.py +0 -56
- data_designer/engine/processing/ginja/record.py +0 -32
- data_designer/engine/processing/gsonschema/__init__.py +0 -2
- data_designer/engine/processing/gsonschema/exceptions.py +0 -15
- data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
- data_designer/engine/processing/gsonschema/types.py +0 -10
- data_designer/engine/processing/gsonschema/validators.py +0 -202
- data_designer/engine/processing/processors/base.py +0 -13
- data_designer/engine/processing/processors/drop_columns.py +0 -42
- data_designer/engine/processing/processors/registry.py +0 -25
- data_designer/engine/processing/processors/schema_transform.py +0 -49
- data_designer/engine/processing/utils.py +0 -169
- data_designer/engine/registry/base.py +0 -99
- data_designer/engine/registry/data_designer_registry.py +0 -39
- data_designer/engine/registry/errors.py +0 -12
- data_designer/engine/resources/managed_dataset_generator.py +0 -39
- data_designer/engine/resources/managed_dataset_repository.py +0 -197
- data_designer/engine/resources/managed_storage.py +0 -65
- data_designer/engine/resources/resource_provider.py +0 -77
- data_designer/engine/resources/seed_reader.py +0 -154
- data_designer/engine/sampling_gen/column.py +0 -91
- data_designer/engine/sampling_gen/constraints.py +0 -100
- data_designer/engine/sampling_gen/data_sources/base.py +0 -217
- data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
- data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
- data_designer/engine/sampling_gen/entities/__init__.py +0 -2
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
- data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
- data_designer/engine/sampling_gen/entities/errors.py +0 -10
- data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
- data_designer/engine/sampling_gen/entities/person.py +0 -144
- data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
- data_designer/engine/sampling_gen/errors.py +0 -26
- data_designer/engine/sampling_gen/generator.py +0 -122
- data_designer/engine/sampling_gen/jinja_utils.py +0 -64
- data_designer/engine/sampling_gen/people_gen.py +0 -199
- data_designer/engine/sampling_gen/person_constants.py +0 -56
- data_designer/engine/sampling_gen/schema.py +0 -147
- data_designer/engine/sampling_gen/schema_builder.py +0 -61
- data_designer/engine/sampling_gen/utils.py +0 -46
- data_designer/engine/secret_resolver.py +0 -82
- data_designer/engine/validation.py +0 -367
- data_designer/engine/validators/__init__.py +0 -19
- data_designer/engine/validators/base.py +0 -38
- data_designer/engine/validators/local_callable.py +0 -39
- data_designer/engine/validators/python.py +0 -254
- data_designer/engine/validators/remote.py +0 -89
- data_designer/engine/validators/sql.py +0 -65
- data_designer/errors.py +0 -7
- data_designer/essentials/__init__.py +0 -33
- data_designer/lazy_heavy_imports.py +0 -54
- data_designer/logging.py +0 -163
- data_designer/plugin_manager.py +0 -78
- data_designer/plugins/__init__.py +0 -8
- data_designer/plugins/errors.py +0 -15
- data_designer/plugins/plugin.py +0 -141
- data_designer/plugins/registry.py +0 -88
- data_designer/plugins/testing/__init__.py +0 -10
- data_designer/plugins/testing/stubs.py +0 -116
- data_designer/plugins/testing/utils.py +0 -20
- data_designer-0.3.8rc1.dist-info/RECORD +0 -196
- data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
- {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c)
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
@@ -1,2 +1,22 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c)
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from data_designer.config.default_model_settings import resolve_seed_default_model_settings
|
|
5
|
+
from data_designer.interface.data_designer import DataDesigner
|
|
6
|
+
from data_designer.interface.errors import (
|
|
7
|
+
DataDesignerGenerationError,
|
|
8
|
+
DataDesignerProfilingError,
|
|
9
|
+
)
|
|
10
|
+
from data_designer.interface.results import DatasetCreationResults
|
|
11
|
+
from data_designer.logging import configure_logging
|
|
12
|
+
|
|
13
|
+
configure_logging()
|
|
14
|
+
resolve_seed_default_model_settings()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"DataDesigner",
|
|
19
|
+
"DataDesignerGenerationError",
|
|
20
|
+
"DataDesignerProfilingError",
|
|
21
|
+
"DatasetCreationResults",
|
|
22
|
+
]
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.3.8rc1'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 3, 8, 'rc1')
|
|
31
|
+
__version__ = version = '0.4.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 4, 0)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -12,9 +12,9 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
|
12
12
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
13
13
|
from data_designer.config.default_model_settings import (
|
|
14
14
|
get_default_model_configs,
|
|
15
|
-
get_default_model_providers_missing_api_keys,
|
|
16
15
|
get_default_provider_name,
|
|
17
16
|
get_default_providers,
|
|
17
|
+
get_providers_with_missing_api_keys,
|
|
18
18
|
)
|
|
19
19
|
from data_designer.config.interface import DataDesignerInterface
|
|
20
20
|
from data_designer.config.models import (
|
|
@@ -28,7 +28,6 @@ from data_designer.config.utils.constants import (
|
|
|
28
28
|
MANAGED_ASSETS_PATH,
|
|
29
29
|
MODEL_CONFIGS_FILE_PATH,
|
|
30
30
|
MODEL_PROVIDERS_FILE_PATH,
|
|
31
|
-
PREDEFINED_PROVIDERS,
|
|
32
31
|
)
|
|
33
32
|
from data_designer.config.utils.info import InfoType, InterfaceInfo
|
|
34
33
|
from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
|
|
@@ -317,13 +316,8 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
317
316
|
|
|
318
317
|
Args:
|
|
319
318
|
run_config: A RunConfig instance containing runtime settings such as
|
|
320
|
-
early shutdown behavior
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
Example:
|
|
324
|
-
>>> from data_designer.essentials import DataDesigner, RunConfig
|
|
325
|
-
>>> dd = DataDesigner()
|
|
326
|
-
>>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
|
|
319
|
+
early shutdown behavior, batch sizing via `buffer_size`, and non-inference worker
|
|
320
|
+
concurrency via `non_inference_max_parallel_workers`.
|
|
327
321
|
|
|
328
322
|
Notes:
|
|
329
323
|
When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
|
|
@@ -334,8 +328,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
334
328
|
def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
|
|
335
329
|
if model_providers is None:
|
|
336
330
|
model_providers = get_default_providers()
|
|
337
|
-
|
|
338
|
-
|
|
331
|
+
# Check which providers have missing API keys (from YAML file or env vars)
|
|
332
|
+
providers_with_missing_keys = get_providers_with_missing_api_keys(model_providers)
|
|
333
|
+
|
|
334
|
+
if len(providers_with_missing_keys) == len(model_providers):
|
|
335
|
+
# All providers have missing API keys
|
|
339
336
|
logger.warning(
|
|
340
337
|
"🚨 You are trying to use a default model provider but your API keys are missing."
|
|
341
338
|
"\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: data-designer
|
|
3
|
-
Version: 0.3.8rc1
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: General framework for synthetic data generation
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
|
-
License-File: LICENSE
|
|
7
6
|
Classifier: Development Status :: 4 - Beta
|
|
8
7
|
Classifier: Intended Audience :: Developers
|
|
9
8
|
Classifier: Intended Audience :: Science/Research
|
|
@@ -15,33 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
15
14
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
15
|
Classifier: Topic :: Software Development
|
|
17
16
|
Requires-Python: >=3.10
|
|
18
|
-
Requires-Dist:
|
|
19
|
-
Requires-Dist:
|
|
20
|
-
Requires-Dist: faker<21,>=20.1.0
|
|
21
|
-
Requires-Dist: httpx-retries<1,>=0.4.2
|
|
22
|
-
Requires-Dist: httpx<1,>=0.27.2
|
|
23
|
-
Requires-Dist: huggingface-hub<2,>=1.0.1
|
|
24
|
-
Requires-Dist: jinja2<4,>=3.1.6
|
|
25
|
-
Requires-Dist: json-repair<1,>=0.48.0
|
|
26
|
-
Requires-Dist: jsonpath-rust-bindings<2,>=1.0
|
|
27
|
-
Requires-Dist: litellm<1.80.12,>=1.73.6
|
|
28
|
-
Requires-Dist: lxml<7,>=6.0.2
|
|
29
|
-
Requires-Dist: marko<3,>=2.1.2
|
|
30
|
-
Requires-Dist: networkx<4,>=3.0
|
|
31
|
-
Requires-Dist: numpy<3,>=1.23.5
|
|
32
|
-
Requires-Dist: pandas<3,>=2.3.3
|
|
17
|
+
Requires-Dist: data-designer-config
|
|
18
|
+
Requires-Dist: data-designer-engine
|
|
33
19
|
Requires-Dist: prompt-toolkit<4,>=3.0.0
|
|
34
|
-
Requires-Dist: pyarrow<20,>=19.0.1
|
|
35
|
-
Requires-Dist: pydantic[email]<3,>=2.9.2
|
|
36
|
-
Requires-Dist: pygments<3,>=2.19.2
|
|
37
|
-
Requires-Dist: python-json-logger<4,>=3
|
|
38
|
-
Requires-Dist: pyyaml<7,>=6.0.1
|
|
39
|
-
Requires-Dist: requests<3,>=2.32.2
|
|
40
|
-
Requires-Dist: rich<15,>=13.7.1
|
|
41
|
-
Requires-Dist: ruff<1,>=0.14.10
|
|
42
|
-
Requires-Dist: scipy<2,>=1.11.0
|
|
43
|
-
Requires-Dist: sqlfluff<4,>=3.2.0
|
|
44
|
-
Requires-Dist: tiktoken<1,>=0.8.0
|
|
45
20
|
Requires-Dist: typer<1,>=0.12.0
|
|
46
21
|
Description-Content-Type: text/markdown
|
|
47
22
|
|
|
@@ -104,26 +79,19 @@ export OPENROUTER_API_KEY="your-openrouter-api-key-here"
|
|
|
104
79
|
|
|
105
80
|
### 3. Start generating data!
|
|
106
81
|
```python
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
DataDesigner,
|
|
110
|
-
DataDesignerConfigBuilder,
|
|
111
|
-
LLMTextColumnConfig,
|
|
112
|
-
PersonSamplerParams,
|
|
113
|
-
SamplerColumnConfig,
|
|
114
|
-
SamplerType,
|
|
115
|
-
)
|
|
82
|
+
import data_designer.config as dd
|
|
83
|
+
from data_designer.interface import DataDesigner
|
|
116
84
|
|
|
117
85
|
# Initialize with default settings
|
|
118
86
|
data_designer = DataDesigner()
|
|
119
|
-
config_builder = DataDesignerConfigBuilder()
|
|
87
|
+
config_builder = dd.DataDesignerConfigBuilder()
|
|
120
88
|
|
|
121
89
|
# Add a product category
|
|
122
90
|
config_builder.add_column(
|
|
123
|
-
SamplerColumnConfig(
|
|
91
|
+
dd.SamplerColumnConfig(
|
|
124
92
|
name="product_category",
|
|
125
|
-
sampler_type=SamplerType.CATEGORY,
|
|
126
|
-
params=CategorySamplerParams(
|
|
93
|
+
sampler_type=dd.SamplerType.CATEGORY,
|
|
94
|
+
params=dd.CategorySamplerParams(
|
|
127
95
|
values=["Electronics", "Clothing", "Home & Kitchen", "Books"],
|
|
128
96
|
),
|
|
129
97
|
)
|
|
@@ -131,7 +99,7 @@ config_builder.add_column(
|
|
|
131
99
|
|
|
132
100
|
# Generate personalized customer reviews
|
|
133
101
|
config_builder.add_column(
|
|
134
|
-
LLMTextColumnConfig(
|
|
102
|
+
dd.LLMTextColumnConfig(
|
|
135
103
|
name="review",
|
|
136
104
|
model_alias="nvidia-text",
|
|
137
105
|
prompt="Write a brief product review for a {{ product_category }} item you recently purchased.",
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
data_designer/cli/README.md,sha256=uPE3KdlF5Y3H8pQc8c6ZZ3h6YSFXNQW-iEXGQJuVnI4,9026
|
|
2
|
+
data_designer/cli/__init__.py,sha256=--5yQzMciTX8-vroyXyFNBCqQ0HQd67GWCwnIoIHhJ4,251
|
|
3
|
+
data_designer/cli/main.py,sha256=1klKdUKPZTgmUbduHSzEFueQHWkc-42Gcbri25cjiHo,1974
|
|
4
|
+
data_designer/cli/ui.py,sha256=IgpV_Ht6qmLFrT3ybgOoADTQthoSGJxrwds38o1Zz10,17632
|
|
5
|
+
data_designer/cli/utils.py,sha256=yyKZfr4ndcsngKgmpj5r4fN7fP6ouX-Nwx1Go5s6SdM,2151
|
|
6
|
+
data_designer/cli/commands/__init__.py,sha256=ObZ6NUPeEvvpGTJ5WIGKUyIrIjaI747OM6ErweRtHxQ,137
|
|
7
|
+
data_designer/cli/commands/download.py,sha256=bTynzORVj1rftrrQhmTj6se-ITi2_L7Z3qtio1mLvXU,1770
|
|
8
|
+
data_designer/cli/commands/list.py,sha256=Lu02qFTkhEkLX2e7ak_rHmoO8_4Jjrgy4Yua-EAtyHo,4091
|
|
9
|
+
data_designer/cli/commands/models.py,sha256=Ot4eWyEbCS7heG_bylBdWZ1qj4CILv_hTddm2VdY0Dc,428
|
|
10
|
+
data_designer/cli/commands/providers.py,sha256=-zVNtE_0A0hifcUk6n3c_v_Olcd14mHt3N8_HahHTQ4,491
|
|
11
|
+
data_designer/cli/commands/reset.py,sha256=iCNjkFNdGU6Y7rv-Fprl9ZW60riseL_R7CrYi6DrwR0,3514
|
|
12
|
+
data_designer/cli/controllers/__init__.py,sha256=70il4GIKebdau43nCXyu4VcQj7IFNoxxjEo1Z3hm8_M,491
|
|
13
|
+
data_designer/cli/controllers/download_controller.py,sha256=9lQo-njn890WJiewGazfd6SrBBA4Rj8LYFkXZG_phPI,8117
|
|
14
|
+
data_designer/cli/controllers/model_controller.py,sha256=CZimP1npWwH8UrJXlfMIfbNEn9pcJKtg14CqgArbqQM,9020
|
|
15
|
+
data_designer/cli/controllers/provider_controller.py,sha256=mSLHkc60lu9VsXJE2NNpFZ6zHkasz6UQLtoTBYoFtkA,12293
|
|
16
|
+
data_designer/cli/forms/__init__.py,sha256=UpTr7s5q2GFFssNz3229Kb5JxvFOqtZ55XpifB9a15w,713
|
|
17
|
+
data_designer/cli/forms/builder.py,sha256=Juem3wB2j1KXtZZY7wVP0-eWKK_tj_0-L8Zq9EAS0-k,1731
|
|
18
|
+
data_designer/cli/forms/field.py,sha256=TYEQLqjMvYBS_ftf6Ms-D5J6TOIK9NNe-Ydvo5Nkq50,7543
|
|
19
|
+
data_designer/cli/forms/form.py,sha256=wFdKS0WfuhfotRtwWZgJyN2HrTthI7Kx07NUoQV2DtM,2066
|
|
20
|
+
data_designer/cli/forms/model_builder.py,sha256=DPggV2cl-XQPUiVhrrGO_4_d7jTn5_kBeBn9oTw-V1U,13354
|
|
21
|
+
data_designer/cli/forms/provider_builder.py,sha256=YA6IoLwV39Sh6w0lZYoF25m-ryhnqBqysXLUo4V5X-w,2936
|
|
22
|
+
data_designer/cli/repositories/__init__.py,sha256=ukHlLpOimH9CCJsdW5U9tooV_oFWQ4iLGK5GNi5YXtM,475
|
|
23
|
+
data_designer/cli/repositories/base.py,sha256=ofOAHeAYAL6Bm4EJjSOFaNPD-odm2PlrW6quRkQQzaw,1095
|
|
24
|
+
data_designer/cli/repositories/model_repository.py,sha256=oaa5ISP8Y-BRzXOdzAhDHf0FqmWngSEJ8RGOYWeXi4M,1487
|
|
25
|
+
data_designer/cli/repositories/persona_repository.py,sha256=3ZRarD6BYAKVYFs_r9hDkh2nfkKW7BA8KJyfcYP0RRc,2683
|
|
26
|
+
data_designer/cli/repositories/provider_repository.py,sha256=hG6tYbjR3gT8DmXL7usRvMrc6ILws4ECyTZ5imENpuQ,1556
|
|
27
|
+
data_designer/cli/services/__init__.py,sha256=2ycyikXx-8gbYZm-xl6IMyKXLwR4REU5heg6BkUW6qo,455
|
|
28
|
+
data_designer/cli/services/download_service.py,sha256=m_wtDfxAA80tZdIf9kUS3ye8fzKG-3DjfDnm5u0-mJE,3519
|
|
29
|
+
data_designer/cli/services/model_service.py,sha256=cFiP9ZQIprPdrVibUC6uwL-NuCYRgx8XVIjxDV-TznU,3926
|
|
30
|
+
data_designer/cli/services/provider_service.py,sha256=5cou_EWU0RwE9p2PWpRBM9HcPqdENLpkHUuGzQ-l9J4,3957
|
|
31
|
+
data_designer/interface/__init__.py,sha256=2LbGosKhVhNXSUj-MX00b6UJRW-qeyiQ7PdEwtJxwso,718
|
|
32
|
+
data_designer/interface/_version.py,sha256=2_0GUP7yBCXRus-qiJKxQD62z172WSs1sQ6DVpPsbmM,704
|
|
33
|
+
data_designer/interface/data_designer.py,sha256=0LBAUL7W75EmMwz-f4Lr0my9BXg1OTR2hKpgNsWnqLk,17275
|
|
34
|
+
data_designer/interface/errors.py,sha256=Ft9GMeIrOHJv_PC_1rU6hWcNyq1GHdsFYZSc9HnUrxU,606
|
|
35
|
+
data_designer/interface/results.py,sha256=3fGwlhif4ufqUGh-EgsGccrob4S6a7WZ6BgFiszTo_A,3871
|
|
36
|
+
data_designer-0.4.0.dist-info/METADATA,sha256=yKQ114sG3AInb2Hqi5d0Totv_0xt7RUwjd7VpXFQ__w,7152
|
|
37
|
+
data_designer-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
38
|
+
data_designer-0.4.0.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
|
|
39
|
+
data_designer-0.4.0.dist-info/RECORD,,
|
data_designer/__init__.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
try:
|
|
7
|
-
from data_designer._version import __version__
|
|
8
|
-
except ImportError:
|
|
9
|
-
# Fallback for editable installs without build
|
|
10
|
-
try:
|
|
11
|
-
from importlib.metadata import version
|
|
12
|
-
|
|
13
|
-
__version__ = version("data-designer")
|
|
14
|
-
except Exception:
|
|
15
|
-
__version__ = "0.0.0.dev0+unknown"
|
|
16
|
-
|
|
17
|
-
__all__ = ["__version__"]
|
data_designer/config/__init__.py
DELETED
|
@@ -1,159 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from abc import ABC
|
|
7
|
-
from enum import Enum
|
|
8
|
-
|
|
9
|
-
from pydantic import BaseModel, Field
|
|
10
|
-
from rich.panel import Panel
|
|
11
|
-
from rich.table import Column, Table
|
|
12
|
-
from typing_extensions import TypeAlias
|
|
13
|
-
|
|
14
|
-
from data_designer.config.analysis.column_statistics import (
|
|
15
|
-
CategoricalDistribution,
|
|
16
|
-
CategoricalHistogramData,
|
|
17
|
-
ColumnDistributionType,
|
|
18
|
-
MissingValue,
|
|
19
|
-
NumericalDistribution,
|
|
20
|
-
)
|
|
21
|
-
from data_designer.config.analysis.utils.reporting import TITLE_STYLE, create_judge_score_summary_table
|
|
22
|
-
from data_designer.config.base import ConfigBase
|
|
23
|
-
from data_designer.config.utils.visualization import ColorPalette
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class ColumnProfilerType(str, Enum):
|
|
27
|
-
JUDGE_SCORE = "judge-score"
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class ColumnProfilerResults(BaseModel, ABC):
|
|
31
|
-
"""Abstract base class for column profiler results.
|
|
32
|
-
|
|
33
|
-
Stores results from column profiling operations. Subclasses hold profiler-specific
|
|
34
|
-
analysis results and provide methods for generating formatted report sections for display.
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
def create_report_section(self) -> Panel:
|
|
38
|
-
"""Creates a Rich Panel containing the formatted profiler results for display.
|
|
39
|
-
|
|
40
|
-
Returns:
|
|
41
|
-
A Rich Panel containing the formatted profiler results. Default implementation
|
|
42
|
-
returns a "Not Implemented" message; subclasses should override to provide
|
|
43
|
-
specific formatting.
|
|
44
|
-
"""
|
|
45
|
-
return Panel(
|
|
46
|
-
f"Report section generation not implemented for '{self.__class__.__name__}'.",
|
|
47
|
-
title="Not Implemented",
|
|
48
|
-
border_style=f"bold {ColorPalette.YELLOW.value}",
|
|
49
|
-
padding=(1, 2),
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
class JudgeScoreProfilerConfig(ConfigBase):
|
|
54
|
-
"""Configuration for the LLM-as-a-judge score profiler.
|
|
55
|
-
|
|
56
|
-
Attributes:
|
|
57
|
-
model_alias: Alias of the LLM model to use for generating score distribution summaries.
|
|
58
|
-
Must match a model alias defined in the Data Designer configuration.
|
|
59
|
-
summary_score_sample_size: Number of score samples to include when prompting the LLM
|
|
60
|
-
to generate summaries. Larger sample sizes provide more context but increase
|
|
61
|
-
token usage. Must be at least 1. Defaults to 20.
|
|
62
|
-
"""
|
|
63
|
-
|
|
64
|
-
model_alias: str
|
|
65
|
-
summary_score_sample_size: int | None = Field(default=20, ge=1)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class JudgeScoreSample(BaseModel):
|
|
69
|
-
"""Container for a single judge score and its associated reasoning.
|
|
70
|
-
|
|
71
|
-
Stores a paired score-reasoning sample extracted from an LLM-as-a-judge column.
|
|
72
|
-
Used when generating summaries to provide the LLM with examples of scoring patterns.
|
|
73
|
-
|
|
74
|
-
Attributes:
|
|
75
|
-
score: The score value assigned by the judge. Can be numeric (int) or categorical (str).
|
|
76
|
-
reasoning: The reasoning or explanation provided by the judge for this score.
|
|
77
|
-
"""
|
|
78
|
-
|
|
79
|
-
score: int | str
|
|
80
|
-
reasoning: str
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
class JudgeScoreDistributions(BaseModel):
|
|
84
|
-
"""Container for computed distributions across all judge score dimensions.
|
|
85
|
-
|
|
86
|
-
Stores the complete distribution analysis for all score dimensions in an LLM-as-a-judge
|
|
87
|
-
column. Each score dimension (e.g., "relevance", "fluency") has its own distribution
|
|
88
|
-
computed from the generated data.
|
|
89
|
-
|
|
90
|
-
Attributes:
|
|
91
|
-
scores: Mapping of each score dimension name to its list of score values.
|
|
92
|
-
reasoning: Mapping of each score dimension name to its list of reasoning texts.
|
|
93
|
-
distribution_types: Mapping of each score dimension name to its classification.
|
|
94
|
-
distributions: Mapping of each score dimension name to its computed distribution statistics.
|
|
95
|
-
histograms: Mapping of each score dimension name to its histogram data.
|
|
96
|
-
"""
|
|
97
|
-
|
|
98
|
-
scores: dict[str, list[int | str]]
|
|
99
|
-
reasoning: dict[str, list[str]]
|
|
100
|
-
distribution_types: dict[str, ColumnDistributionType]
|
|
101
|
-
distributions: dict[str, CategoricalDistribution | NumericalDistribution | MissingValue]
|
|
102
|
-
histograms: dict[str, CategoricalHistogramData | MissingValue]
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
class JudgeScoreSummary(BaseModel):
|
|
106
|
-
"""Container for an LLM-generated summary of a judge score dimension.
|
|
107
|
-
|
|
108
|
-
Stores the natural language summary and sample data for a single score dimension
|
|
109
|
-
generated by the judge score profiler. The summary is created by an LLM analyzing
|
|
110
|
-
the distribution and patterns in the score-reasoning pairs.
|
|
111
|
-
|
|
112
|
-
Attributes:
|
|
113
|
-
score_name: Name of the score dimension being summarized (e.g., "relevance", "fluency").
|
|
114
|
-
summary: LLM-generated natural language summary describing the scoring patterns,
|
|
115
|
-
distribution characteristics, and notable trends for this score dimension.
|
|
116
|
-
score_samples: List of score-reasoning pairs that were used to generate the summary.
|
|
117
|
-
These are the examples of the scoring behavior that were used to generate the summary.
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
score_name: str
|
|
121
|
-
summary: str
|
|
122
|
-
score_samples: list[JudgeScoreSample]
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
class JudgeScoreProfilerResults(ColumnProfilerResults):
|
|
126
|
-
"""Container for complete judge score profiler analysis results.
|
|
127
|
-
|
|
128
|
-
Attributes:
|
|
129
|
-
column_name: Name of the judge column that was profiled.
|
|
130
|
-
summaries: Mapping of each score dimension name to its LLM-generated summary.
|
|
131
|
-
score_distributions: Complete distribution analysis across all score dimensions.
|
|
132
|
-
"""
|
|
133
|
-
|
|
134
|
-
column_name: str
|
|
135
|
-
summaries: dict[str, JudgeScoreSummary]
|
|
136
|
-
score_distributions: JudgeScoreDistributions | MissingValue
|
|
137
|
-
|
|
138
|
-
def create_report_section(self) -> Panel:
|
|
139
|
-
layout = Table.grid(Column(), expand=True, padding=(2, 0))
|
|
140
|
-
|
|
141
|
-
for score_name in self.summaries.keys():
|
|
142
|
-
layout.add_row(
|
|
143
|
-
create_judge_score_summary_table(
|
|
144
|
-
score_name=score_name,
|
|
145
|
-
histogram=self.score_distributions.histograms[score_name],
|
|
146
|
-
summary=self.summaries[score_name].summary,
|
|
147
|
-
)
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
return Panel(
|
|
151
|
-
layout,
|
|
152
|
-
title=f"[{TITLE_STYLE}]LLM-as-a-Judge Score Profile: '{self.column_name}'[/{TITLE_STYLE}]",
|
|
153
|
-
padding=(1, 2),
|
|
154
|
-
)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
ColumnProfilerConfigT: TypeAlias = JudgeScoreProfilerConfig
|
|
158
|
-
|
|
159
|
-
ColumnProfilerResultsT: TypeAlias = JudgeScoreProfilerResults
|