data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,2 +1,2 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
@@ -1,2 +1,22 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from data_designer.config.default_model_settings import resolve_seed_default_model_settings
5
+ from data_designer.interface.data_designer import DataDesigner
6
+ from data_designer.interface.errors import (
7
+ DataDesignerGenerationError,
8
+ DataDesignerProfilingError,
9
+ )
10
+ from data_designer.interface.results import DatasetCreationResults
11
+ from data_designer.logging import configure_logging
12
+
13
+ configure_logging()
14
+ resolve_seed_default_model_settings()
15
+
16
+
17
+ __all__ = [
18
+ "DataDesigner",
19
+ "DataDesignerGenerationError",
20
+ "DataDesignerProfilingError",
21
+ "DatasetCreationResults",
22
+ ]
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.8rc2'
32
- __version_tuple__ = version_tuple = (0, 3, 8, 'rc2')
31
+ __version__ = version = '0.4.0'
32
+ __version_tuple__ = version_tuple = (0, 4, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -317,13 +317,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
317
317
  Args:
318
318
  run_config: A RunConfig instance containing runtime settings such as
319
319
  early shutdown behavior, batch sizing via `buffer_size`, and non-inference worker
320
- concurrency via `non_inference_max_parallel_workers`. Import RunConfig from
321
- data_designer.essentials.
322
-
323
- Example:
324
- >>> from data_designer.essentials import DataDesigner, RunConfig
325
- >>> dd = DataDesigner()
326
- >>> dd.set_run_config(RunConfig(disable_early_shutdown=True))
320
+ concurrency via `non_inference_max_parallel_workers`.
327
321
 
328
322
  Notes:
329
323
  When `disable_early_shutdown=True`, DataDesigner will never terminate generation early
@@ -1,9 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.3.8rc2
3
+ Version: 0.4.0
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
- License-File: LICENSE
7
6
  Classifier: Development Status :: 4 - Beta
8
7
  Classifier: Intended Audience :: Developers
9
8
  Classifier: Intended Audience :: Science/Research
@@ -15,33 +14,9 @@ Classifier: Programming Language :: Python :: 3.13
15
14
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
15
  Classifier: Topic :: Software Development
17
16
  Requires-Python: >=3.10
18
- Requires-Dist: anyascii<1,>=0.3.3
19
- Requires-Dist: duckdb<2,>=1.1.3
20
- Requires-Dist: faker<21,>=20.1.0
21
- Requires-Dist: httpx-retries<1,>=0.4.2
22
- Requires-Dist: httpx<1,>=0.27.2
23
- Requires-Dist: huggingface-hub<2,>=1.0.1
24
- Requires-Dist: jinja2<4,>=3.1.6
25
- Requires-Dist: json-repair<1,>=0.48.0
26
- Requires-Dist: jsonpath-rust-bindings<2,>=1.0
27
- Requires-Dist: litellm<1.80.12,>=1.73.6
28
- Requires-Dist: lxml<7,>=6.0.2
29
- Requires-Dist: marko<3,>=2.1.2
30
- Requires-Dist: networkx<4,>=3.0
31
- Requires-Dist: numpy<3,>=1.23.5
32
- Requires-Dist: pandas<3,>=2.3.3
17
+ Requires-Dist: data-designer-config
18
+ Requires-Dist: data-designer-engine
33
19
  Requires-Dist: prompt-toolkit<4,>=3.0.0
34
- Requires-Dist: pyarrow<20,>=19.0.1
35
- Requires-Dist: pydantic[email]<3,>=2.9.2
36
- Requires-Dist: pygments<3,>=2.19.2
37
- Requires-Dist: python-json-logger<4,>=3
38
- Requires-Dist: pyyaml<7,>=6.0.1
39
- Requires-Dist: requests<3,>=2.32.2
40
- Requires-Dist: rich<15,>=13.7.1
41
- Requires-Dist: ruff<1,>=0.14.10
42
- Requires-Dist: scipy<2,>=1.11.0
43
- Requires-Dist: sqlfluff<4,>=3.2.0
44
- Requires-Dist: tiktoken<1,>=0.8.0
45
20
  Requires-Dist: typer<1,>=0.12.0
46
21
  Description-Content-Type: text/markdown
47
22
 
@@ -104,26 +79,19 @@ export OPENROUTER_API_KEY="your-openrouter-api-key-here"
104
79
 
105
80
  ### 3. Start generating data!
106
81
  ```python
107
- from data_designer.essentials import (
108
- CategorySamplerParams,
109
- DataDesigner,
110
- DataDesignerConfigBuilder,
111
- LLMTextColumnConfig,
112
- PersonSamplerParams,
113
- SamplerColumnConfig,
114
- SamplerType,
115
- )
82
+ import data_designer.config as dd
83
+ from data_designer.interface import DataDesigner
116
84
 
117
85
  # Initialize with default settings
118
86
  data_designer = DataDesigner()
119
- config_builder = DataDesignerConfigBuilder()
87
+ config_builder = dd.DataDesignerConfigBuilder()
120
88
 
121
89
  # Add a product category
122
90
  config_builder.add_column(
123
- SamplerColumnConfig(
91
+ dd.SamplerColumnConfig(
124
92
  name="product_category",
125
- sampler_type=SamplerType.CATEGORY,
126
- params=CategorySamplerParams(
93
+ sampler_type=dd.SamplerType.CATEGORY,
94
+ params=dd.CategorySamplerParams(
127
95
  values=["Electronics", "Clothing", "Home & Kitchen", "Books"],
128
96
  ),
129
97
  )
@@ -131,7 +99,7 @@ config_builder.add_column(
131
99
 
132
100
  # Generate personalized customer reviews
133
101
  config_builder.add_column(
134
- LLMTextColumnConfig(
102
+ dd.LLMTextColumnConfig(
135
103
  name="review",
136
104
  model_alias="nvidia-text",
137
105
  prompt="Write a brief product review for a {{ product_category }} item you recently purchased.",
@@ -0,0 +1,39 @@
1
+ data_designer/cli/README.md,sha256=uPE3KdlF5Y3H8pQc8c6ZZ3h6YSFXNQW-iEXGQJuVnI4,9026
2
+ data_designer/cli/__init__.py,sha256=--5yQzMciTX8-vroyXyFNBCqQ0HQd67GWCwnIoIHhJ4,251
3
+ data_designer/cli/main.py,sha256=1klKdUKPZTgmUbduHSzEFueQHWkc-42Gcbri25cjiHo,1974
4
+ data_designer/cli/ui.py,sha256=IgpV_Ht6qmLFrT3ybgOoADTQthoSGJxrwds38o1Zz10,17632
5
+ data_designer/cli/utils.py,sha256=yyKZfr4ndcsngKgmpj5r4fN7fP6ouX-Nwx1Go5s6SdM,2151
6
+ data_designer/cli/commands/__init__.py,sha256=ObZ6NUPeEvvpGTJ5WIGKUyIrIjaI747OM6ErweRtHxQ,137
7
+ data_designer/cli/commands/download.py,sha256=bTynzORVj1rftrrQhmTj6se-ITi2_L7Z3qtio1mLvXU,1770
8
+ data_designer/cli/commands/list.py,sha256=Lu02qFTkhEkLX2e7ak_rHmoO8_4Jjrgy4Yua-EAtyHo,4091
9
+ data_designer/cli/commands/models.py,sha256=Ot4eWyEbCS7heG_bylBdWZ1qj4CILv_hTddm2VdY0Dc,428
10
+ data_designer/cli/commands/providers.py,sha256=-zVNtE_0A0hifcUk6n3c_v_Olcd14mHt3N8_HahHTQ4,491
11
+ data_designer/cli/commands/reset.py,sha256=iCNjkFNdGU6Y7rv-Fprl9ZW60riseL_R7CrYi6DrwR0,3514
12
+ data_designer/cli/controllers/__init__.py,sha256=70il4GIKebdau43nCXyu4VcQj7IFNoxxjEo1Z3hm8_M,491
13
+ data_designer/cli/controllers/download_controller.py,sha256=9lQo-njn890WJiewGazfd6SrBBA4Rj8LYFkXZG_phPI,8117
14
+ data_designer/cli/controllers/model_controller.py,sha256=CZimP1npWwH8UrJXlfMIfbNEn9pcJKtg14CqgArbqQM,9020
15
+ data_designer/cli/controllers/provider_controller.py,sha256=mSLHkc60lu9VsXJE2NNpFZ6zHkasz6UQLtoTBYoFtkA,12293
16
+ data_designer/cli/forms/__init__.py,sha256=UpTr7s5q2GFFssNz3229Kb5JxvFOqtZ55XpifB9a15w,713
17
+ data_designer/cli/forms/builder.py,sha256=Juem3wB2j1KXtZZY7wVP0-eWKK_tj_0-L8Zq9EAS0-k,1731
18
+ data_designer/cli/forms/field.py,sha256=TYEQLqjMvYBS_ftf6Ms-D5J6TOIK9NNe-Ydvo5Nkq50,7543
19
+ data_designer/cli/forms/form.py,sha256=wFdKS0WfuhfotRtwWZgJyN2HrTthI7Kx07NUoQV2DtM,2066
20
+ data_designer/cli/forms/model_builder.py,sha256=DPggV2cl-XQPUiVhrrGO_4_d7jTn5_kBeBn9oTw-V1U,13354
21
+ data_designer/cli/forms/provider_builder.py,sha256=YA6IoLwV39Sh6w0lZYoF25m-ryhnqBqysXLUo4V5X-w,2936
22
+ data_designer/cli/repositories/__init__.py,sha256=ukHlLpOimH9CCJsdW5U9tooV_oFWQ4iLGK5GNi5YXtM,475
23
+ data_designer/cli/repositories/base.py,sha256=ofOAHeAYAL6Bm4EJjSOFaNPD-odm2PlrW6quRkQQzaw,1095
24
+ data_designer/cli/repositories/model_repository.py,sha256=oaa5ISP8Y-BRzXOdzAhDHf0FqmWngSEJ8RGOYWeXi4M,1487
25
+ data_designer/cli/repositories/persona_repository.py,sha256=3ZRarD6BYAKVYFs_r9hDkh2nfkKW7BA8KJyfcYP0RRc,2683
26
+ data_designer/cli/repositories/provider_repository.py,sha256=hG6tYbjR3gT8DmXL7usRvMrc6ILws4ECyTZ5imENpuQ,1556
27
+ data_designer/cli/services/__init__.py,sha256=2ycyikXx-8gbYZm-xl6IMyKXLwR4REU5heg6BkUW6qo,455
28
+ data_designer/cli/services/download_service.py,sha256=m_wtDfxAA80tZdIf9kUS3ye8fzKG-3DjfDnm5u0-mJE,3519
29
+ data_designer/cli/services/model_service.py,sha256=cFiP9ZQIprPdrVibUC6uwL-NuCYRgx8XVIjxDV-TznU,3926
30
+ data_designer/cli/services/provider_service.py,sha256=5cou_EWU0RwE9p2PWpRBM9HcPqdENLpkHUuGzQ-l9J4,3957
31
+ data_designer/interface/__init__.py,sha256=2LbGosKhVhNXSUj-MX00b6UJRW-qeyiQ7PdEwtJxwso,718
32
+ data_designer/interface/_version.py,sha256=2_0GUP7yBCXRus-qiJKxQD62z172WSs1sQ6DVpPsbmM,704
33
+ data_designer/interface/data_designer.py,sha256=0LBAUL7W75EmMwz-f4Lr0my9BXg1OTR2hKpgNsWnqLk,17275
34
+ data_designer/interface/errors.py,sha256=Ft9GMeIrOHJv_PC_1rU6hWcNyq1GHdsFYZSc9HnUrxU,606
35
+ data_designer/interface/results.py,sha256=3fGwlhif4ufqUGh-EgsGccrob4S6a7WZ6BgFiszTo_A,3871
36
+ data_designer-0.4.0.dist-info/METADATA,sha256=yKQ114sG3AInb2Hqi5d0Totv_0xt7RUwjd7VpXFQ__w,7152
37
+ data_designer-0.4.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ data_designer-0.4.0.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
39
+ data_designer-0.4.0.dist-info/RECORD,,
data_designer/__init__.py DELETED
@@ -1,17 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- try:
7
- from data_designer._version import __version__
8
- except ImportError:
9
- # Fallback for editable installs without build
10
- try:
11
- from importlib.metadata import version
12
-
13
- __version__ = version("data-designer")
14
- except Exception:
15
- __version__ = "0.0.0.dev0+unknown"
16
-
17
- __all__ = ["__version__"]
@@ -1,2 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
@@ -1,2 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
@@ -1,159 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from abc import ABC
7
- from enum import Enum
8
-
9
- from pydantic import BaseModel, Field
10
- from rich.panel import Panel
11
- from rich.table import Column, Table
12
- from typing_extensions import TypeAlias
13
-
14
- from data_designer.config.analysis.column_statistics import (
15
- CategoricalDistribution,
16
- CategoricalHistogramData,
17
- ColumnDistributionType,
18
- MissingValue,
19
- NumericalDistribution,
20
- )
21
- from data_designer.config.analysis.utils.reporting import TITLE_STYLE, create_judge_score_summary_table
22
- from data_designer.config.base import ConfigBase
23
- from data_designer.config.utils.visualization import ColorPalette
24
-
25
-
26
- class ColumnProfilerType(str, Enum):
27
- JUDGE_SCORE = "judge-score"
28
-
29
-
30
- class ColumnProfilerResults(BaseModel, ABC):
31
- """Abstract base class for column profiler results.
32
-
33
- Stores results from column profiling operations. Subclasses hold profiler-specific
34
- analysis results and provide methods for generating formatted report sections for display.
35
- """
36
-
37
- def create_report_section(self) -> Panel:
38
- """Creates a Rich Panel containing the formatted profiler results for display.
39
-
40
- Returns:
41
- A Rich Panel containing the formatted profiler results. Default implementation
42
- returns a "Not Implemented" message; subclasses should override to provide
43
- specific formatting.
44
- """
45
- return Panel(
46
- f"Report section generation not implemented for '{self.__class__.__name__}'.",
47
- title="Not Implemented",
48
- border_style=f"bold {ColorPalette.YELLOW.value}",
49
- padding=(1, 2),
50
- )
51
-
52
-
53
- class JudgeScoreProfilerConfig(ConfigBase):
54
- """Configuration for the LLM-as-a-judge score profiler.
55
-
56
- Attributes:
57
- model_alias: Alias of the LLM model to use for generating score distribution summaries.
58
- Must match a model alias defined in the Data Designer configuration.
59
- summary_score_sample_size: Number of score samples to include when prompting the LLM
60
- to generate summaries. Larger sample sizes provide more context but increase
61
- token usage. Must be at least 1. Defaults to 20.
62
- """
63
-
64
- model_alias: str
65
- summary_score_sample_size: int | None = Field(default=20, ge=1)
66
-
67
-
68
- class JudgeScoreSample(BaseModel):
69
- """Container for a single judge score and its associated reasoning.
70
-
71
- Stores a paired score-reasoning sample extracted from an LLM-as-a-judge column.
72
- Used when generating summaries to provide the LLM with examples of scoring patterns.
73
-
74
- Attributes:
75
- score: The score value assigned by the judge. Can be numeric (int) or categorical (str).
76
- reasoning: The reasoning or explanation provided by the judge for this score.
77
- """
78
-
79
- score: int | str
80
- reasoning: str
81
-
82
-
83
- class JudgeScoreDistributions(BaseModel):
84
- """Container for computed distributions across all judge score dimensions.
85
-
86
- Stores the complete distribution analysis for all score dimensions in an LLM-as-a-judge
87
- column. Each score dimension (e.g., "relevance", "fluency") has its own distribution
88
- computed from the generated data.
89
-
90
- Attributes:
91
- scores: Mapping of each score dimension name to its list of score values.
92
- reasoning: Mapping of each score dimension name to its list of reasoning texts.
93
- distribution_types: Mapping of each score dimension name to its classification.
94
- distributions: Mapping of each score dimension name to its computed distribution statistics.
95
- histograms: Mapping of each score dimension name to its histogram data.
96
- """
97
-
98
- scores: dict[str, list[int | str]]
99
- reasoning: dict[str, list[str]]
100
- distribution_types: dict[str, ColumnDistributionType]
101
- distributions: dict[str, CategoricalDistribution | NumericalDistribution | MissingValue]
102
- histograms: dict[str, CategoricalHistogramData | MissingValue]
103
-
104
-
105
- class JudgeScoreSummary(BaseModel):
106
- """Container for an LLM-generated summary of a judge score dimension.
107
-
108
- Stores the natural language summary and sample data for a single score dimension
109
- generated by the judge score profiler. The summary is created by an LLM analyzing
110
- the distribution and patterns in the score-reasoning pairs.
111
-
112
- Attributes:
113
- score_name: Name of the score dimension being summarized (e.g., "relevance", "fluency").
114
- summary: LLM-generated natural language summary describing the scoring patterns,
115
- distribution characteristics, and notable trends for this score dimension.
116
- score_samples: List of score-reasoning pairs that were used to generate the summary.
117
- These are the examples of the scoring behavior that were used to generate the summary.
118
- """
119
-
120
- score_name: str
121
- summary: str
122
- score_samples: list[JudgeScoreSample]
123
-
124
-
125
- class JudgeScoreProfilerResults(ColumnProfilerResults):
126
- """Container for complete judge score profiler analysis results.
127
-
128
- Attributes:
129
- column_name: Name of the judge column that was profiled.
130
- summaries: Mapping of each score dimension name to its LLM-generated summary.
131
- score_distributions: Complete distribution analysis across all score dimensions.
132
- """
133
-
134
- column_name: str
135
- summaries: dict[str, JudgeScoreSummary]
136
- score_distributions: JudgeScoreDistributions | MissingValue
137
-
138
- def create_report_section(self) -> Panel:
139
- layout = Table.grid(Column(), expand=True, padding=(2, 0))
140
-
141
- for score_name in self.summaries.keys():
142
- layout.add_row(
143
- create_judge_score_summary_table(
144
- score_name=score_name,
145
- histogram=self.score_distributions.histograms[score_name],
146
- summary=self.summaries[score_name].summary,
147
- )
148
- )
149
-
150
- return Panel(
151
- layout,
152
- title=f"[{TITLE_STYLE}]LLM-as-a-Judge Score Profile: '{self.column_name}'[/{TITLE_STYLE}]",
153
- padding=(1, 2),
154
- )
155
-
156
-
157
- ColumnProfilerConfigT: TypeAlias = JudgeScoreProfilerConfig
158
-
159
- ColumnProfilerResultsT: TypeAlias = JudgeScoreProfilerResults