data-designer 0.3.8rc2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +1 -7
  5. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -129
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -51
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -335
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -212
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc2.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc2.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc2.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
data_designer/config/seed.py (deleted)
@@ -1,116 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from enum import Enum
-
- from pydantic import Field, model_validator
- from typing_extensions import Self
-
- from data_designer.config.base import ConfigBase
- from data_designer.config.seed_source_types import SeedSourceT
-
-
- class SamplingStrategy(str, Enum):
-     ORDERED = "ordered"
-     SHUFFLE = "shuffle"
-
-
- class IndexRange(ConfigBase):
-     start: int = Field(ge=0, description="The start index of the index range (inclusive)")
-     end: int = Field(ge=0, description="The end index of the index range (inclusive)")
-
-     @model_validator(mode="after")
-     def _validate_index_range(self) -> Self:
-         if self.start > self.end:
-             raise ValueError("'start' index must be less than or equal to 'end' index")
-         return self
-
-     @property
-     def size(self) -> int:
-         return self.end - self.start + 1
-
-
- class PartitionBlock(ConfigBase):
-     index: int = Field(default=0, ge=0, description="The index of the partition to sample from")
-     num_partitions: int = Field(default=1, ge=1, description="The total number of partitions in the dataset")
-
-     @model_validator(mode="after")
-     def _validate_partition_block(self) -> Self:
-         if self.index >= self.num_partitions:
-             raise ValueError("'index' must be less than 'num_partitions'")
-         return self
-
-     def to_index_range(self, dataset_size: int) -> IndexRange:
-         partition_size = dataset_size // self.num_partitions
-         start = self.index * partition_size
-
-         # For the last partition, extend to the end of the dataset to include remainder rows
-         if self.index == self.num_partitions - 1:
-             end = dataset_size - 1
-         else:
-             end = ((self.index + 1) * partition_size) - 1
-         return IndexRange(start=start, end=end)
-
-
- class SeedConfig(ConfigBase):
-     """Configuration for sampling data from a seed dataset.
-
-     Args:
-         source: A SeedSource defining where the seed data exists
-         sampling_strategy: Strategy for how to sample rows from the dataset.
-             - ORDERED: Read rows sequentially in their original order.
-             - SHUFFLE: Randomly shuffle rows before sampling. When used with
-               selection_strategy, shuffling occurs within the selected range/partition.
-         selection_strategy: Optional strategy to select a subset of the dataset.
-             - IndexRange: Select a specific range of indices (e.g., rows 100-200).
-             - PartitionBlock: Select a partition by splitting the dataset into N equal parts.
-               Partition indices are zero-based (index=0 is the first partition, index=1 is
-               the second, etc.).
-
-     Examples:
-         Read rows sequentially from start to end:
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.ORDERED
-             )
-
-         Read rows in random order:
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.SHUFFLE
-             )
-
-         Read specific index range (rows 100-199):
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.ORDERED,
-                 selection_strategy=IndexRange(start=100, end=199)
-             )
-
-         Read random rows from a specific index range (shuffles within rows 100-199):
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.SHUFFLE,
-                 selection_strategy=IndexRange(start=100, end=199)
-             )
-
-         Read from partition 2 (3rd partition, zero-based) of 5 partitions (20% of dataset):
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.ORDERED,
-                 selection_strategy=PartitionBlock(index=2, num_partitions=5)
-             )
-
-         Read shuffled rows from partition 0 of 10 partitions (shuffles within the partition):
-             SeedConfig(
-                 source=LocalFileSeedSource(path="my_data.parquet"),
-                 sampling_strategy=SamplingStrategy.SHUFFLE,
-                 selection_strategy=PartitionBlock(index=0, num_partitions=10)
-             )
-     """
-
-     source: SeedSourceT
-     sampling_strategy: SamplingStrategy = SamplingStrategy.ORDERED
-     selection_strategy: IndexRange | PartitionBlock | None = None
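
Note: the remainder handling in PartitionBlock.to_index_range is easiest to see with concrete numbers. Below is a minimal standalone sketch (plain Python, no data_designer imports; the helper name and the 103-row / 4-partition figures are hypothetical, chosen only to illustrate the removed arithmetic).

# Standalone sketch of the removed PartitionBlock.to_index_range arithmetic.
# partition_to_index_range is a hypothetical helper, not part of the data_designer API.
def partition_to_index_range(index: int, num_partitions: int, dataset_size: int) -> tuple[int, int]:
    partition_size = dataset_size // num_partitions
    start = index * partition_size
    # The last partition extends to the end of the dataset so remainder rows are not dropped.
    end = dataset_size - 1 if index == num_partitions - 1 else (index + 1) * partition_size - 1
    return start, end

for i in range(4):
    start, end = partition_to_index_range(i, 4, 103)
    print(f"partition {i}: rows {start}-{end} ({end - start + 1} rows)")

With 103 rows split into 4 partitions, the first three partitions cover 25 rows each and the last covers 28, so every row lands in exactly one partition.
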
data_designer/config/seed_source.py (deleted)
@@ -1,84 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from abc import ABC
- from typing import TYPE_CHECKING, Literal
-
- from pydantic import BaseModel, ConfigDict, Field, field_validator
- from pydantic.json_schema import SkipJsonSchema
- from typing_extensions import Self
-
- from data_designer.config.utils.io_helpers import (
-     VALID_DATASET_FILE_EXTENSIONS,
-     validate_dataset_file_path,
-     validate_path_contains_files_of_type,
- )
- from data_designer.lazy_heavy_imports import pd
-
- if TYPE_CHECKING:
-     import pandas as pd
-
-
- class SeedSource(BaseModel, ABC):
-     """Base class for seed dataset configurations.
-
-     All subclasses must define a `seed_type` field with a Literal value.
-     This serves as a discriminated union discriminator.
-     """
-
-     seed_type: str
-
-
- class LocalFileSeedSource(SeedSource):
-     seed_type: Literal["local"] = "local"
-
-     path: str
-
-     @field_validator("path", mode="after")
-     def validate_path(cls, v: str) -> str:
-         valid_wild_card_versions = {f"*{ext}" for ext in VALID_DATASET_FILE_EXTENSIONS}
-         if any(v.endswith(wildcard) for wildcard in valid_wild_card_versions):
-             parts = v.split("*.")
-             file_path = parts[0]
-             file_extension = parts[-1]
-             validate_path_contains_files_of_type(file_path, file_extension)
-         else:
-             validate_dataset_file_path(v)
-         return v
-
-     @classmethod
-     def from_dataframe(cls, df: pd.DataFrame, path: str) -> Self:
-         df.to_parquet(path, index=False)
-         return cls(path=path)
-
-
- class HuggingFaceSeedSource(SeedSource):
-     seed_type: Literal["hf"] = "hf"
-
-     path: str = Field(
-         ...,
-         description=(
-             "Path to the seed data in HuggingFace. Wildcards are allowed. Examples include "
-             "'datasets/my-username/my-dataset/data/000_00000.parquet', 'datasets/my-username/my-dataset/data/*.parquet', "
-             "and 'datasets/my-username/my-dataset/**/*.parquet'"
-         ),
-     )
-     token: str | None = None
-     endpoint: str = "https://huggingface.co"
-
-
- class DataFrameSeedSource(SeedSource):
-     seed_type: Literal["df"] = "df"
-
-     model_config = ConfigDict(arbitrary_types_allowed=True)
-
-     df: SkipJsonSchema[pd.DataFrame] = Field(
-         ...,
-         exclude=True,
-         description=(
-             "DataFrame to use directly as the seed dataset. NOTE: if you need to write a Data Designer config, "
-             "you must use `LocalFileSeedSource` instead, since DataFrame objects are not serializable."
-         ),
-     )
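
For context, here is a small usage sketch of the seed-source classes removed above, as they existed in 0.3.8rc2 (assumes that version is installed along with pandas and a parquet engine; the file name and DataFrame contents are made up for illustration).

# Usage sketch against the 0.3.8rc2 seed-source API removed in this release.
import pandas as pd

from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource

df = pd.DataFrame({"topic": ["billing", "shipping"], "locale": ["en_US", "en_GB"]})

# Keeps the frame in memory; the df field is excluded from serialization, so it
# cannot appear in a written-out Data Designer config.
in_memory = DataFrameSeedSource(df=df)

# Writes the frame to parquet first, so the path validator sees an existing dataset file.
on_disk = LocalFileSeedSource.from_dataframe(df, path="seed_data.parquet")
print(on_disk.path)
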
data_designer/config/seed_source_types.py (deleted)
@@ -1,19 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from typing import Annotated
-
- from pydantic import Field
- from typing_extensions import TypeAlias
-
- from data_designer.config.seed_source import DataFrameSeedSource, HuggingFaceSeedSource, LocalFileSeedSource
- from data_designer.plugin_manager import PluginManager
-
- plugin_manager = PluginManager()
-
- _SeedSourceT: TypeAlias = LocalFileSeedSource | HuggingFaceSeedSource | DataFrameSeedSource
- _SeedSourceT = plugin_manager.inject_into_seed_source_type_union(_SeedSourceT)
-
- SeedSourceT = Annotated[_SeedSourceT, Field(discriminator="seed_type")]
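
The SeedSourceT alias above is a pydantic discriminated union keyed on seed_type, which is what lets a serialized config carry any seed source type under a single field. A minimal standalone sketch of the same pattern, using toy models rather than the real data_designer classes:

# Standalone sketch of the discriminated-union pattern used above.
# LocalSource and HfSource are hypothetical stand-ins, not the data_designer classes.
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter

class LocalSource(BaseModel):
    seed_type: Literal["local"] = "local"
    path: str

class HfSource(BaseModel):
    seed_type: Literal["hf"] = "hf"
    path: str
    endpoint: str = "https://huggingface.co"

SourceT = Annotated[Union[LocalSource, HfSource], Field(discriminator="seed_type")]

# The "seed_type" value alone decides which model pydantic instantiates.
source = TypeAdapter(SourceT).validate_python({"seed_type": "hf", "path": "datasets/acme/demo/*.parquet"})
print(type(source).__name__)  # HfSource

Because each subclass pins seed_type to a distinct Literal, validation dispatches directly on that value instead of trying every member of the union.
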
data_designer/config/utils/code_lang.py (deleted)
@@ -1,82 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from enum import Enum
-
-
- class CodeLang(str, Enum):
-     GO = "go"
-     JAVASCRIPT = "javascript"
-     JAVA = "java"
-     KOTLIN = "kotlin"
-     PYTHON = "python"
-     RUBY = "ruby"
-     RUST = "rust"
-     SCALA = "scala"
-     SWIFT = "swift"
-     TYPESCRIPT = "typescript"
-     SQL_SQLITE = "sql:sqlite"
-     SQL_TSQL = "sql:tsql"
-     SQL_BIGQUERY = "sql:bigquery"
-     SQL_MYSQL = "sql:mysql"
-     SQL_POSTGRES = "sql:postgres"
-     SQL_ANSI = "sql:ansi"
-
-     @staticmethod
-     def parse(value: str | CodeLang) -> tuple[str, str | None]:
-         value = value.value if isinstance(value, CodeLang) else value
-         split_vals = value.split(":")
-         return (split_vals[0], split_vals[1] if len(split_vals) > 1 else None)
-
-     @staticmethod
-     def parse_lang(value: str | CodeLang) -> str:
-         return CodeLang.parse(value)[0]
-
-     @staticmethod
-     def parse_dialect(value: str | CodeLang) -> str | None:
-         return CodeLang.parse(value)[1]
-
-     @staticmethod
-     def supported_values() -> set[str]:
-         return {lang.value for lang in CodeLang}
-
-
- SQL_DIALECTS: set[CodeLang] = {
-     CodeLang.SQL_SQLITE,
-     CodeLang.SQL_TSQL,
-     CodeLang.SQL_BIGQUERY,
-     CodeLang.SQL_MYSQL,
-     CodeLang.SQL_POSTGRES,
-     CodeLang.SQL_ANSI,
- }
-
- ##########################################################
- # Helper functions
- ##########################################################
-
-
- def code_lang_to_syntax_lexer(code_lang: CodeLang | str) -> str:
-     """Convert the code language to a syntax lexer for Pygments.
-
-     Reference: https://pygments.org/docs/lexers/
-     """
-     code_lang_to_lexer = {
-         CodeLang.GO: "golang",
-         CodeLang.JAVASCRIPT: "javascript",
-         CodeLang.JAVA: "java",
-         CodeLang.KOTLIN: "kotlin",
-         CodeLang.PYTHON: "python",
-         CodeLang.RUBY: "ruby",
-         CodeLang.RUST: "rust",
-         CodeLang.SCALA: "scala",
-         CodeLang.SWIFT: "swift",
-         CodeLang.SQL_SQLITE: "sql",
-         CodeLang.SQL_ANSI: "sql",
-         CodeLang.SQL_TSQL: "tsql",
-         CodeLang.SQL_BIGQUERY: "sql",
-         CodeLang.SQL_MYSQL: "mysql",
-         CodeLang.SQL_POSTGRES: "postgres",
-     }
-     return code_lang_to_lexer.get(code_lang, code_lang)
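
A short usage sketch of the removed CodeLang helpers, assuming data-designer 0.3.8rc2 is installed; the expected outputs in the comments follow directly from the parse logic above.

# Usage sketch of the removed CodeLang helpers (0.3.8rc2 API).
from data_designer.config.utils.code_lang import CodeLang, code_lang_to_syntax_lexer

# "sql:<dialect>" values split into a base language and an optional dialect.
print(CodeLang.parse(CodeLang.SQL_POSTGRES))         # ('sql', 'postgres')
print(CodeLang.parse_lang("sql:postgres"))           # 'sql'
print(CodeLang.parse_dialect(CodeLang.PYTHON))       # None
print(code_lang_to_syntax_lexer(CodeLang.SQL_TSQL))  # 'tsql'
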
data_designer/config/utils/constants.py (deleted)
@@ -1,363 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- import os
- from enum import Enum
- from pathlib import Path
-
- from rich.theme import Theme
-
- DEFAULT_NUM_RECORDS = 10
-
- EPSILON = 1e-8
- REPORTING_PRECISION = 2
-
- DEFAULT_REPR_HTML_STYLE = "nord"
-
- REPR_HTML_FIXED_WIDTH = 1000
- REPR_HTML_TEMPLATE = """
- <meta charset="UTF-8">
- <style>
- {{css}}
-
- .code {{{{
- padding: 4px;
- border: 1px solid grey;
- border-radius: 4px;
- max-width: {fixed_width}px;
- width: 100%;
- display: inline-block;
- box-sizing: border-box;
- text-align: left;
- vertical-align: top;
- line-height: normal;
- overflow-x: auto;
- }}}}
-
- .code pre {{{{
- white-space: pre-wrap; /* CSS 3 */
- white-space: -moz-pre-wrap; /* Mozilla, since 1999 */
- white-space: -pre-wrap; /* Opera 4-6 */
- white-space: -o-pre-wrap; /* Opera 7 */
- word-wrap: break-word;
- overflow-wrap: break-word;
- margin: 0;
- }}}}
- </style>
- {{highlighted_html}}
- """.format(fixed_width=REPR_HTML_FIXED_WIDTH)
-
-
- class NordColor(Enum):
-     NORD0 = "#2E3440" # Darkest gray (background)
-     NORD1 = "#3B4252" # Dark gray
-     NORD2 = "#434C5E" # Medium dark gray
-     NORD3 = "#4C566A" # Lighter dark gray
-     NORD4 = "#D8DEE9" # Light gray (default text)
-     NORD5 = "#E5E9F0" # Very light gray
-     NORD6 = "#ECEFF4" # Almost white
-     NORD7 = "#8FBCBB" # Teal
-     NORD8 = "#88C0D0" # Light cyan
-     NORD9 = "#81A1C1" # Soft blue
-     NORD10 = "#5E81AC" # Darker blue
-     NORD11 = "#BF616A" # Red
-     NORD12 = "#D08770" # Orange
-     NORD13 = "#EBCB8B" # Yellow
-     NORD14 = "#A3BE8C" # Green
-     NORD15 = "#B48EAD" # Purple
-
-
- RICH_CONSOLE_THEME = Theme(
-     {
-         "repr.number": NordColor.NORD15.value, # Purple for numbers
-         "repr.string": NordColor.NORD14.value, # Green for strings
-         "repr.bool_true": NordColor.NORD9.value, # Blue for True
-         "repr.bool_false": NordColor.NORD9.value, # Blue for False
-         "repr.none": NordColor.NORD11.value, # Red for None
-         "repr.brace": NordColor.NORD7.value, # Teal for brackets/braces
-         "repr.comma": NordColor.NORD7.value, # Teal for commas
-         "repr.ellipsis": NordColor.NORD7.value, # Teal for ellipsis
-         "repr.attrib_name": NordColor.NORD3.value, # Light gray for dict keys
-         "repr.attrib_equal": NordColor.NORD7.value, # Teal for equals signs
-         "repr.call": NordColor.NORD10.value, # Darker blue for function calls
-         "repr.function_name": NordColor.NORD10.value, # Darker blue for function names
-         "repr.class_name": NordColor.NORD12.value, # Orange for class names
-         "repr.module_name": NordColor.NORD8.value, # Light cyan for module names
-         "repr.error": NordColor.NORD11.value, # Red for errors
-         "repr.warning": NordColor.NORD13.value, # Yellow for warnings
-     }
- )
-
- DEFAULT_HIST_NAME_COLOR = "medium_purple1"
-
- DEFAULT_HIST_VALUE_COLOR = "pale_green3"
-
-
- DEFAULT_AGE_RANGE = [18, 114]
- MIN_AGE = 0
- MAX_AGE = 114
-
- US_STATES_AND_MAJOR_TERRITORIES = {
-     # States
-     "AK",
-     "AL",
-     "AR",
-     "AZ",
-     "CA",
-     "CO",
-     "CT",
-     "DE",
-     "FL",
-     "GA",
-     "HI",
-     "IA",
-     "ID",
-     "IL",
-     "IN",
-     "KS",
-     "KY",
-     "LA",
-     "MA",
-     "MD",
-     "ME",
-     "MI",
-     "MN",
-     "MO",
-     "MS",
-     "MT",
-     "NC",
-     "ND",
-     "NE",
-     "NH",
-     "NJ",
-     "NM",
-     "NV",
-     "NY",
-     "OH",
-     "OK",
-     "OR",
-     "PA",
-     "RI",
-     "SC",
-     "SD",
-     "TN",
-     "TX",
-     "UT",
-     "VA",
-     "VT",
-     "WA",
-     "WI",
-     "WV",
-     "WY",
-     # D.C.
-     "DC",
-     # Territories
-     "AS",
-     "GU",
-     "MP",
-     "PR",
-     "VI",
- }
-
- MAX_TEMPERATURE = 2.0
- MIN_TEMPERATURE = 0.0
- MAX_TOP_P = 1.0
- MIN_TOP_P = 0.0
- MIN_MAX_TOKENS = 1
- REASONING_TRACE_COLUMN_POSTFIX = "__reasoning_trace"
-
- AVAILABLE_LOCALES = [
-     "ar_AA",
-     "ar_AE",
-     "ar_BH",
-     "ar_EG",
-     "ar_JO",
-     "ar_PS",
-     "ar_SA",
-     "az_AZ",
-     "bg_BG",
-     "bn_BD",
-     "bs_BA",
-     "cs_CZ",
-     "da_DK",
-     "de",
-     "de_AT",
-     "de_CH",
-     "de_DE",
-     "dk_DK",
-     "el_CY",
-     "el_GR",
-     "en",
-     "en_AU",
-     "en_BD",
-     "en_CA",
-     "en_GB",
-     "en_IE",
-     "en_IN",
-     "en_NZ",
-     "en_PH",
-     "en_TH",
-     "en_US",
-     "es",
-     "es_AR",
-     "es_CA",
-     "es_CL",
-     "es_CO",
-     "es_ES",
-     "es_MX",
-     "et_EE",
-     "fa_IR",
-     "fi_FI",
-     "fil_PH",
-     "fr_BE",
-     "fr_CA",
-     "fr_CH",
-     "fr_FR",
-     # "fr_QC", deprecated, use fr_CA instead
-     "ga_IE",
-     "he_IL",
-     "hi_IN",
-     "hr_HR",
-     "hu_HU",
-     "hy_AM",
-     "id_ID",
-     "it_CH",
-     "it_IT",
-     "ja_JP",
-     "ka_GE",
-     "ko_KR",
-     "la",
-     "lb_LU",
-     "lt_LT",
-     "lv_LV",
-     "mt_MT",
-     "ne_NP",
-     "nl_BE",
-     "nl_NL",
-     "no_NO",
-     "or_IN",
-     "pl_PL",
-     "pt_BR",
-     "pt_PT",
-     "ro_RO",
-     "ru_RU",
-     "sk_SK",
-     "sl_SI",
-     "sq_AL",
-     "sv_SE",
-     "ta_IN",
-     "th",
-     "th_TH",
-     "tl_PH",
-     "tr_TR",
-     "tw_GH",
-     "uk_UA",
-     "vi_VN",
-     "zh_CN",
-     "zh_TW",
-     "zu_ZA",
- ]
-
- DATA_DESIGNER_HOME_ENV_VAR = "DATA_DESIGNER_HOME"
-
- DATA_DESIGNER_HOME = Path(os.getenv(DATA_DESIGNER_HOME_ENV_VAR, Path.home() / ".data-designer"))
-
- MANAGED_ASSETS_PATH_ENV_VAR = "DATA_DESIGNER_MANAGED_ASSETS_PATH"
-
- MANAGED_ASSETS_PATH = Path(os.getenv(MANAGED_ASSETS_PATH_ENV_VAR, DATA_DESIGNER_HOME / "managed-assets"))
-
- MODEL_CONFIGS_FILE_NAME = "model_configs.yaml"
-
- MODEL_CONFIGS_FILE_PATH = DATA_DESIGNER_HOME / MODEL_CONFIGS_FILE_NAME
-
- MODEL_PROVIDERS_FILE_NAME = "model_providers.yaml"
-
- MODEL_PROVIDERS_FILE_PATH = DATA_DESIGNER_HOME / MODEL_PROVIDERS_FILE_NAME
-
- NVIDIA_PROVIDER_NAME = "nvidia"
-
- NVIDIA_API_KEY_ENV_VAR_NAME = "NVIDIA_API_KEY"
-
- OPENAI_PROVIDER_NAME = "openai"
-
- OPENAI_API_KEY_ENV_VAR_NAME = "OPENAI_API_KEY"
-
- OPENROUTER_PROVIDER_NAME = "openrouter"
-
- OPENROUTER_API_KEY_ENV_VAR_NAME = "OPENROUTER_API_KEY"
-
- PREDEFINED_PROVIDERS = [
-     {
-         "name": NVIDIA_PROVIDER_NAME,
-         "endpoint": "https://integrate.api.nvidia.com/v1",
-         "provider_type": "openai",
-         "api_key": NVIDIA_API_KEY_ENV_VAR_NAME,
-     },
-     {
-         "name": OPENAI_PROVIDER_NAME,
-         "endpoint": "https://api.openai.com/v1",
-         "provider_type": "openai",
-         "api_key": OPENAI_API_KEY_ENV_VAR_NAME,
-     },
-     {
-         "name": OPENROUTER_PROVIDER_NAME,
-         "endpoint": "https://openrouter.ai/api/v1",
-         "provider_type": "openai",
-         "api_key": OPENROUTER_API_KEY_ENV_VAR_NAME,
-     },
- ]
-
-
- DEFAULT_TEXT_INFERENCE_PARAMS = {"temperature": 0.85, "top_p": 0.95}
- DEFAULT_REASONING_INFERENCE_PARAMS = {"temperature": 0.35, "top_p": 0.95}
- DEFAULT_VISION_INFERENCE_PARAMS = {"temperature": 0.85, "top_p": 0.95}
- DEFAULT_EMBEDDING_INFERENCE_PARAMS = {"encoding_format": "float"}
- NEMOTRON_3_NANO_30B_A3B_INFERENCE_PARAMS = {"temperature": 1.0, "top_p": 1.0}
-
- PREDEFINED_PROVIDERS_MODEL_MAP = {
-     NVIDIA_PROVIDER_NAME: {
-         "text": {
-             "model": "nvidia/nemotron-3-nano-30b-a3b",
-             "inference_parameters": NEMOTRON_3_NANO_30B_A3B_INFERENCE_PARAMS,
-         },
-         "reasoning": {"model": "openai/gpt-oss-20b", "inference_parameters": DEFAULT_REASONING_INFERENCE_PARAMS},
-         "vision": {"model": "nvidia/nemotron-nano-12b-v2-vl", "inference_parameters": DEFAULT_VISION_INFERENCE_PARAMS},
-         "embedding": {
-             "model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
-             "inference_parameters": DEFAULT_EMBEDDING_INFERENCE_PARAMS | {"extra_body": {"input_type": "query"}},
-         },
-     },
-     OPENAI_PROVIDER_NAME: {
-         "text": {"model": "gpt-4.1", "inference_parameters": DEFAULT_TEXT_INFERENCE_PARAMS},
-         "reasoning": {"model": "gpt-5", "inference_parameters": DEFAULT_REASONING_INFERENCE_PARAMS},
-         "vision": {"model": "gpt-5", "inference_parameters": DEFAULT_VISION_INFERENCE_PARAMS},
-         "embedding": {"model": "text-embedding-3-large", "inference_parameters": DEFAULT_EMBEDDING_INFERENCE_PARAMS},
-     },
-     OPENROUTER_PROVIDER_NAME: {
-         "text": {
-             "model": "nvidia/nemotron-3-nano-30b-a3b",
-             "inference_parameters": NEMOTRON_3_NANO_30B_A3B_INFERENCE_PARAMS,
-         },
-         "reasoning": {"model": "openai/gpt-oss-20b", "inference_parameters": DEFAULT_REASONING_INFERENCE_PARAMS},
-         "vision": {"model": "nvidia/nemotron-nano-12b-v2-vl", "inference_parameters": DEFAULT_VISION_INFERENCE_PARAMS},
-         "embedding": {
-             "model": "openai/text-embedding-3-large",
-             "inference_parameters": DEFAULT_EMBEDDING_INFERENCE_PARAMS,
-         },
-     },
- }
-
- # Persona locale metadata - used by the CLI and the person sampler.
- NEMOTRON_PERSONAS_DATASET_SIZES = {
-     "en_US": "1.24 GB",
-     "en_IN": "2.39 GB",
-     "hi_Deva_IN": "4.14 GB",
-     "hi_Latn_IN": "2.7 GB",
-     "ja_JP": "1.69 GB",
- }
-
- LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())
-
- NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-"
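
To illustrate how these defaults were consumed, a small sketch that reads the provider/model defaults and the home-directory environment variable removed above (assumes data-designer 0.3.8rc2 is installed; the lookup shown is illustrative and is not the library's own resolution code).

# Sketch of how the 0.3.8rc2 default-model constants resolve.
import os

from data_designer.config.utils.constants import (
    DATA_DESIGNER_HOME_ENV_VAR,
    PREDEFINED_PROVIDERS_MODEL_MAP,
)

# Each predefined provider maps model aliases ("text", "reasoning", "vision", "embedding") to defaults.
defaults = PREDEFINED_PROVIDERS_MODEL_MAP["openai"]["reasoning"]
print(defaults["model"], defaults["inference_parameters"])  # gpt-5 {'temperature': 0.35, 'top_p': 0.95}

# DATA_DESIGNER_HOME honors this environment variable and falls back to ~/.data-designer.
print(os.getenv(DATA_DESIGNER_HOME_ENV_VAR, os.path.expanduser("~/.data-designer")))
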
data_designer/config/utils/errors.py (deleted)
@@ -1,21 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
-
- from __future__ import annotations
-
- from data_designer.errors import DataDesignerError
-
-
- class UserJinjaTemplateSyntaxError(DataDesignerError): ...
-
-
- class InvalidEnumValueError(DataDesignerError): ...
-
-
- class InvalidTypeUnionError(DataDesignerError): ...
-
-
- class InvalidDiscriminatorFieldError(DataDesignerError): ...
-
-
- class DatasetSampleDisplayError(DataDesignerError): ...