data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. data_designer/__init__.py +15 -0
  2. data_designer/_version.py +34 -0
  3. data_designer/cli/README.md +236 -0
  4. data_designer/cli/__init__.py +6 -0
  5. data_designer/cli/commands/__init__.py +2 -0
  6. data_designer/cli/commands/list.py +130 -0
  7. data_designer/cli/commands/models.py +10 -0
  8. data_designer/cli/commands/providers.py +11 -0
  9. data_designer/cli/commands/reset.py +100 -0
  10. data_designer/cli/controllers/__init__.py +7 -0
  11. data_designer/cli/controllers/model_controller.py +246 -0
  12. data_designer/cli/controllers/provider_controller.py +317 -0
  13. data_designer/cli/forms/__init__.py +20 -0
  14. data_designer/cli/forms/builder.py +51 -0
  15. data_designer/cli/forms/field.py +180 -0
  16. data_designer/cli/forms/form.py +59 -0
  17. data_designer/cli/forms/model_builder.py +125 -0
  18. data_designer/cli/forms/provider_builder.py +76 -0
  19. data_designer/cli/main.py +44 -0
  20. data_designer/cli/repositories/__init__.py +8 -0
  21. data_designer/cli/repositories/base.py +39 -0
  22. data_designer/cli/repositories/model_repository.py +42 -0
  23. data_designer/cli/repositories/provider_repository.py +43 -0
  24. data_designer/cli/services/__init__.py +7 -0
  25. data_designer/cli/services/model_service.py +116 -0
  26. data_designer/cli/services/provider_service.py +111 -0
  27. data_designer/cli/ui.py +448 -0
  28. data_designer/cli/utils.py +47 -0
  29. data_designer/config/__init__.py +2 -0
  30. data_designer/config/analysis/column_profilers.py +89 -0
  31. data_designer/config/analysis/column_statistics.py +274 -0
  32. data_designer/config/analysis/dataset_profiler.py +60 -0
  33. data_designer/config/analysis/utils/errors.py +8 -0
  34. data_designer/config/analysis/utils/reporting.py +188 -0
  35. data_designer/config/base.py +68 -0
  36. data_designer/config/column_configs.py +354 -0
  37. data_designer/config/column_types.py +168 -0
  38. data_designer/config/config_builder.py +660 -0
  39. data_designer/config/data_designer_config.py +40 -0
  40. data_designer/config/dataset_builders.py +11 -0
  41. data_designer/config/datastore.py +151 -0
  42. data_designer/config/default_model_settings.py +123 -0
  43. data_designer/config/errors.py +19 -0
  44. data_designer/config/interface.py +54 -0
  45. data_designer/config/models.py +231 -0
  46. data_designer/config/preview_results.py +32 -0
  47. data_designer/config/processors.py +41 -0
  48. data_designer/config/sampler_constraints.py +51 -0
  49. data_designer/config/sampler_params.py +604 -0
  50. data_designer/config/seed.py +145 -0
  51. data_designer/config/utils/code_lang.py +83 -0
  52. data_designer/config/utils/constants.py +313 -0
  53. data_designer/config/utils/errors.py +19 -0
  54. data_designer/config/utils/info.py +88 -0
  55. data_designer/config/utils/io_helpers.py +273 -0
  56. data_designer/config/utils/misc.py +81 -0
  57. data_designer/config/utils/numerical_helpers.py +28 -0
  58. data_designer/config/utils/type_helpers.py +100 -0
  59. data_designer/config/utils/validation.py +336 -0
  60. data_designer/config/utils/visualization.py +427 -0
  61. data_designer/config/validator_params.py +96 -0
  62. data_designer/engine/__init__.py +2 -0
  63. data_designer/engine/analysis/column_profilers/base.py +55 -0
  64. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
  65. data_designer/engine/analysis/column_profilers/registry.py +20 -0
  66. data_designer/engine/analysis/column_statistics.py +142 -0
  67. data_designer/engine/analysis/dataset_profiler.py +125 -0
  68. data_designer/engine/analysis/errors.py +7 -0
  69. data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
  70. data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
  71. data_designer/engine/column_generators/__init__.py +2 -0
  72. data_designer/engine/column_generators/generators/__init__.py +2 -0
  73. data_designer/engine/column_generators/generators/base.py +61 -0
  74. data_designer/engine/column_generators/generators/expression.py +63 -0
  75. data_designer/engine/column_generators/generators/llm_generators.py +172 -0
  76. data_designer/engine/column_generators/generators/samplers.py +75 -0
  77. data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
  78. data_designer/engine/column_generators/generators/validation.py +147 -0
  79. data_designer/engine/column_generators/registry.py +56 -0
  80. data_designer/engine/column_generators/utils/errors.py +13 -0
  81. data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
  82. data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
  83. data_designer/engine/configurable_task.py +82 -0
  84. data_designer/engine/dataset_builders/artifact_storage.py +181 -0
  85. data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
  86. data_designer/engine/dataset_builders/errors.py +13 -0
  87. data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
  88. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  89. data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
  90. data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
  91. data_designer/engine/dataset_builders/utils/dag.py +56 -0
  92. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
  93. data_designer/engine/dataset_builders/utils/errors.py +13 -0
  94. data_designer/engine/errors.py +49 -0
  95. data_designer/engine/model_provider.py +75 -0
  96. data_designer/engine/models/__init__.py +2 -0
  97. data_designer/engine/models/errors.py +308 -0
  98. data_designer/engine/models/facade.py +225 -0
  99. data_designer/engine/models/litellm_overrides.py +162 -0
  100. data_designer/engine/models/parsers/__init__.py +2 -0
  101. data_designer/engine/models/parsers/errors.py +34 -0
  102. data_designer/engine/models/parsers/parser.py +236 -0
  103. data_designer/engine/models/parsers/postprocessors.py +93 -0
  104. data_designer/engine/models/parsers/tag_parsers.py +60 -0
  105. data_designer/engine/models/parsers/types.py +82 -0
  106. data_designer/engine/models/recipes/base.py +79 -0
  107. data_designer/engine/models/recipes/response_recipes.py +291 -0
  108. data_designer/engine/models/registry.py +118 -0
  109. data_designer/engine/models/usage.py +75 -0
  110. data_designer/engine/models/utils.py +38 -0
  111. data_designer/engine/processing/ginja/__init__.py +2 -0
  112. data_designer/engine/processing/ginja/ast.py +64 -0
  113. data_designer/engine/processing/ginja/environment.py +461 -0
  114. data_designer/engine/processing/ginja/exceptions.py +54 -0
  115. data_designer/engine/processing/ginja/record.py +30 -0
  116. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  117. data_designer/engine/processing/gsonschema/exceptions.py +8 -0
  118. data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
  119. data_designer/engine/processing/gsonschema/types.py +8 -0
  120. data_designer/engine/processing/gsonschema/validators.py +143 -0
  121. data_designer/engine/processing/processors/base.py +15 -0
  122. data_designer/engine/processing/processors/drop_columns.py +46 -0
  123. data_designer/engine/processing/processors/registry.py +20 -0
  124. data_designer/engine/processing/utils.py +120 -0
  125. data_designer/engine/registry/base.py +97 -0
  126. data_designer/engine/registry/data_designer_registry.py +37 -0
  127. data_designer/engine/registry/errors.py +10 -0
  128. data_designer/engine/resources/managed_dataset_generator.py +35 -0
  129. data_designer/engine/resources/managed_dataset_repository.py +194 -0
  130. data_designer/engine/resources/managed_storage.py +63 -0
  131. data_designer/engine/resources/resource_provider.py +46 -0
  132. data_designer/engine/resources/seed_dataset_data_store.py +66 -0
  133. data_designer/engine/sampling_gen/column.py +89 -0
  134. data_designer/engine/sampling_gen/constraints.py +95 -0
  135. data_designer/engine/sampling_gen/data_sources/base.py +214 -0
  136. data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
  137. data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
  138. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  139. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  140. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
  141. data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
  142. data_designer/engine/sampling_gen/entities/errors.py +8 -0
  143. data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
  144. data_designer/engine/sampling_gen/entities/person.py +142 -0
  145. data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
  146. data_designer/engine/sampling_gen/errors.py +24 -0
  147. data_designer/engine/sampling_gen/generator.py +121 -0
  148. data_designer/engine/sampling_gen/jinja_utils.py +60 -0
  149. data_designer/engine/sampling_gen/people_gen.py +203 -0
  150. data_designer/engine/sampling_gen/person_constants.py +54 -0
  151. data_designer/engine/sampling_gen/schema.py +143 -0
  152. data_designer/engine/sampling_gen/schema_builder.py +59 -0
  153. data_designer/engine/sampling_gen/utils.py +40 -0
  154. data_designer/engine/secret_resolver.py +80 -0
  155. data_designer/engine/validators/__init__.py +17 -0
  156. data_designer/engine/validators/base.py +36 -0
  157. data_designer/engine/validators/local_callable.py +34 -0
  158. data_designer/engine/validators/python.py +245 -0
  159. data_designer/engine/validators/remote.py +83 -0
  160. data_designer/engine/validators/sql.py +60 -0
  161. data_designer/errors.py +5 -0
  162. data_designer/essentials/__init__.py +137 -0
  163. data_designer/interface/__init__.py +2 -0
  164. data_designer/interface/data_designer.py +351 -0
  165. data_designer/interface/errors.py +16 -0
  166. data_designer/interface/results.py +55 -0
  167. data_designer/logging.py +161 -0
  168. data_designer/plugin_manager.py +83 -0
  169. data_designer/plugins/__init__.py +6 -0
  170. data_designer/plugins/errors.py +10 -0
  171. data_designer/plugins/plugin.py +69 -0
  172. data_designer/plugins/registry.py +86 -0
  173. data_designer-0.1.0.dist-info/METADATA +173 -0
  174. data_designer-0.1.0.dist-info/RECORD +177 -0
  175. data_designer-0.1.0.dist-info/WHEEL +4 -0
  176. data_designer-0.1.0.dist-info/entry_points.txt +2 -0
  177. data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,118 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+
8
+ from data_designer.config.models import ModelConfig
9
+ from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry
10
+ from data_designer.engine.models.facade import ModelFacade
11
+ from data_designer.engine.models.litellm_overrides import apply_litellm_patches
12
+ from data_designer.engine.secret_resolver import SecretResolver
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class ModelRegistry:
    """Registry of model configurations and lazily created model facades.

    Configurations are indexed by their ``alias``. A ``ModelFacade`` is built
    the first time an alias is requested through ``get_model`` and is cached
    for subsequent requests.
    """

    def __init__(
        self,
        *,
        secret_resolver: SecretResolver,
        model_provider_registry: ModelProviderRegistry,
        model_configs: list[ModelConfig] | None = None,
    ):
        """Store the collaborators and index the initial configurations.

        Args:
            secret_resolver: Passed through to every ``ModelFacade`` created
                by this registry.
            model_provider_registry: Used to look up providers and passed
                through to every ``ModelFacade`` created by this registry.
            model_configs: Optional initial configurations; ``None`` is
                treated as an empty list.
        """
        self._secret_resolver = secret_resolver
        self._model_provider_registry = model_provider_registry
        self._model_configs: dict[str, ModelConfig] = {}
        self._models: dict[str, ModelFacade] = {}
        self._set_model_configs(model_configs)

    @property
    def model_configs(self) -> dict[str, ModelConfig]:
        """Mapping of model alias to its registered configuration."""
        return self._model_configs

    @property
    def models(self) -> dict[str, ModelFacade]:
        """Mapping of model alias to the facades created so far."""
        return self._models

    def register_model_configs(self, model_configs: list[ModelConfig]) -> None:
        """Register new model configurations at runtime.

        Args:
            model_configs: Model configurations to register. If a
                configuration with the same alias already exists in the
                registry, it is overwritten.
        """
        # A facade cached for an alias whose configuration is being replaced
        # was built from the old configuration. Drop it so the next
        # get_model() call rebuilds it from the new one. Facades whose
        # configuration is unchanged are kept (along with their usage stats).
        for new_config in model_configs:
            if self._model_configs.get(new_config.alias) != new_config:
                self._models.pop(new_config.alias, None)
        self._set_model_configs(list(self._model_configs.values()) + model_configs)

    def get_model(self, *, model_alias: str) -> ModelFacade:
        """Return the facade for ``model_alias``, creating it on first use.

        Raises:
            ValueError: If no configuration is registered under the alias.
        """
        # Check if model config exists first
        if model_alias not in self._model_configs:
            raise ValueError(f"No model config with alias {model_alias!r} found!")

        # Lazy initialization: only create model facade when first requested
        if model_alias not in self._models:
            self._models[model_alias] = self._get_model(self._model_configs[model_alias])

        return self._models[model_alias]

    def get_model_config(self, *, model_alias: str) -> ModelConfig:
        """Return the configuration registered under ``model_alias``.

        Raises:
            ValueError: If no configuration is registered under the alias.
        """
        if model_alias not in self._model_configs:
            raise ValueError(f"No model config with alias {model_alias!r} found!")
        return self._model_configs[model_alias]

    def get_model_usage_stats(self, total_time_elapsed: float) -> dict[str, dict]:
        """Collect usage statistics from the facades created so far.

        Only facades whose ``usage_stats.has_usage`` is truthy are included.
        The result is keyed by each facade's ``model_name``.
        """
        return {
            model.model_name: model.usage_stats.get_usage_stats(total_time_elapsed=total_time_elapsed)
            for model in self._models.values()
            if model.usage_stats.has_usage
        }

    def get_model_provider(self, *, model_alias: str) -> ModelProvider:
        """Return the provider referenced by the configuration of ``model_alias``.

        Raises:
            ValueError: If no configuration is registered under the alias.
        """
        model_config = self.get_model_config(model_alias=model_alias)
        return self._model_provider_registry.get_provider(model_config.provider)

    def run_health_check(self, model_aliases: set[str]) -> None:
        """Send a minimal generation request to each model alias.

        The first failure is logged and re-raised, so aliases after the
        failing one are not checked.

        Raises:
            ValueError: If an alias has no registered configuration.
            Exception: Whatever the failing ``generate`` call raised.
        """
        logger.info("🩺 Running health checks for models...")
        for model_alias in model_aliases:
            model = self.get_model(model_alias=model_alias)
            logger.info(
                f" |-- 👀 Checking {model.model_name!r} in provider named {model.model_provider_name!r} for model alias {model.model_alias!r}..."
            )
            try:
                model.generate(
                    prompt="Hello!",
                    parser=lambda x: x,
                    system_prompt="You are a helpful assistant.",
                    max_correction_steps=0,
                    max_conversation_restarts=0,
                    skip_usage_tracking=True,
                    purpose="running health checks",
                )
                logger.info(" |-- ✅ Passed!")
            except Exception:
                logger.error(" |-- ❌ Failed!")
                # Bare raise keeps the original traceback intact.
                raise

    def _set_model_configs(self, model_configs: list[ModelConfig] | None) -> None:
        """Replace the alias-to-config index; later duplicates of an alias win."""
        model_configs = model_configs or []
        self._model_configs = {mc.alias: mc for mc in model_configs}
        # Models are now lazily initialized in get_model() when first requested

    def _get_model(self, model_config: ModelConfig) -> ModelFacade:
        """Build a new facade for ``model_config``."""
        return ModelFacade(model_config, self._secret_resolver, self._model_provider_registry)
105
+
106
+
107
def create_model_registry(
    *,
    model_configs: list[ModelConfig] | None = None,
    secret_resolver: SecretResolver,
    model_provider_registry: ModelProviderRegistry,
) -> ModelRegistry:
    """Create a ``ModelRegistry`` after calling ``apply_litellm_patches``.

    Args:
        model_configs: Optional initial model configurations.
        secret_resolver: Forwarded to the registry.
        model_provider_registry: Forwarded to the registry.

    Returns:
        The newly constructed registry.
    """
    # The patches are applied before the registry is constructed.
    apply_litellm_patches()
    registry = ModelRegistry(
        secret_resolver=secret_resolver,
        model_provider_registry=model_provider_registry,
        model_configs=model_configs,
    )
    return registry
@@ -0,0 +1,75 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+
8
+ from pydantic import BaseModel, computed_field
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class TokenUsageStats(BaseModel):
    """Running totals of prompt and completion tokens."""

    prompt_tokens: int = 0
    completion_tokens: int = 0

    # Computed field: prompt tokens plus completion tokens.
    @computed_field
    @property
    def total_tokens(self) -> int:
        return self.prompt_tokens + self.completion_tokens

    @property
    def has_usage(self) -> bool:
        """Whether any tokens have been recorded."""
        recorded = self.total_tokens
        return recorded > 0

    def extend(self, *, prompt_tokens: int, completion_tokens: int) -> None:
        """Add the given token counts to the running totals."""
        self.prompt_tokens = self.prompt_tokens + prompt_tokens
        self.completion_tokens = self.completion_tokens + completion_tokens
28
+
29
+
30
class RequestUsageStats(BaseModel):
    """Running totals of successful and failed requests."""

    successful_requests: int = 0
    failed_requests: int = 0

    # Computed field: successful requests plus failed requests.
    @computed_field
    @property
    def total_requests(self) -> int:
        return self.successful_requests + self.failed_requests

    @property
    def has_usage(self) -> bool:
        """Whether any requests have been recorded."""
        recorded = self.total_requests
        return recorded > 0

    def extend(self, *, successful_requests: int, failed_requests: int) -> None:
        """Add the given request counts to the running totals."""
        self.successful_requests = self.successful_requests + successful_requests
        self.failed_requests = self.failed_requests + failed_requests
45
+
46
+
47
class ModelUsageStats(BaseModel):
    """Token and request usage totals for a model."""

    token_usage: TokenUsageStats = TokenUsageStats()
    request_usage: RequestUsageStats = RequestUsageStats()

    @property
    def has_usage(self) -> bool:
        """True only when both token usage and request usage are non-zero."""
        if not self.token_usage.has_usage:
            return False
        return self.request_usage.has_usage

    def extend(
        self, *, token_usage: TokenUsageStats | None = None, request_usage: RequestUsageStats | None = None
    ) -> None:
        """Add the counts of the given stats objects to the running totals.

        Args:
            token_usage: Token counts to add; skipped when ``None``.
            request_usage: Request counts to add; skipped when ``None``.
        """
        if token_usage is not None:
            self.token_usage.extend(
                prompt_tokens=token_usage.prompt_tokens,
                completion_tokens=token_usage.completion_tokens,
            )
        if request_usage is not None:
            self.request_usage.extend(
                successful_requests=request_usage.successful_requests,
                failed_requests=request_usage.failed_requests,
            )

    def get_usage_stats(self, *, total_time_elapsed: float) -> dict:
        """Return the dumped model plus throughput figures.

        Args:
            total_time_elapsed: Elapsed time used as the rate denominator;
                the per-minute figure multiplies by 60, so this is
                presumably seconds.

        Returns:
            ``model_dump()`` extended with ``tokens_per_second`` and
            ``requests_per_minute``, both truncated to ``int``. Both are 0
            when ``total_time_elapsed`` is not positive.
        """
        stats = self.model_dump()
        if total_time_elapsed > 0:
            tokens_per_second = int(self.token_usage.total_tokens / total_time_elapsed)
            requests_per_minute = int(self.request_usage.total_requests / total_time_elapsed * 60)
        else:
            # Avoid dividing by zero (or a negative duration).
            tokens_per_second = 0
            requests_per_minute = 0
        return {
            **stats,
            "tokens_per_second": tokens_per_second,
            "requests_per_minute": requests_per_minute,
        }
@@ -0,0 +1,38 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Any
7
+
8
+
9
def prompt_to_messages(
    *,
    user_prompt: str,
    system_prompt: str | None = None,
    multi_modal_context: list[dict[str, Any]] | None = None,
) -> list[dict[str, str | list[dict]]]:
    """Build a list of chat messages from a user prompt and optional extras.

    Args:
        user_prompt (str): A user prompt.
        system_prompt (str, optional): An optional system prompt. When it is
            truthy, a "system" message is placed before the user message.
        multi_modal_context (list[dict], optional): Extra content entries.
            When non-empty, the user message content becomes a list holding
            a ``{"type": "text", "text": user_prompt}`` entry followed by
            these entries; otherwise the content is the plain prompt string.

    Returns:
        The messages in order: the optional system message, then the user
        message.
    """
    if multi_modal_context:
        user_content = [{"type": "text", "text": user_prompt}, *multi_modal_context]
    else:
        user_content = user_prompt

    messages = []
    if system_prompt:
        messages.append(str_to_message(content=system_prompt, role="system"))
    messages.append(str_to_message(content=user_content, role="user"))
    return messages
35
+
36
+
37
def str_to_message(content: str | list[dict], role: str = "user") -> dict[str, str | list[dict]]:
    """Wrap ``content`` and ``role`` into a single message dict."""
    return dict(content=content, role=role)
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,64 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from collections import deque
5
+ from typing import Optional, Type
6
+
7
+ from jinja2 import nodes as j_nodes
8
+
9
+
10
def ast_max_depth(node: j_nodes.Node) -> int:
    """Compute the maximum depth of the Jinja AST rooted at ``node``.

    The starting node counts as depth 1, so a node without children yields 1.

    Args:
        node (jinja2.nodes.Node): The starting Jinja2 AST node

    Returns:
        int: The maximum depth of the tree
    """
    depth = 0
    level = [node]

    # Walk the tree one level at a time; each non-empty level adds one to
    # the depth.
    while level:
        depth += 1
        level = [child for parent in level for child in parent.iter_child_nodes()]

    return depth
34
+
35
+
36
def ast_descendant_count(ast: j_nodes.Node, only_type: Optional[Type[j_nodes.Node]] = None) -> int:
    """Count the nodes that ``ast.find_all`` yields for the requested type.

    Args:
        ast (jinja2.nodes.Node): The starting Jinja2 AST node
        only_type (Type[jinja2.nodes.Node]): If specified, then only
            nodes of this type will be counted. Defaults to
            ``jinja2.nodes.Node``, i.e. every node type.

    Returns:
        int: The number of nodes descended from the given node.
    """
    node_type = j_nodes.Node if only_type is None else only_type
    return sum(1 for _ in ast.find_all(node_type))
51
+
52
+
53
def ast_count_name_references(ast: j_nodes.Node, name: str) -> int:
    """Count the ``Name`` nodes under ``ast`` that refer to ``name``.

    Args:
        ast (jinja2.nodes.Node): The starting Jinja2 AST node
        name (str): The name to look for.

    Returns:
        int: The number of nodes descended from the provided node whose
            name field matches the given name.
    """
    matches = 0
    for name_node in ast.find_all(j_nodes.Name):
        if name_node.name == name:
            matches += 1
    return matches