data-designer 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/default_model_settings.py +14 -6
- data_designer/config/run_config.py +3 -0
- data_designer/config/utils/constants.py +2 -0
- data_designer/engine/column_generators/generators/llm_completion.py +15 -3
- data_designer/engine/compiler.py +32 -1
- data_designer/engine/dataset_builders/column_wise_builder.py +2 -5
- data_designer/engine/dataset_builders/utils/concurrency.py +0 -3
- data_designer/engine/models/litellm_overrides.py +28 -22
- data_designer/engine/processing/utils.py +15 -8
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +7 -3
- data_designer/interface/data_designer.py +8 -5
- {data_designer-0.3.6.dist-info → data_designer-0.3.8.dist-info}/METADATA +1 -1
- {data_designer-0.3.6.dist-info → data_designer-0.3.8.dist-info}/RECORD +17 -17
- {data_designer-0.3.6.dist-info → data_designer-0.3.8.dist-info}/WHEEL +0 -0
- {data_designer-0.3.6.dist-info → data_designer-0.3.8.dist-info}/entry_points.txt +0 -0
- {data_designer-0.3.6.dist-info → data_designer-0.3.8.dist-info}/licenses/LICENSE +0 -0
data_designer/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.3.6'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 3, 6)
|
|
31
|
+
__version__ = version = '0.3.8'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 3, 8)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -71,12 +71,20 @@ def get_default_model_configs() -> list[ModelConfig]:
|
|
|
71
71
|
return []
|
|
72
72
|
|
|
73
73
|
|
|
74
|
-
def
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
74
|
+
def get_providers_with_missing_api_keys(providers: list[ModelProvider]) -> list[ModelProvider]:
|
|
75
|
+
providers_with_missing_keys = []
|
|
76
|
+
|
|
77
|
+
for provider in providers:
|
|
78
|
+
if provider.api_key is None:
|
|
79
|
+
# No API key specified at all
|
|
80
|
+
providers_with_missing_keys.append(provider)
|
|
81
|
+
elif provider.api_key.isupper() and "_" in provider.api_key:
|
|
82
|
+
# Looks like an environment variable name, check if it's set
|
|
83
|
+
if os.environ.get(provider.api_key) is None:
|
|
84
|
+
providers_with_missing_keys.append(provider)
|
|
85
|
+
# else: It's an actual API key value (not an env var), so it's valid
|
|
86
|
+
|
|
87
|
+
return providers_with_missing_keys
|
|
80
88
|
|
|
81
89
|
|
|
82
90
|
def get_default_providers() -> list[ModelProvider]:
|
|
@@ -26,6 +26,8 @@ class RunConfig(ConfigBase):
|
|
|
26
26
|
buffer_size: Number of records to process in each batch during dataset generation.
|
|
27
27
|
A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
|
|
28
28
|
to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
|
|
29
|
+
non_inference_max_parallel_workers: Maximum number of worker threads used for non-inference
|
|
30
|
+
cell-by-cell generators. Must be >= 1. Default is 4.
|
|
29
31
|
max_conversation_restarts: Maximum number of full conversation restarts permitted when
|
|
30
32
|
generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
|
|
31
33
|
max_conversation_correction_steps: Maximum number of correction rounds permitted within a
|
|
@@ -37,6 +39,7 @@ class RunConfig(ConfigBase):
|
|
|
37
39
|
shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
|
|
38
40
|
shutdown_error_window: int = Field(default=10, ge=0)
|
|
39
41
|
buffer_size: int = Field(default=1000, gt=0)
|
|
42
|
+
non_inference_max_parallel_workers: int = Field(default=4, ge=1)
|
|
40
43
|
max_conversation_restarts: int = Field(default=5, ge=0)
|
|
41
44
|
max_conversation_correction_steps: int = Field(default=0, ge=0)
|
|
42
45
|
|
|
@@ -353,9 +353,11 @@ PREDEFINED_PROVIDERS_MODEL_MAP = {
|
|
|
353
353
|
NEMOTRON_PERSONAS_DATASET_SIZES = {
|
|
354
354
|
"en_US": "1.24 GB",
|
|
355
355
|
"en_IN": "2.39 GB",
|
|
356
|
+
"en_SG": "0.30 GB",
|
|
356
357
|
"hi_Deva_IN": "4.14 GB",
|
|
357
358
|
"hi_Latn_IN": "2.7 GB",
|
|
358
359
|
"ja_JP": "1.69 GB",
|
|
360
|
+
"pt_BR": "2.33 GB",
|
|
359
361
|
}
|
|
360
362
|
|
|
361
363
|
LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())
|
|
@@ -55,6 +55,9 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
55
55
|
)
|
|
56
56
|
|
|
57
57
|
def generate(self, data: dict) -> dict:
|
|
58
|
+
# Deserialize input data from previous columns so Jinja2 templates can access nested fields
|
|
59
|
+
# Example: If prev column stored '{"key": "value"}', templates can use {{ prev_column.key }}
|
|
60
|
+
# Note: This creates a new dict and doesn't mutate the original `data` argument
|
|
58
61
|
deserialized_record = deserialize_json_values(data)
|
|
59
62
|
|
|
60
63
|
multi_modal_context = None
|
|
@@ -81,13 +84,18 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
|
|
|
81
84
|
purpose=f"running generation for column '{self.config.name}'",
|
|
82
85
|
)
|
|
83
86
|
|
|
84
|
-
|
|
87
|
+
serialized_output = self.response_recipe.serialize_output(response)
|
|
88
|
+
data[self.config.name] = self._process_serialized_output(serialized_output)
|
|
85
89
|
|
|
86
90
|
if reasoning_trace:
|
|
87
91
|
data[self.config.name + REASONING_TRACE_COLUMN_POSTFIX] = reasoning_trace
|
|
88
92
|
|
|
89
93
|
return data
|
|
90
94
|
|
|
95
|
+
def _process_serialized_output(self, serialized_output: str) -> str | dict | list:
|
|
96
|
+
"""Process the serialized output from the model. Subclasses can override to customize deserialization."""
|
|
97
|
+
return serialized_output
|
|
98
|
+
|
|
91
99
|
|
|
92
100
|
class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]): ...
|
|
93
101
|
|
|
@@ -95,7 +103,11 @@ class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnC
|
|
|
95
103
|
class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]): ...
|
|
96
104
|
|
|
97
105
|
|
|
98
|
-
class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
|
|
106
|
+
class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
|
|
107
|
+
def _process_serialized_output(self, serialized_output: str) -> dict | list:
|
|
108
|
+
return deserialize_json_values(serialized_output)
|
|
99
109
|
|
|
100
110
|
|
|
101
|
-
class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
|
|
111
|
+
class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
|
|
112
|
+
def _process_serialized_output(self, serialized_output: str) -> dict | list:
|
|
113
|
+
return deserialize_json_values(serialized_output)
|
data_designer/engine/compiler.py
CHANGED
|
@@ -5,9 +5,10 @@ from __future__ import annotations
|
|
|
5
5
|
|
|
6
6
|
import logging
|
|
7
7
|
|
|
8
|
-
from data_designer.config.column_configs import SeedDatasetColumnConfig
|
|
8
|
+
from data_designer.config.column_configs import SamplerColumnConfig, SeedDatasetColumnConfig
|
|
9
9
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
10
10
|
from data_designer.config.errors import InvalidConfigError
|
|
11
|
+
from data_designer.config.sampler_params import UUIDSamplerParams
|
|
11
12
|
from data_designer.engine.resources.resource_provider import ResourceProvider
|
|
12
13
|
from data_designer.engine.resources.seed_reader import SeedReader
|
|
13
14
|
from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config
|
|
@@ -17,6 +18,7 @@ logger = logging.getLogger(__name__)
|
|
|
17
18
|
|
|
18
19
|
def compile_data_designer_config(config: DataDesignerConfig, resource_provider: ResourceProvider) -> DataDesignerConfig:
|
|
19
20
|
_resolve_and_add_seed_columns(config, resource_provider.seed_reader)
|
|
21
|
+
_add_internal_row_id_column_if_needed(config)
|
|
20
22
|
_validate(config)
|
|
21
23
|
return config
|
|
22
24
|
|
|
@@ -41,6 +43,35 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR
|
|
|
41
43
|
config.columns.extend([SeedDatasetColumnConfig(name=col_name) for col_name in seed_col_names])
|
|
42
44
|
|
|
43
45
|
|
|
46
|
+
def _add_internal_row_id_column_if_needed(config: DataDesignerConfig) -> None:
|
|
47
|
+
"""Adds a UUID sampler column named '_internal_row_id' (set to drop) if needed to enable generation.
|
|
48
|
+
|
|
49
|
+
Generation requires either:
|
|
50
|
+
- At least one sampler column (which can generate data from scratch), OR
|
|
51
|
+
- A seed dataset (which provides initial data rows)
|
|
52
|
+
|
|
53
|
+
If neither exists, a UUID sampler column '_internal_row_id' is automatically added and marked for drop
|
|
54
|
+
to enable the generation process to start.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
config: The DataDesigner configuration to potentially modify.
|
|
58
|
+
"""
|
|
59
|
+
has_sampler_column = any(isinstance(col, SamplerColumnConfig) for col in config.columns)
|
|
60
|
+
has_seed_dataset_column = any(isinstance(col, SeedDatasetColumnConfig) for col in config.columns)
|
|
61
|
+
|
|
62
|
+
if not has_sampler_column and not has_seed_dataset_column:
|
|
63
|
+
logger.warning(
|
|
64
|
+
"🔔 No sampler column or seed dataset detected. Adding UUID column '_internal_row_id' (marked for drop) to enable generation."
|
|
65
|
+
)
|
|
66
|
+
id_column = SamplerColumnConfig(
|
|
67
|
+
name="_internal_row_id",
|
|
68
|
+
sampler_type="uuid",
|
|
69
|
+
params=UUIDSamplerParams(),
|
|
70
|
+
drop=True,
|
|
71
|
+
)
|
|
72
|
+
config.columns.insert(0, id_column)
|
|
73
|
+
|
|
74
|
+
|
|
44
75
|
def _validate(config: DataDesignerConfig) -> None:
|
|
45
76
|
allowed_references = _get_allowed_references(config)
|
|
46
77
|
violations = validate_data_designer_config(
|
|
@@ -31,10 +31,7 @@ from data_designer.engine.compiler import compile_data_designer_config
|
|
|
31
31
|
from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage
|
|
32
32
|
from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError
|
|
33
33
|
from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig
|
|
34
|
-
from data_designer.engine.dataset_builders.utils.concurrency import (
|
|
35
|
-
MAX_CONCURRENCY_PER_NON_LLM_GENERATOR,
|
|
36
|
-
ConcurrentThreadExecutor,
|
|
37
|
-
)
|
|
34
|
+
from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
|
|
38
35
|
from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
|
|
39
36
|
from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
|
|
40
37
|
from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
|
|
@@ -202,7 +199,7 @@ class ColumnWiseDatasetBuilder:
|
|
|
202
199
|
self.batch_manager.add_records(df.to_dict(orient="records"))
|
|
203
200
|
|
|
204
201
|
def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None:
|
|
205
|
-
max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
|
|
202
|
+
max_workers = self._resource_provider.run_config.non_inference_max_parallel_workers
|
|
206
203
|
if isinstance(generator, ColumnGeneratorWithModel):
|
|
207
204
|
max_workers = generator.inference_parameters.max_parallel_requests
|
|
208
205
|
self._fan_out_with_threads(generator, max_workers=max_workers)
|
|
@@ -16,9 +16,6 @@ from data_designer.engine.errors import DataDesignerRuntimeError, ErrorTrap
|
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
|
-
# Constants
|
|
20
|
-
MAX_CONCURRENCY_PER_NON_LLM_GENERATOR = 4
|
|
21
|
-
|
|
22
19
|
|
|
23
20
|
class ExecutorResults(BaseModel):
|
|
24
21
|
failure_threshold: float = 0.0 # Error rate threshold
|
|
@@ -1,36 +1,42 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
LiteLLM overrides and customizations.
|
|
7
|
+
|
|
8
|
+
Note on imports: This module uses direct (eager) imports for litellm rather than lazy loading.
|
|
9
|
+
This is intentional because:
|
|
10
|
+
|
|
11
|
+
1. Class inheritance requires base classes to be resolved at class definition time,
|
|
12
|
+
making lazy imports incompatible with our ThreadSafeCache and CustomRouter classes.
|
|
13
|
+
|
|
14
|
+
2. This module is already lazily loaded at the application level - it's only imported
|
|
15
|
+
by facade.py, which itself is imported inside the create_model_registry() factory
|
|
16
|
+
function. So litellm is only loaded when models are actually needed.
|
|
17
|
+
|
|
18
|
+
3. Attempting to use lazy imports here causes intermittent ImportErrors.
|
|
19
|
+
"""
|
|
20
|
+
|
|
4
21
|
from __future__ import annotations
|
|
5
22
|
|
|
6
23
|
import random
|
|
7
24
|
import threading
|
|
8
|
-
from typing import TYPE_CHECKING
|
|
9
25
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
26
|
+
import httpx
|
|
27
|
+
import litellm
|
|
28
|
+
from litellm import RetryPolicy
|
|
29
|
+
from litellm.caching.in_memory_cache import InMemoryCache
|
|
30
|
+
from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
|
|
31
|
+
from litellm.router import Router
|
|
15
32
|
from pydantic import BaseModel, Field
|
|
16
33
|
from typing_extensions import override
|
|
17
34
|
|
|
18
|
-
# Use lazy loading for runtime litellm usage (RetryPolicy, utils, etc.)
|
|
19
|
-
from data_designer.lazy_heavy_imports import httpx, litellm
|
|
20
35
|
from data_designer.logging import quiet_noisy_logger
|
|
21
36
|
|
|
22
|
-
if TYPE_CHECKING:
|
|
23
|
-
import httpx
|
|
24
|
-
import litellm
|
|
25
|
-
|
|
26
37
|
DEFAULT_MAX_CALLBACKS = 1000
|
|
27
38
|
|
|
28
39
|
|
|
29
|
-
def _get_logging_callback_manager():
|
|
30
|
-
"""Lazy accessor for LoggingCallbackManager to avoid loading litellm at import time."""
|
|
31
|
-
return litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager
|
|
32
|
-
|
|
33
|
-
|
|
34
40
|
class LiteLLMRouterDefaultKwargs(BaseModel):
|
|
35
41
|
## Number of seconds to wait initially after a connection
|
|
36
42
|
## failure.
|
|
@@ -46,15 +52,15 @@ class LiteLLMRouterDefaultKwargs(BaseModel):
|
|
|
46
52
|
|
|
47
53
|
## Sets the default retry policy, including the number
|
|
48
54
|
## of retries to use in particular scenarios.
|
|
49
|
-
retry_policy:
|
|
50
|
-
default_factory=lambda:
|
|
55
|
+
retry_policy: RetryPolicy = Field(
|
|
56
|
+
default_factory=lambda: RetryPolicy(
|
|
51
57
|
RateLimitErrorRetries=3,
|
|
52
58
|
TimeoutErrorRetries=3,
|
|
53
59
|
)
|
|
54
60
|
)
|
|
55
61
|
|
|
56
62
|
|
|
57
|
-
class ThreadSafeCache(_litellm_cache.InMemoryCache):
|
|
63
|
+
class ThreadSafeCache(InMemoryCache):
|
|
58
64
|
def __init__(self, *args, **kwargs):
|
|
59
65
|
super().__init__(*args, **kwargs)
|
|
60
66
|
|
|
@@ -89,7 +95,7 @@ class ThreadSafeCache(_litellm_cache.InMemoryCache):
|
|
|
89
95
|
super().flush_cache()
|
|
90
96
|
|
|
91
97
|
|
|
92
|
-
class CustomRouter(
|
|
98
|
+
class CustomRouter(Router):
|
|
93
99
|
def __init__(
|
|
94
100
|
self,
|
|
95
101
|
*args,
|
|
@@ -166,7 +172,7 @@ def apply_litellm_patches():
|
|
|
166
172
|
litellm.in_memory_llm_clients_cache = ThreadSafeCache()
|
|
167
173
|
|
|
168
174
|
# Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
|
|
169
|
-
|
|
175
|
+
LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
|
|
170
176
|
|
|
171
177
|
quiet_noisy_logger("httpx")
|
|
172
178
|
quiet_noisy_logger("LiteLLM")
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
6
|
import ast
|
|
7
|
+
import copy
|
|
7
8
|
import json
|
|
8
9
|
import logging
|
|
9
10
|
import re
|
|
@@ -50,6 +51,8 @@ def deserialize_json_values(data: T) -> T: ...
|
|
|
50
51
|
def deserialize_json_values(data):
|
|
51
52
|
"""De-serialize JSON strings in various input formats.
|
|
52
53
|
|
|
54
|
+
This function creates a deep copy of the input data and does not mutate the original.
|
|
55
|
+
|
|
53
56
|
Args:
|
|
54
57
|
data: Input data in one of four formats:
|
|
55
58
|
- Single string (JSON string to deserialize)
|
|
@@ -63,18 +66,22 @@ def deserialize_json_values(data):
|
|
|
63
66
|
- List of dictionaries (when input is a list of strings)
|
|
64
67
|
- Dictionary (when input is a dictionary, with nested JSON strings deserialized)
|
|
65
68
|
- The original object (if there is no deserialization to perform)
|
|
69
|
+
|
|
66
70
|
"""
|
|
71
|
+
# Create a deep copy to avoid mutating the original data
|
|
72
|
+
data_copy = copy.deepcopy(data)
|
|
73
|
+
|
|
67
74
|
# Case 1: Single string input
|
|
68
|
-
if isinstance(data, str):
|
|
75
|
+
if isinstance(data_copy, str):
|
|
69
76
|
try:
|
|
70
|
-
return json.loads(data)
|
|
77
|
+
return json.loads(data_copy)
|
|
71
78
|
except json.JSONDecodeError:
|
|
72
|
-
return data
|
|
79
|
+
return data_copy
|
|
73
80
|
|
|
74
81
|
# Case 2: List of strings input
|
|
75
|
-
elif isinstance(data, list):
|
|
82
|
+
elif isinstance(data_copy, list):
|
|
76
83
|
result = []
|
|
77
|
-
for item in data:
|
|
84
|
+
for item in data_copy:
|
|
78
85
|
if isinstance(item, str):
|
|
79
86
|
try:
|
|
80
87
|
result.append(json.loads(item))
|
|
@@ -86,9 +93,9 @@ def deserialize_json_values(data):
|
|
|
86
93
|
return result
|
|
87
94
|
|
|
88
95
|
# Case 3: Dictionary input with potential nested JSON strings
|
|
89
|
-
elif isinstance(data, dict):
|
|
96
|
+
elif isinstance(data_copy, dict):
|
|
90
97
|
result = {}
|
|
91
|
-
for key, value in data.items():
|
|
98
|
+
for key, value in data_copy.items():
|
|
92
99
|
if isinstance(value, str):
|
|
93
100
|
try:
|
|
94
101
|
result[key] = json.loads(value)
|
|
@@ -103,7 +110,7 @@ def deserialize_json_values(data):
|
|
|
103
110
|
|
|
104
111
|
# Fallback for other data types
|
|
105
112
|
else:
|
|
106
|
-
return data
|
|
113
|
+
return data_copy
|
|
107
114
|
|
|
108
115
|
|
|
109
116
|
def parse_list_string(text: str) -> list[str]:
|
|
@@ -40,13 +40,16 @@ PII_FIELDS = [
|
|
|
40
40
|
"state",
|
|
41
41
|
"email_address",
|
|
42
42
|
"phone_number",
|
|
43
|
+
# Brazil-specific fields
|
|
44
|
+
"race",
|
|
43
45
|
# Japan-specific fields
|
|
44
46
|
"area",
|
|
45
47
|
"prefecture",
|
|
46
48
|
"zone",
|
|
49
|
+
# Brazil and India shared fields
|
|
50
|
+
"religion",
|
|
47
51
|
# India-specific fields
|
|
48
52
|
"district",
|
|
49
|
-
"religion",
|
|
50
53
|
"education_degree",
|
|
51
54
|
"first_language",
|
|
52
55
|
"second_language",
|
|
@@ -78,9 +81,10 @@ PERSONA_FIELDS = [
|
|
|
78
81
|
# Japan-specific persona fields
|
|
79
82
|
"aspects",
|
|
80
83
|
"digital_skills",
|
|
84
|
+
# Brazil and India shared persona fields
|
|
85
|
+
"religious_persona",
|
|
86
|
+
"religious_background",
|
|
81
87
|
# India-specific persona fields
|
|
82
88
|
"linguistic_persona",
|
|
83
|
-
"religious_persona",
|
|
84
89
|
"linguistic_background",
|
|
85
|
-
"religious_background",
|
|
86
90
|
]
|
|
@@ -12,9 +12,9 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
|
12
12
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
13
13
|
from data_designer.config.default_model_settings import (
|
|
14
14
|
get_default_model_configs,
|
|
15
|
-
get_default_model_providers_missing_api_keys,
|
|
16
15
|
get_default_provider_name,
|
|
17
16
|
get_default_providers,
|
|
17
|
+
get_providers_with_missing_api_keys,
|
|
18
18
|
)
|
|
19
19
|
from data_designer.config.interface import DataDesignerInterface
|
|
20
20
|
from data_designer.config.models import (
|
|
@@ -28,7 +28,6 @@ from data_designer.config.utils.constants import (
|
|
|
28
28
|
MANAGED_ASSETS_PATH,
|
|
29
29
|
MODEL_CONFIGS_FILE_PATH,
|
|
30
30
|
MODEL_PROVIDERS_FILE_PATH,
|
|
31
|
-
PREDEFINED_PROVIDERS,
|
|
32
31
|
)
|
|
33
32
|
from data_designer.config.utils.info import InfoType, InterfaceInfo
|
|
34
33
|
from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
|
|
@@ -317,7 +316,8 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
317
316
|
|
|
318
317
|
Args:
|
|
319
318
|
run_config: A RunConfig instance containing runtime settings such as
|
|
320
|
-
early shutdown behavior
|
|
319
|
+
early shutdown behavior, batch sizing via `buffer_size`, and non-inference worker
|
|
320
|
+
concurrency via `non_inference_max_parallel_workers`. Import RunConfig from
|
|
321
321
|
data_designer.essentials.
|
|
322
322
|
|
|
323
323
|
Example:
|
|
@@ -334,8 +334,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
334
334
|
def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
|
|
335
335
|
if model_providers is None:
|
|
336
336
|
model_providers = get_default_providers()
|
|
337
|
-
|
|
338
|
-
|
|
337
|
+
# Check which providers have missing API keys (from YAML file or env vars)
|
|
338
|
+
providers_with_missing_keys = get_providers_with_missing_api_keys(model_providers)
|
|
339
|
+
|
|
340
|
+
if len(providers_with_missing_keys) == len(model_providers):
|
|
341
|
+
# All providers have missing API keys
|
|
339
342
|
logger.warning(
|
|
340
343
|
"🚨 You are trying to use a default model provider but your API keys are missing."
|
|
341
344
|
"\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
data_designer/__init__.py,sha256=iLr6FpW41-DFbGexuXCJ6gN1xBMNUZ2jfj9XxySmQhk,502
|
|
2
|
-
data_designer/_version.py,sha256=
|
|
2
|
+
data_designer/_version.py,sha256=iWEYFk8Ve-UDYmpiH5KfwfwhQTYtkxZlaGPwKE2xDy0,704
|
|
3
3
|
data_designer/errors.py,sha256=r1pBvmvRBAsPmb7oF_veubhkxZ2uPo9cGEDwykLziX4,220
|
|
4
4
|
data_designer/lazy_heavy_imports.py,sha256=wULSEPQRUOZXvOnb0tdf6wNbRBpaaczYfAjY-pstCBM,1512
|
|
5
5
|
data_designer/logging.py,sha256=gRi9BOqm95UC1-u4pn6n-G4EySy9HhwKVyKLRO4aqm4,5382
|
|
@@ -42,14 +42,14 @@ data_designer/config/config_builder.py,sha256=vuPibkodbJxbCXdaI1tt1Uyo1SVCnAOfLB
|
|
|
42
42
|
data_designer/config/data_designer_config.py,sha256=qOojviug05vHR2S4800sjd4OmxhSVi6kB8SAFXLlPog,1891
|
|
43
43
|
data_designer/config/dataset_builders.py,sha256=jdCujJYFlKAiSkPNX2Qeyrs683GrRcCDv_m8ZZhtg64,368
|
|
44
44
|
data_designer/config/dataset_metadata.py,sha256=UTlEgnHWgjwPuc7bP95T7gaKmcr7pIhFMy9vvbUwMV4,647
|
|
45
|
-
data_designer/config/default_model_settings.py,sha256=
|
|
45
|
+
data_designer/config/default_model_settings.py,sha256=c-llH2otfG0tMCMsxoz3ZcS1nFxIQQPfRedFXAydDbc,4868
|
|
46
46
|
data_designer/config/errors.py,sha256=JhvUYecfLmP0gZjQzqA3OmfaSs9TRlC5E-ubnV_-3gs,560
|
|
47
47
|
data_designer/config/exports.py,sha256=lNwteK4djETKXrMKh5PPeHeZvPAZ5RpnJt2otpoaUz0,4756
|
|
48
48
|
data_designer/config/interface.py,sha256=ikmpm_KwencTpM-yg0auo7XMgcmMSa67S75IqdpFLfk,1676
|
|
49
49
|
data_designer/config/models.py,sha256=OekrXEVnI9WdHzEVk-8fO0NtxLZtjKVtCL03RY8qwYs,15457
|
|
50
50
|
data_designer/config/preview_results.py,sha256=WnPlDcHElIHNfjV_P-nLu_Dpul8D3Eyb5qyi3E173Gs,1744
|
|
51
51
|
data_designer/config/processors.py,sha256=lnyUZA1EhO9NWjjVFFioYxSgeYpoAaM1J7UzwOYkvms,6028
|
|
52
|
-
data_designer/config/run_config.py,sha256=
|
|
52
|
+
data_designer/config/run_config.py,sha256=oJ163DpHXu9PzST5Hn9px-bIP9DYjIkCO7UGB93J7bI,2663
|
|
53
53
|
data_designer/config/sampler_constraints.py,sha256=tQI1XLF5bS4TnyKMLo0nArvefnXI8dWCzov38r4qNCQ,1197
|
|
54
54
|
data_designer/config/sampler_params.py,sha256=Gio-53vjSYOdPhF2CEq4HSWCXCaZMy4WpGPbuFVcWOM,27965
|
|
55
55
|
data_designer/config/seed.py,sha256=eShSqOcSUzfCEZBnqY-rB0qZpRGxjeOE3fSaJAwacec,4668
|
|
@@ -63,7 +63,7 @@ data_designer/config/analysis/dataset_profiler.py,sha256=-5eX55IXivwUBMg2pI-d_3e
|
|
|
63
63
|
data_designer/config/analysis/utils/errors.py,sha256=pvmdQ_YuIlWW4NFw-cX_rOoQf-GG8y_FiQzNctB__DQ,331
|
|
64
64
|
data_designer/config/analysis/utils/reporting.py,sha256=teTzd1OHtpI4vbIinGOGsKXyNldO3F5eqbNdAztF0_s,7066
|
|
65
65
|
data_designer/config/utils/code_lang.py,sha256=EqMJh1GL5ysUZIoyqx_6vmqenUKHm4J-RQtKXiA4EPg,2354
|
|
66
|
-
data_designer/config/utils/constants.py,sha256=
|
|
66
|
+
data_designer/config/utils/constants.py,sha256=eqDQ57b8B0v5qRSO0He45LEjSxtfxlsPtHRvBu1xkw0,8973
|
|
67
67
|
data_designer/config/utils/errors.py,sha256=HCjer0YrF0bMn5j8gmgWaLb0395LAr_hxMD1ftOsOc8,520
|
|
68
68
|
data_designer/config/utils/info.py,sha256=yOa4U8kI_CY4OfCKZxCm2okU8klAiThvyjKM5tG-F0A,3469
|
|
69
69
|
data_designer/config/utils/io_helpers.py,sha256=kzvOR7QgqijkqU-O2enIlpCWwHvzc3oRaEl4Lsjh1Do,8466
|
|
@@ -72,7 +72,7 @@ data_designer/config/utils/numerical_helpers.py,sha256=DIubKzc8q2_Bw7xRjyOGwxYul
|
|
|
72
72
|
data_designer/config/utils/type_helpers.py,sha256=XyVup24F4Bl7uNze_yUW9oD6EzFbfsJWKhpeMN2901A,4059
|
|
73
73
|
data_designer/config/utils/visualization.py,sha256=_0Mn-jva0Oz1tVTQH1mnWSARpqZ2kh1JSzJEuikyy9s,18491
|
|
74
74
|
data_designer/engine/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
|
|
75
|
-
data_designer/engine/compiler.py,sha256=
|
|
75
|
+
data_designer/engine/compiler.py,sha256=4QAeCJjINtH0afSXygdhiKMyq2KIfaDthK3ApZLgrQ0,4152
|
|
76
76
|
data_designer/engine/configurable_task.py,sha256=6R4FPXPzIeK0lqNVSEXzRDtK14B3dFz38lplr-nkvRE,2539
|
|
77
77
|
data_designer/engine/errors.py,sha256=YXI7ny83BQ16sOK43CpTm384hJTKuZkPTEAjlHlDIfA,1303
|
|
78
78
|
data_designer/engine/model_provider.py,sha256=_uU5Bw7yrGlMROjHL4dN1mMTg1eN-LVW5JWcQxovhAA,2823
|
|
@@ -92,7 +92,7 @@ data_designer/engine/column_generators/generators/__init__.py,sha256=XLO09Ei8g0l
|
|
|
92
92
|
data_designer/engine/column_generators/generators/base.py,sha256=QElk5KsaUQ3EYwlv40NcZgQsw3HIkX3YQV_0S3erl7Q,4209
|
|
93
93
|
data_designer/engine/column_generators/generators/embedding.py,sha256=uB0jgHlCgctgIUf9ZfMqG1YThbJ0g-GCX3VdNbdDSko,1407
|
|
94
94
|
data_designer/engine/column_generators/generators/expression.py,sha256=BiQcfVTinvQl3OI9nkdhB9B7FGBueWiHJwxTA8uNVuY,2330
|
|
95
|
-
data_designer/engine/column_generators/generators/llm_completion.py,sha256=
|
|
95
|
+
data_designer/engine/column_generators/generators/llm_completion.py,sha256=3S3ikNLLLGnutUdcuswL5dUfcLgT_-he8DiRZ9K706U,4721
|
|
96
96
|
data_designer/engine/column_generators/generators/samplers.py,sha256=gNzURmu9K8Zb5MHamKvZPIxmWlFgl2W4FIVgaFcy4f0,3371
|
|
97
97
|
data_designer/engine/column_generators/generators/seed_dataset.py,sha256=CoQPbz4Ww7pBLaGw8-CYqIk1sjfkBaoRMKZQexdfgKY,6824
|
|
98
98
|
data_designer/engine/column_generators/generators/validation.py,sha256=YfYbk-8_ZUye0No6_Q7hIqpZv_tunnEZ6HkLSMFXlDE,6659
|
|
@@ -101,11 +101,11 @@ data_designer/engine/column_generators/utils/generator_classification.py,sha256=
|
|
|
101
101
|
data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=gESiqMrQzbbcFpZas0sAAAkrH2DL0Z4Nq5ywBO-pQ6k,2141
|
|
102
102
|
data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=LATVAlDYwL7HyM7Nogd6n9XTTk-j9s64o4z0LpKHMhQ,4819
|
|
103
103
|
data_designer/engine/dataset_builders/artifact_storage.py,sha256=CKpTBtJTde7OQvsFZQa1v1autVz5yUxlBHkIKeATFnE,10999
|
|
104
|
-
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=
|
|
104
|
+
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=lzCSk3dFmdZvKLPAVIRNp9oJQsiilthHRW7mB4dUUB4,15716
|
|
105
105
|
data_designer/engine/dataset_builders/errors.py,sha256=gLXtPcGSMBG10PzQ85dOXskdA0mKbBQrHa_VtP9sbVY,400
|
|
106
106
|
data_designer/engine/dataset_builders/multi_column_configs.py,sha256=U4Pg0ETCBq5phRhb2zt8IFa4fRx-aTMakomKOBnrs0U,1660
|
|
107
107
|
data_designer/engine/dataset_builders/utils/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
|
|
108
|
-
data_designer/engine/dataset_builders/utils/concurrency.py,sha256=
|
|
108
|
+
data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Lga_xd8i3ZAPqJlKCB4GHG7uxWxws1m-UGAz9UeqU_8,8283
|
|
109
109
|
data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=NGI6U0vgG88d5YKj7oW_SIJ4-_fhA6VFhPbjqGRHea4,2441
|
|
110
110
|
data_designer/engine/dataset_builders/utils/dag.py,sha256=RIEI75OtiphkuDl1vfI_MQC1xMiiIg29s-0C_fNZkWQ,2613
|
|
111
111
|
data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=IfWd_HcfEzIPhgFp2dJaxNIKRlrPsHqYATFXauvCfaw,8133
|
|
@@ -114,7 +114,7 @@ data_designer/engine/models/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE
|
|
|
114
114
|
data_designer/engine/models/errors.py,sha256=k9oZnmk8DRD8U2SVKJJRLwrcdsCcVoJiOb_Q7ZyEdvg,12271
|
|
115
115
|
data_designer/engine/models/facade.py,sha256=UBMpw_o2JcsWpJsPdpTPKfFZCh_i0eeG_oaWi1XeKds,12582
|
|
116
116
|
data_designer/engine/models/factory.py,sha256=2NjI0iiGv8ayQ1c249lsJtha4pDmvmtSjdwvlvitRds,1581
|
|
117
|
-
data_designer/engine/models/litellm_overrides.py,sha256=
|
|
117
|
+
data_designer/engine/models/litellm_overrides.py,sha256=e9IZCFQ6BhNWlOTncm8ErL8w4rtE1_4USh2mtUYxCZI,6207
|
|
118
118
|
data_designer/engine/models/registry.py,sha256=7hZ6TQwwZf259yRZmc3ZI20a4wAo3PCOozPi9Mc5KLo,6827
|
|
119
119
|
data_designer/engine/models/telemetry.py,sha256=wmuekvPRZjNz7p7ImKx5H_hqDRhTv_dSB-u2S6Ze3uo,12502
|
|
120
120
|
data_designer/engine/models/usage.py,sha256=A0LV9Ycuj_7snOsaqnirs4mlkAjozv2mzj2om2FpDoU,2410
|
|
@@ -127,7 +127,7 @@ data_designer/engine/models/parsers/tag_parsers.py,sha256=HNAIBfXW1Wjdkw4IX-P9sH
|
|
|
127
127
|
data_designer/engine/models/parsers/types.py,sha256=wEt80al1FykbMplZVjJ5uXFtacMx-a9GE4_QoqDJ6Us,2631
|
|
128
128
|
data_designer/engine/models/recipes/base.py,sha256=AQg3Ay_E0hBEVg-sqSNVVZNMJfJ3r1eT14-b9yqymnQ,2630
|
|
129
129
|
data_designer/engine/models/recipes/response_recipes.py,sha256=UX9m-8RTDj3sXkzEdKpkSj5z7jO-fQhdca3MSByb_Js,10189
|
|
130
|
-
data_designer/engine/processing/utils.py,sha256=
|
|
130
|
+
data_designer/engine/processing/utils.py,sha256=g82KsdDR20g_isadpmgHnneQSX0W21aCVhkp5TIWEhw,5443
|
|
131
131
|
data_designer/engine/processing/ginja/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
|
|
132
132
|
data_designer/engine/processing/ginja/ast.py,sha256=w62yt434RDnJYrcfofIDThGv0C5H9XJE3VHOnxEzJVM,1964
|
|
133
133
|
data_designer/engine/processing/ginja/environment.py,sha256=wJRbzPuUCQGvCi4zS4g8sYzihgu_6fn-tE_nYSL1AoU,18974
|
|
@@ -164,7 +164,7 @@ data_designer/engine/sampling_gen/data_sources/base.py,sha256=zUG5XTplD5pgHh4ytC
|
|
|
164
164
|
data_designer/engine/sampling_gen/data_sources/errors.py,sha256=_9rbwUpaz0Pd2Ods4AVDQ7Uq4JvPyfHhTp51BdtJDto,367
|
|
165
165
|
data_designer/engine/sampling_gen/data_sources/sources.py,sha256=53KVPp7REjNKA0rajGmT_tBkxwQqwrcIKhcijBGcfcs,13647
|
|
166
166
|
data_designer/engine/sampling_gen/entities/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
|
|
167
|
-
data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=
|
|
167
|
+
data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=r8qXWe8EquJognihPNGzma4fFuSQAAZHlkYVsGmcX2w,2006
|
|
168
168
|
data_designer/engine/sampling_gen/entities/email_address_utils.py,sha256=THfD7muq5tMHkRWOATN-N3iSFgkKjT4e8hKquDFMTlU,5272
|
|
169
169
|
data_designer/engine/sampling_gen/entities/errors.py,sha256=SbtwwG6JgoY4k6pq2-y-lD60nX_pqjf5QftmwgXt0us,352
|
|
170
170
|
data_designer/engine/sampling_gen/entities/national_id_utils.py,sha256=XUFB6RhfLGFQUNyy0B6BSgtrG9NdEnIjfSALBwJplho,2652
|
|
@@ -179,7 +179,7 @@ data_designer/engine/validators/remote.py,sha256=rythhIrH2GvqncMQeF3FiJa9Om0KZWe
|
|
|
179
179
|
data_designer/engine/validators/sql.py,sha256=AMaEdA-gj9j0zwVp809x3ycKltd51wVEhI8mMYGyxd4,2408
|
|
180
180
|
data_designer/essentials/__init__.py,sha256=dIGYH9s0_VQJ1lG8S-ElZiISz59LHo9v7Y5upizcA1M,1135
|
|
181
181
|
data_designer/interface/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
|
|
182
|
-
data_designer/interface/data_designer.py,sha256=
|
|
182
|
+
data_designer/interface/data_designer.py,sha256=nqsl2Et1wSF3TQzYx3gC1drJzrKlm4FtRE7QJYilbys,17544
|
|
183
183
|
data_designer/interface/errors.py,sha256=Ft9GMeIrOHJv_PC_1rU6hWcNyq1GHdsFYZSc9HnUrxU,606
|
|
184
184
|
data_designer/interface/results.py,sha256=3fGwlhif4ufqUGh-EgsGccrob4S6a7WZ6BgFiszTo_A,3871
|
|
185
185
|
data_designer/plugins/__init__.py,sha256=qe1alcTEtnMSMdzknjb57vvjqKgFE5cEHXxBj8tPWMI,275
|
|
@@ -189,8 +189,8 @@ data_designer/plugins/registry.py,sha256=Cnt33Q25o9bS2v2YDbV3QPM57VNrtIBKAb4ERQR
|
|
|
189
189
|
data_designer/plugins/testing/__init__.py,sha256=yyxrrH_i3q0Xb56QO9Ma35WtHlQ5PJF1b2pQoKa16xU,296
|
|
190
190
|
data_designer/plugins/testing/stubs.py,sha256=9tUF209ayZR6f0Q1LsRDW4kEOTgPoIxV8jlq4QoWuW0,3498
|
|
191
191
|
data_designer/plugins/testing/utils.py,sha256=a9LEgK827cnIzHEkgXOdgywrKDLBE36cyttrpG1ctT4,973
|
|
192
|
-
data_designer-0.3.
|
|
193
|
-
data_designer-0.3.
|
|
194
|
-
data_designer-0.3.
|
|
195
|
-
data_designer-0.3.
|
|
196
|
-
data_designer-0.3.
|
|
192
|
+
data_designer-0.3.8.dist-info/METADATA,sha256=n9jXs34c2_rOL-Tme5Y6xrb4fMo0-GNNEVHlCl5WfLY,8119
|
|
193
|
+
data_designer-0.3.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
194
|
+
data_designer-0.3.8.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
|
|
195
|
+
data_designer-0.3.8.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
|
|
196
|
+
data_designer-0.3.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|