data-designer 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.6'
32
- __version_tuple__ = version_tuple = (0, 3, 6)
31
+ __version__ = version = '0.3.8'
32
+ __version_tuple__ = version_tuple = (0, 3, 8)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -71,12 +71,20 @@ def get_default_model_configs() -> list[ModelConfig]:
71
71
  return []
72
72
 
73
73
 
74
- def get_default_model_providers_missing_api_keys() -> list[str]:
75
- missing_api_keys = []
76
- for predefined_provider in PREDEFINED_PROVIDERS:
77
- if os.environ.get(predefined_provider["api_key"]) is None:
78
- missing_api_keys.append(predefined_provider["api_key"])
79
- return missing_api_keys
74
+ def get_providers_with_missing_api_keys(providers: list[ModelProvider]) -> list[ModelProvider]:
75
+ providers_with_missing_keys = []
76
+
77
+ for provider in providers:
78
+ if provider.api_key is None:
79
+ # No API key specified at all
80
+ providers_with_missing_keys.append(provider)
81
+ elif provider.api_key.isupper() and "_" in provider.api_key:
82
+ # Looks like an environment variable name, check if it's set
83
+ if os.environ.get(provider.api_key) is None:
84
+ providers_with_missing_keys.append(provider)
85
+ # else: It's an actual API key value (not an env var), so it's valid
86
+
87
+ return providers_with_missing_keys
80
88
 
81
89
 
82
90
  def get_default_providers() -> list[ModelProvider]:
@@ -26,6 +26,8 @@ class RunConfig(ConfigBase):
26
26
  buffer_size: Number of records to process in each batch during dataset generation.
27
27
  A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
28
28
  to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
29
+ non_inference_max_parallel_workers: Maximum number of worker threads used for non-inference
30
+ cell-by-cell generators. Must be >= 1. Default is 4.
29
31
  max_conversation_restarts: Maximum number of full conversation restarts permitted when
30
32
  generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
31
33
  max_conversation_correction_steps: Maximum number of correction rounds permitted within a
@@ -37,6 +39,7 @@ class RunConfig(ConfigBase):
37
39
  shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
38
40
  shutdown_error_window: int = Field(default=10, ge=0)
39
41
  buffer_size: int = Field(default=1000, gt=0)
42
+ non_inference_max_parallel_workers: int = Field(default=4, ge=1)
40
43
  max_conversation_restarts: int = Field(default=5, ge=0)
41
44
  max_conversation_correction_steps: int = Field(default=0, ge=0)
42
45
 
@@ -353,9 +353,11 @@ PREDEFINED_PROVIDERS_MODEL_MAP = {
353
353
  NEMOTRON_PERSONAS_DATASET_SIZES = {
354
354
  "en_US": "1.24 GB",
355
355
  "en_IN": "2.39 GB",
356
+ "en_SG": "0.30 GB",
356
357
  "hi_Deva_IN": "4.14 GB",
357
358
  "hi_Latn_IN": "2.7 GB",
358
359
  "ja_JP": "1.69 GB",
360
+ "pt_BR": "2.33 GB",
359
361
  }
360
362
 
361
363
  LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())
@@ -55,6 +55,9 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
55
55
  )
56
56
 
57
57
  def generate(self, data: dict) -> dict:
58
+ # Deserialize input data from previous columns so Jinja2 templates can access nested fields
59
+ # Example: If prev column stored '{"key": "value"}', templates can use {{ prev_column.key }}
60
+ # Note: This creates a new dict and doesn't mutate the original `data` argument
58
61
  deserialized_record = deserialize_json_values(data)
59
62
 
60
63
  multi_modal_context = None
@@ -81,13 +84,18 @@ class ColumnGeneratorWithModelChatCompletion(ColumnGeneratorWithModel[TaskConfig
81
84
  purpose=f"running generation for column '{self.config.name}'",
82
85
  )
83
86
 
84
- data[self.config.name] = deserialize_json_values(self.response_recipe.serialize_output(response))
87
+ serialized_output = self.response_recipe.serialize_output(response)
88
+ data[self.config.name] = self._process_serialized_output(serialized_output)
85
89
 
86
90
  if reasoning_trace:
87
91
  data[self.config.name + REASONING_TRACE_COLUMN_POSTFIX] = reasoning_trace
88
92
 
89
93
  return data
90
94
 
95
+ def _process_serialized_output(self, serialized_output: str) -> str | dict | list:
96
+ """Process the serialized output from the model. Subclasses can override to customize deserialization."""
97
+ return serialized_output
98
+
91
99
 
92
100
  class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnConfig]): ...
93
101
 
@@ -95,7 +103,11 @@ class LLMTextCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMTextColumnC
95
103
  class LLMCodeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMCodeColumnConfig]): ...
96
104
 
97
105
 
98
- class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]): ...
106
+ class LLMStructuredCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMStructuredColumnConfig]):
107
+ def _process_serialized_output(self, serialized_output: str) -> dict | list:
108
+ return deserialize_json_values(serialized_output)
99
109
 
100
110
 
101
- class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]): ...
111
+ class LLMJudgeCellGenerator(ColumnGeneratorWithModelChatCompletion[LLMJudgeColumnConfig]):
112
+ def _process_serialized_output(self, serialized_output: str) -> dict | list:
113
+ return deserialize_json_values(serialized_output)
@@ -5,9 +5,10 @@ from __future__ import annotations
5
5
 
6
6
  import logging
7
7
 
8
- from data_designer.config.column_configs import SeedDatasetColumnConfig
8
+ from data_designer.config.column_configs import SamplerColumnConfig, SeedDatasetColumnConfig
9
9
  from data_designer.config.data_designer_config import DataDesignerConfig
10
10
  from data_designer.config.errors import InvalidConfigError
11
+ from data_designer.config.sampler_params import UUIDSamplerParams
11
12
  from data_designer.engine.resources.resource_provider import ResourceProvider
12
13
  from data_designer.engine.resources.seed_reader import SeedReader
13
14
  from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config
@@ -17,6 +18,7 @@ logger = logging.getLogger(__name__)
17
18
 
18
19
  def compile_data_designer_config(config: DataDesignerConfig, resource_provider: ResourceProvider) -> DataDesignerConfig:
19
20
  _resolve_and_add_seed_columns(config, resource_provider.seed_reader)
21
+ _add_internal_row_id_column_if_needed(config)
20
22
  _validate(config)
21
23
  return config
22
24
 
@@ -41,6 +43,35 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR
41
43
  config.columns.extend([SeedDatasetColumnConfig(name=col_name) for col_name in seed_col_names])
42
44
 
43
45
 
46
+ def _add_internal_row_id_column_if_needed(config: DataDesignerConfig) -> None:
47
+ """Adds a UUID sampler column named '_internal_row_id' (set to drop) if needed to enable generation.
48
+
49
+ Generation requires either:
50
+ - At least one sampler column (which can generate data from scratch), OR
51
+ - A seed dataset (which provides initial data rows)
52
+
53
+ If neither exists, a UUID sampler column '_internal_row_id' is automatically added and marked for drop
54
+ to enable the generation process to start.
55
+
56
+ Args:
57
+ config: The DataDesigner configuration to potentially modify.
58
+ """
59
+ has_sampler_column = any(isinstance(col, SamplerColumnConfig) for col in config.columns)
60
+ has_seed_dataset_column = any(isinstance(col, SeedDatasetColumnConfig) for col in config.columns)
61
+
62
+ if not has_sampler_column and not has_seed_dataset_column:
63
+ logger.warning(
64
+ "🔔 No sampler column or seed dataset detected. Adding UUID column '_internal_row_id' (marked for drop) to enable generation."
65
+ )
66
+ id_column = SamplerColumnConfig(
67
+ name="_internal_row_id",
68
+ sampler_type="uuid",
69
+ params=UUIDSamplerParams(),
70
+ drop=True,
71
+ )
72
+ config.columns.insert(0, id_column)
73
+
74
+
44
75
  def _validate(config: DataDesignerConfig) -> None:
45
76
  allowed_references = _get_allowed_references(config)
46
77
  violations = validate_data_designer_config(
@@ -31,10 +31,7 @@ from data_designer.engine.compiler import compile_data_designer_config
31
31
  from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage
32
32
  from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError
33
33
  from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig
34
- from data_designer.engine.dataset_builders.utils.concurrency import (
35
- MAX_CONCURRENCY_PER_NON_LLM_GENERATOR,
36
- ConcurrentThreadExecutor,
37
- )
34
+ from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
38
35
  from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
39
36
  from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
40
37
  from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
@@ -202,7 +199,7 @@ class ColumnWiseDatasetBuilder:
202
199
  self.batch_manager.add_records(df.to_dict(orient="records"))
203
200
 
204
201
  def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None:
205
- max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
202
+ max_workers = self._resource_provider.run_config.non_inference_max_parallel_workers
206
203
  if isinstance(generator, ColumnGeneratorWithModel):
207
204
  max_workers = generator.inference_parameters.max_parallel_requests
208
205
  self._fan_out_with_threads(generator, max_workers=max_workers)
@@ -16,9 +16,6 @@ from data_designer.engine.errors import DataDesignerRuntimeError, ErrorTrap
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
- # Constants
20
- MAX_CONCURRENCY_PER_NON_LLM_GENERATOR = 4
21
-
22
19
 
23
20
  class ExecutorResults(BaseModel):
24
21
  failure_threshold: float = 0.0 # Error rate threshold
@@ -1,36 +1,42 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+
5
+ """
6
+ LiteLLM overrides and customizations.
7
+
8
+ Note on imports: This module uses direct (eager) imports for litellm rather than lazy loading.
9
+ This is intentional because:
10
+
11
+ 1. Class inheritance requires base classes to be resolved at class definition time,
12
+ making lazy imports incompatible with our ThreadSafeCache and CustomRouter classes.
13
+
14
+ 2. This module is already lazily loaded at the application level - it's only imported
15
+ by facade.py, which itself is imported inside the create_model_registry() factory
16
+ function. So litellm is only loaded when models are actually needed.
17
+
18
+ 3. Attempting to use lazy imports here causes intermittent ImportErrors.
19
+ """
20
+
4
21
  from __future__ import annotations
5
22
 
6
23
  import random
7
24
  import threading
8
- from typing import TYPE_CHECKING
9
25
 
10
- # Import specific litellm submodules needed for class inheritance
11
- # Note: Class inheritance requires base classes at definition time, so we import these directly.
12
- # Runtime litellm usage below still benefits from lazy loading via the litellm alias.
13
- import litellm.caching.in_memory_cache as _litellm_cache
14
- import litellm.router as _litellm_router
26
+ import httpx
27
+ import litellm
28
+ from litellm import RetryPolicy
29
+ from litellm.caching.in_memory_cache import InMemoryCache
30
+ from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
31
+ from litellm.router import Router
15
32
  from pydantic import BaseModel, Field
16
33
  from typing_extensions import override
17
34
 
18
- # Use lazy loading for runtime litellm usage (RetryPolicy, utils, etc.)
19
- from data_designer.lazy_heavy_imports import httpx, litellm
20
35
  from data_designer.logging import quiet_noisy_logger
21
36
 
22
- if TYPE_CHECKING:
23
- import httpx
24
- import litellm
25
-
26
37
  DEFAULT_MAX_CALLBACKS = 1000
27
38
 
28
39
 
29
- def _get_logging_callback_manager():
30
- """Lazy accessor for LoggingCallbackManager to avoid loading litellm at import time."""
31
- return litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager
32
-
33
-
34
40
  class LiteLLMRouterDefaultKwargs(BaseModel):
35
41
  ## Number of seconds to wait initially after a connection
36
42
  ## failure.
@@ -46,15 +52,15 @@ class LiteLLMRouterDefaultKwargs(BaseModel):
46
52
 
47
53
  ## Sets the default retry policy, including the number
48
54
  ## of retries to use in particular scenarios.
49
- retry_policy: litellm.RetryPolicy = Field(
50
- default_factory=lambda: litellm.RetryPolicy(
55
+ retry_policy: RetryPolicy = Field(
56
+ default_factory=lambda: RetryPolicy(
51
57
  RateLimitErrorRetries=3,
52
58
  TimeoutErrorRetries=3,
53
59
  )
54
60
  )
55
61
 
56
62
 
57
- class ThreadSafeCache(_litellm_cache.InMemoryCache):
63
+ class ThreadSafeCache(InMemoryCache):
58
64
  def __init__(self, *args, **kwargs):
59
65
  super().__init__(*args, **kwargs)
60
66
 
@@ -89,7 +95,7 @@ class ThreadSafeCache(_litellm_cache.InMemoryCache):
89
95
  super().flush_cache()
90
96
 
91
97
 
92
- class CustomRouter(_litellm_router.Router):
98
+ class CustomRouter(Router):
93
99
  def __init__(
94
100
  self,
95
101
  *args,
@@ -166,7 +172,7 @@ def apply_litellm_patches():
166
172
  litellm.in_memory_llm_clients_cache = ThreadSafeCache()
167
173
 
168
174
  # Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792
169
- _get_logging_callback_manager().MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
175
+ LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS
170
176
 
171
177
  quiet_noisy_logger("httpx")
172
178
  quiet_noisy_logger("LiteLLM")
@@ -4,6 +4,7 @@
4
4
  from __future__ import annotations
5
5
 
6
6
  import ast
7
+ import copy
7
8
  import json
8
9
  import logging
9
10
  import re
@@ -50,6 +51,8 @@ def deserialize_json_values(data: T) -> T: ...
50
51
  def deserialize_json_values(data):
51
52
  """De-serialize JSON strings in various input formats.
52
53
 
54
+ This function creates a deep copy of the input data and does not mutate the original.
55
+
53
56
  Args:
54
57
  data: Input data in one of four formats:
55
58
  - Single string (JSON string to deserialize)
@@ -63,18 +66,22 @@ def deserialize_json_values(data):
63
66
  - List of dictionaries (when input is a list of strings)
64
67
  - Dictionary (when input is a dictionary, with nested JSON strings deserialized)
65
68
  - The original object (if there is no deserialization to perform)
69
+
66
70
  """
71
+ # Create a deep copy to avoid mutating the original data
72
+ data_copy = copy.deepcopy(data)
73
+
67
74
  # Case 1: Single string input
68
- if isinstance(data, str):
75
+ if isinstance(data_copy, str):
69
76
  try:
70
- return json.loads(data)
77
+ return json.loads(data_copy)
71
78
  except json.JSONDecodeError:
72
- return data
79
+ return data_copy
73
80
 
74
81
  # Case 2: List of strings input
75
- elif isinstance(data, list):
82
+ elif isinstance(data_copy, list):
76
83
  result = []
77
- for item in data:
84
+ for item in data_copy:
78
85
  if isinstance(item, str):
79
86
  try:
80
87
  result.append(json.loads(item))
@@ -86,9 +93,9 @@ def deserialize_json_values(data):
86
93
  return result
87
94
 
88
95
  # Case 3: Dictionary input with potential nested JSON strings
89
- elif isinstance(data, dict):
96
+ elif isinstance(data_copy, dict):
90
97
  result = {}
91
- for key, value in data.items():
98
+ for key, value in data_copy.items():
92
99
  if isinstance(value, str):
93
100
  try:
94
101
  result[key] = json.loads(value)
@@ -103,7 +110,7 @@ def deserialize_json_values(data):
103
110
 
104
111
  # Fallback for other data types
105
112
  else:
106
- return data
113
+ return data_copy
107
114
 
108
115
 
109
116
  def parse_list_string(text: str) -> list[str]:
@@ -40,13 +40,16 @@ PII_FIELDS = [
40
40
  "state",
41
41
  "email_address",
42
42
  "phone_number",
43
+ # Brazil-specific fields
44
+ "race",
43
45
  # Japan-specific fields
44
46
  "area",
45
47
  "prefecture",
46
48
  "zone",
49
+ # Brazil and India shared fields
50
+ "religion",
47
51
  # India-specific fields
48
52
  "district",
49
- "religion",
50
53
  "education_degree",
51
54
  "first_language",
52
55
  "second_language",
@@ -78,9 +81,10 @@ PERSONA_FIELDS = [
78
81
  # Japan-specific persona fields
79
82
  "aspects",
80
83
  "digital_skills",
84
+ # Brazil and India shared persona fields
85
+ "religious_persona",
86
+ "religious_background",
81
87
  # India-specific persona fields
82
88
  "linguistic_persona",
83
- "religious_persona",
84
89
  "linguistic_background",
85
- "religious_background",
86
90
  ]
@@ -12,9 +12,9 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder
12
12
  from data_designer.config.data_designer_config import DataDesignerConfig
13
13
  from data_designer.config.default_model_settings import (
14
14
  get_default_model_configs,
15
- get_default_model_providers_missing_api_keys,
16
15
  get_default_provider_name,
17
16
  get_default_providers,
17
+ get_providers_with_missing_api_keys,
18
18
  )
19
19
  from data_designer.config.interface import DataDesignerInterface
20
20
  from data_designer.config.models import (
@@ -28,7 +28,6 @@ from data_designer.config.utils.constants import (
28
28
  MANAGED_ASSETS_PATH,
29
29
  MODEL_CONFIGS_FILE_PATH,
30
30
  MODEL_PROVIDERS_FILE_PATH,
31
- PREDEFINED_PROVIDERS,
32
31
  )
33
32
  from data_designer.config.utils.info import InfoType, InterfaceInfo
34
33
  from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
@@ -317,7 +316,8 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
317
316
 
318
317
  Args:
319
318
  run_config: A RunConfig instance containing runtime settings such as
320
- early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
319
+ early shutdown behavior, batch sizing via `buffer_size`, and non-inference worker
320
+ concurrency via `non_inference_max_parallel_workers`. Import RunConfig from
321
321
  data_designer.essentials.
322
322
 
323
323
  Example:
@@ -334,8 +334,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
334
334
  def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
335
335
  if model_providers is None:
336
336
  model_providers = get_default_providers()
337
- missing_api_keys = get_default_model_providers_missing_api_keys()
338
- if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
337
+ # Check which providers have missing API keys (from YAML file or env vars)
338
+ providers_with_missing_keys = get_providers_with_missing_api_keys(model_providers)
339
+
340
+ if len(providers_with_missing_keys) == len(model_providers):
341
+ # All providers have missing API keys
339
342
  logger.warning(
340
343
  "🚨 You are trying to use a default model provider but your API keys are missing."
341
344
  "\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -1,5 +1,5 @@
1
1
  data_designer/__init__.py,sha256=iLr6FpW41-DFbGexuXCJ6gN1xBMNUZ2jfj9XxySmQhk,502
2
- data_designer/_version.py,sha256=DPG4bRRG7SLMPQyCzFx7koj75iQndn-UI-ENmNbYVCQ,704
2
+ data_designer/_version.py,sha256=iWEYFk8Ve-UDYmpiH5KfwfwhQTYtkxZlaGPwKE2xDy0,704
3
3
  data_designer/errors.py,sha256=r1pBvmvRBAsPmb7oF_veubhkxZ2uPo9cGEDwykLziX4,220
4
4
  data_designer/lazy_heavy_imports.py,sha256=wULSEPQRUOZXvOnb0tdf6wNbRBpaaczYfAjY-pstCBM,1512
5
5
  data_designer/logging.py,sha256=gRi9BOqm95UC1-u4pn6n-G4EySy9HhwKVyKLRO4aqm4,5382
@@ -42,14 +42,14 @@ data_designer/config/config_builder.py,sha256=vuPibkodbJxbCXdaI1tt1Uyo1SVCnAOfLB
42
42
  data_designer/config/data_designer_config.py,sha256=qOojviug05vHR2S4800sjd4OmxhSVi6kB8SAFXLlPog,1891
43
43
  data_designer/config/dataset_builders.py,sha256=jdCujJYFlKAiSkPNX2Qeyrs683GrRcCDv_m8ZZhtg64,368
44
44
  data_designer/config/dataset_metadata.py,sha256=UTlEgnHWgjwPuc7bP95T7gaKmcr7pIhFMy9vvbUwMV4,647
45
- data_designer/config/default_model_settings.py,sha256=d9ZuTDGMtS1rZpIDqoSQjCiD5tcHrUOr22X0-mGQspc,4497
45
+ data_designer/config/default_model_settings.py,sha256=c-llH2otfG0tMCMsxoz3ZcS1nFxIQQPfRedFXAydDbc,4868
46
46
  data_designer/config/errors.py,sha256=JhvUYecfLmP0gZjQzqA3OmfaSs9TRlC5E-ubnV_-3gs,560
47
47
  data_designer/config/exports.py,sha256=lNwteK4djETKXrMKh5PPeHeZvPAZ5RpnJt2otpoaUz0,4756
48
48
  data_designer/config/interface.py,sha256=ikmpm_KwencTpM-yg0auo7XMgcmMSa67S75IqdpFLfk,1676
49
49
  data_designer/config/models.py,sha256=OekrXEVnI9WdHzEVk-8fO0NtxLZtjKVtCL03RY8qwYs,15457
50
50
  data_designer/config/preview_results.py,sha256=WnPlDcHElIHNfjV_P-nLu_Dpul8D3Eyb5qyi3E173Gs,1744
51
51
  data_designer/config/processors.py,sha256=lnyUZA1EhO9NWjjVFFioYxSgeYpoAaM1J7UzwOYkvms,6028
52
- data_designer/config/run_config.py,sha256=5TA1PSmZ3Ca5V0GA1KTds3xrEwGPFZY9C35Vf_1cAs0,2429
52
+ data_designer/config/run_config.py,sha256=oJ163DpHXu9PzST5Hn9px-bIP9DYjIkCO7UGB93J7bI,2663
53
53
  data_designer/config/sampler_constraints.py,sha256=tQI1XLF5bS4TnyKMLo0nArvefnXI8dWCzov38r4qNCQ,1197
54
54
  data_designer/config/sampler_params.py,sha256=Gio-53vjSYOdPhF2CEq4HSWCXCaZMy4WpGPbuFVcWOM,27965
55
55
  data_designer/config/seed.py,sha256=eShSqOcSUzfCEZBnqY-rB0qZpRGxjeOE3fSaJAwacec,4668
@@ -63,7 +63,7 @@ data_designer/config/analysis/dataset_profiler.py,sha256=-5eX55IXivwUBMg2pI-d_3e
63
63
  data_designer/config/analysis/utils/errors.py,sha256=pvmdQ_YuIlWW4NFw-cX_rOoQf-GG8y_FiQzNctB__DQ,331
64
64
  data_designer/config/analysis/utils/reporting.py,sha256=teTzd1OHtpI4vbIinGOGsKXyNldO3F5eqbNdAztF0_s,7066
65
65
  data_designer/config/utils/code_lang.py,sha256=EqMJh1GL5ysUZIoyqx_6vmqenUKHm4J-RQtKXiA4EPg,2354
66
- data_designer/config/utils/constants.py,sha256=KU4ZCIe18gXdBp2N_BgZlRW90FIqjFPYmJtqgVY3Ink,8925
66
+ data_designer/config/utils/constants.py,sha256=eqDQ57b8B0v5qRSO0He45LEjSxtfxlsPtHRvBu1xkw0,8973
67
67
  data_designer/config/utils/errors.py,sha256=HCjer0YrF0bMn5j8gmgWaLb0395LAr_hxMD1ftOsOc8,520
68
68
  data_designer/config/utils/info.py,sha256=yOa4U8kI_CY4OfCKZxCm2okU8klAiThvyjKM5tG-F0A,3469
69
69
  data_designer/config/utils/io_helpers.py,sha256=kzvOR7QgqijkqU-O2enIlpCWwHvzc3oRaEl4Lsjh1Do,8466
@@ -72,7 +72,7 @@ data_designer/config/utils/numerical_helpers.py,sha256=DIubKzc8q2_Bw7xRjyOGwxYul
72
72
  data_designer/config/utils/type_helpers.py,sha256=XyVup24F4Bl7uNze_yUW9oD6EzFbfsJWKhpeMN2901A,4059
73
73
  data_designer/config/utils/visualization.py,sha256=_0Mn-jva0Oz1tVTQH1mnWSARpqZ2kh1JSzJEuikyy9s,18491
74
74
  data_designer/engine/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
75
- data_designer/engine/compiler.py,sha256=4a6ayCQjpULrGU2CXaBMDs-RU0TszT2oEkMK-vn51zk,2757
75
+ data_designer/engine/compiler.py,sha256=4QAeCJjINtH0afSXygdhiKMyq2KIfaDthK3ApZLgrQ0,4152
76
76
  data_designer/engine/configurable_task.py,sha256=6R4FPXPzIeK0lqNVSEXzRDtK14B3dFz38lplr-nkvRE,2539
77
77
  data_designer/engine/errors.py,sha256=YXI7ny83BQ16sOK43CpTm384hJTKuZkPTEAjlHlDIfA,1303
78
78
  data_designer/engine/model_provider.py,sha256=_uU5Bw7yrGlMROjHL4dN1mMTg1eN-LVW5JWcQxovhAA,2823
@@ -92,7 +92,7 @@ data_designer/engine/column_generators/generators/__init__.py,sha256=XLO09Ei8g0l
92
92
  data_designer/engine/column_generators/generators/base.py,sha256=QElk5KsaUQ3EYwlv40NcZgQsw3HIkX3YQV_0S3erl7Q,4209
93
93
  data_designer/engine/column_generators/generators/embedding.py,sha256=uB0jgHlCgctgIUf9ZfMqG1YThbJ0g-GCX3VdNbdDSko,1407
94
94
  data_designer/engine/column_generators/generators/expression.py,sha256=BiQcfVTinvQl3OI9nkdhB9B7FGBueWiHJwxTA8uNVuY,2330
95
- data_designer/engine/column_generators/generators/llm_completion.py,sha256=TGVCV0Sp2AI5KwJ7lG9Co7-zF6gVy-vmVg9eEKmiazE,3873
95
+ data_designer/engine/column_generators/generators/llm_completion.py,sha256=3S3ikNLLLGnutUdcuswL5dUfcLgT_-he8DiRZ9K706U,4721
96
96
  data_designer/engine/column_generators/generators/samplers.py,sha256=gNzURmu9K8Zb5MHamKvZPIxmWlFgl2W4FIVgaFcy4f0,3371
97
97
  data_designer/engine/column_generators/generators/seed_dataset.py,sha256=CoQPbz4Ww7pBLaGw8-CYqIk1sjfkBaoRMKZQexdfgKY,6824
98
98
  data_designer/engine/column_generators/generators/validation.py,sha256=YfYbk-8_ZUye0No6_Q7hIqpZv_tunnEZ6HkLSMFXlDE,6659
@@ -101,11 +101,11 @@ data_designer/engine/column_generators/utils/generator_classification.py,sha256=
101
101
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=gESiqMrQzbbcFpZas0sAAAkrH2DL0Z4Nq5ywBO-pQ6k,2141
102
102
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=LATVAlDYwL7HyM7Nogd6n9XTTk-j9s64o4z0LpKHMhQ,4819
103
103
  data_designer/engine/dataset_builders/artifact_storage.py,sha256=CKpTBtJTde7OQvsFZQa1v1autVz5yUxlBHkIKeATFnE,10999
104
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=_3_JJJG-tA9qLhNiEKbHxl1EHYBbMVAGUtaAdqO_wsc,15736
104
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=lzCSk3dFmdZvKLPAVIRNp9oJQsiilthHRW7mB4dUUB4,15716
105
105
  data_designer/engine/dataset_builders/errors.py,sha256=gLXtPcGSMBG10PzQ85dOXskdA0mKbBQrHa_VtP9sbVY,400
106
106
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=U4Pg0ETCBq5phRhb2zt8IFa4fRx-aTMakomKOBnrs0U,1660
107
107
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
108
- data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Q0ro9UY-3-FFzfi3MZ29nMTSiDZgg1Um6y_HQFztDhk,8338
108
+ data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Lga_xd8i3ZAPqJlKCB4GHG7uxWxws1m-UGAz9UeqU_8,8283
109
109
  data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=NGI6U0vgG88d5YKj7oW_SIJ4-_fhA6VFhPbjqGRHea4,2441
110
110
  data_designer/engine/dataset_builders/utils/dag.py,sha256=RIEI75OtiphkuDl1vfI_MQC1xMiiIg29s-0C_fNZkWQ,2613
111
111
  data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=IfWd_HcfEzIPhgFp2dJaxNIKRlrPsHqYATFXauvCfaw,8133
@@ -114,7 +114,7 @@ data_designer/engine/models/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE
114
114
  data_designer/engine/models/errors.py,sha256=k9oZnmk8DRD8U2SVKJJRLwrcdsCcVoJiOb_Q7ZyEdvg,12271
115
115
  data_designer/engine/models/facade.py,sha256=UBMpw_o2JcsWpJsPdpTPKfFZCh_i0eeG_oaWi1XeKds,12582
116
116
  data_designer/engine/models/factory.py,sha256=2NjI0iiGv8ayQ1c249lsJtha4pDmvmtSjdwvlvitRds,1581
117
- data_designer/engine/models/litellm_overrides.py,sha256=ECreuMCHo2qfLNmW1_53jdE74flbv3pwPa1NIiIQQx4,6145
117
+ data_designer/engine/models/litellm_overrides.py,sha256=e9IZCFQ6BhNWlOTncm8ErL8w4rtE1_4USh2mtUYxCZI,6207
118
118
  data_designer/engine/models/registry.py,sha256=7hZ6TQwwZf259yRZmc3ZI20a4wAo3PCOozPi9Mc5KLo,6827
119
119
  data_designer/engine/models/telemetry.py,sha256=wmuekvPRZjNz7p7ImKx5H_hqDRhTv_dSB-u2S6Ze3uo,12502
120
120
  data_designer/engine/models/usage.py,sha256=A0LV9Ycuj_7snOsaqnirs4mlkAjozv2mzj2om2FpDoU,2410
@@ -127,7 +127,7 @@ data_designer/engine/models/parsers/tag_parsers.py,sha256=HNAIBfXW1Wjdkw4IX-P9sH
127
127
  data_designer/engine/models/parsers/types.py,sha256=wEt80al1FykbMplZVjJ5uXFtacMx-a9GE4_QoqDJ6Us,2631
128
128
  data_designer/engine/models/recipes/base.py,sha256=AQg3Ay_E0hBEVg-sqSNVVZNMJfJ3r1eT14-b9yqymnQ,2630
129
129
  data_designer/engine/models/recipes/response_recipes.py,sha256=UX9m-8RTDj3sXkzEdKpkSj5z7jO-fQhdca3MSByb_Js,10189
130
- data_designer/engine/processing/utils.py,sha256=iu7JJ4foI3Gfd29ppIBGn9c0syO64PTyvW9CiaLVAHE,5201
130
+ data_designer/engine/processing/utils.py,sha256=g82KsdDR20g_isadpmgHnneQSX0W21aCVhkp5TIWEhw,5443
131
131
  data_designer/engine/processing/ginja/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
132
132
  data_designer/engine/processing/ginja/ast.py,sha256=w62yt434RDnJYrcfofIDThGv0C5H9XJE3VHOnxEzJVM,1964
133
133
  data_designer/engine/processing/ginja/environment.py,sha256=wJRbzPuUCQGvCi4zS4g8sYzihgu_6fn-tE_nYSL1AoU,18974
@@ -164,7 +164,7 @@ data_designer/engine/sampling_gen/data_sources/base.py,sha256=zUG5XTplD5pgHh4ytC
164
164
  data_designer/engine/sampling_gen/data_sources/errors.py,sha256=_9rbwUpaz0Pd2Ods4AVDQ7Uq4JvPyfHhTp51BdtJDto,367
165
165
  data_designer/engine/sampling_gen/data_sources/sources.py,sha256=53KVPp7REjNKA0rajGmT_tBkxwQqwrcIKhcijBGcfcs,13647
166
166
  data_designer/engine/sampling_gen/entities/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
167
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=0_eUTtrWFGxTfTfqlz9ig9bJEtYeckb50J7w5LhYTr8,1883
167
+ data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=r8qXWe8EquJognihPNGzma4fFuSQAAZHlkYVsGmcX2w,2006
168
168
  data_designer/engine/sampling_gen/entities/email_address_utils.py,sha256=THfD7muq5tMHkRWOATN-N3iSFgkKjT4e8hKquDFMTlU,5272
169
169
  data_designer/engine/sampling_gen/entities/errors.py,sha256=SbtwwG6JgoY4k6pq2-y-lD60nX_pqjf5QftmwgXt0us,352
170
170
  data_designer/engine/sampling_gen/entities/national_id_utils.py,sha256=XUFB6RhfLGFQUNyy0B6BSgtrG9NdEnIjfSALBwJplho,2652
@@ -179,7 +179,7 @@ data_designer/engine/validators/remote.py,sha256=rythhIrH2GvqncMQeF3FiJa9Om0KZWe
179
179
  data_designer/engine/validators/sql.py,sha256=AMaEdA-gj9j0zwVp809x3ycKltd51wVEhI8mMYGyxd4,2408
180
180
  data_designer/essentials/__init__.py,sha256=dIGYH9s0_VQJ1lG8S-ElZiISz59LHo9v7Y5upizcA1M,1135
181
181
  data_designer/interface/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
182
- data_designer/interface/data_designer.py,sha256=aX3Etg1qGpjivQQdplQ0Oi3aB7_6jzClk_4yrpcE5fY,17322
182
+ data_designer/interface/data_designer.py,sha256=nqsl2Et1wSF3TQzYx3gC1drJzrKlm4FtRE7QJYilbys,17544
183
183
  data_designer/interface/errors.py,sha256=Ft9GMeIrOHJv_PC_1rU6hWcNyq1GHdsFYZSc9HnUrxU,606
184
184
  data_designer/interface/results.py,sha256=3fGwlhif4ufqUGh-EgsGccrob4S6a7WZ6BgFiszTo_A,3871
185
185
  data_designer/plugins/__init__.py,sha256=qe1alcTEtnMSMdzknjb57vvjqKgFE5cEHXxBj8tPWMI,275
@@ -189,8 +189,8 @@ data_designer/plugins/registry.py,sha256=Cnt33Q25o9bS2v2YDbV3QPM57VNrtIBKAb4ERQR
189
189
  data_designer/plugins/testing/__init__.py,sha256=yyxrrH_i3q0Xb56QO9Ma35WtHlQ5PJF1b2pQoKa16xU,296
190
190
  data_designer/plugins/testing/stubs.py,sha256=9tUF209ayZR6f0Q1LsRDW4kEOTgPoIxV8jlq4QoWuW0,3498
191
191
  data_designer/plugins/testing/utils.py,sha256=a9LEgK827cnIzHEkgXOdgywrKDLBE36cyttrpG1ctT4,973
192
- data_designer-0.3.6.dist-info/METADATA,sha256=DRhL6EhZHVG81mZy-G6czvlm_r9F8sCiu0hIxAyU-q4,8119
193
- data_designer-0.3.6.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
194
- data_designer-0.3.6.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
195
- data_designer-0.3.6.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
196
- data_designer-0.3.6.dist-info/RECORD,,
192
+ data_designer-0.3.8.dist-info/METADATA,sha256=n9jXs34c2_rOL-Tme5Y6xrb4fMo0-GNNEVHlCl5WfLY,8119
193
+ data_designer-0.3.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
194
+ data_designer-0.3.8.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
195
+ data_designer-0.3.8.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
196
+ data_designer-0.3.8.dist-info/RECORD,,