data-designer 0.3.8__py3-none-any.whl → 0.3.8rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.3.8'
32
- __version_tuple__ = version_tuple = (0, 3, 8)
31
+ __version__ = version = '0.3.8rc1'
32
+ __version_tuple__ = version_tuple = (0, 3, 8, 'rc1')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -71,20 +71,12 @@ def get_default_model_configs() -> list[ModelConfig]:
71
71
  return []
72
72
 
73
73
 
74
- def get_providers_with_missing_api_keys(providers: list[ModelProvider]) -> list[ModelProvider]:
75
- providers_with_missing_keys = []
76
-
77
- for provider in providers:
78
- if provider.api_key is None:
79
- # No API key specified at all
80
- providers_with_missing_keys.append(provider)
81
- elif provider.api_key.isupper() and "_" in provider.api_key:
82
- # Looks like an environment variable name, check if it's set
83
- if os.environ.get(provider.api_key) is None:
84
- providers_with_missing_keys.append(provider)
85
- # else: It's an actual API key value (not an env var), so it's valid
86
-
87
- return providers_with_missing_keys
74
+ def get_default_model_providers_missing_api_keys() -> list[str]:
75
+ missing_api_keys = []
76
+ for predefined_provider in PREDEFINED_PROVIDERS:
77
+ if os.environ.get(predefined_provider["api_key"]) is None:
78
+ missing_api_keys.append(predefined_provider["api_key"])
79
+ return missing_api_keys
88
80
 
89
81
 
90
82
  def get_default_providers() -> list[ModelProvider]:
@@ -26,8 +26,6 @@ class RunConfig(ConfigBase):
26
26
  buffer_size: Number of records to process in each batch during dataset generation.
27
27
  A batch is processed end-to-end (column generation, post-batch processors, and writing the batch
28
28
  to artifact storage) before moving on to the next batch. Must be > 0. Default is 1000.
29
- non_inference_max_parallel_workers: Maximum number of worker threads used for non-inference
30
- cell-by-cell generators. Must be >= 1. Default is 4.
31
29
  max_conversation_restarts: Maximum number of full conversation restarts permitted when
32
30
  generation tasks call `ModelFacade.generate(...)`. Must be >= 0. Default is 5.
33
31
  max_conversation_correction_steps: Maximum number of correction rounds permitted within a
@@ -39,7 +37,6 @@ class RunConfig(ConfigBase):
39
37
  shutdown_error_rate: float = Field(default=0.5, ge=0.0, le=1.0)
40
38
  shutdown_error_window: int = Field(default=10, ge=0)
41
39
  buffer_size: int = Field(default=1000, gt=0)
42
- non_inference_max_parallel_workers: int = Field(default=4, ge=1)
43
40
  max_conversation_restarts: int = Field(default=5, ge=0)
44
41
  max_conversation_correction_steps: int = Field(default=0, ge=0)
45
42
 
@@ -353,11 +353,9 @@ PREDEFINED_PROVIDERS_MODEL_MAP = {
353
353
  NEMOTRON_PERSONAS_DATASET_SIZES = {
354
354
  "en_US": "1.24 GB",
355
355
  "en_IN": "2.39 GB",
356
- "en_SG": "0.30 GB",
357
356
  "hi_Deva_IN": "4.14 GB",
358
357
  "hi_Latn_IN": "2.7 GB",
359
358
  "ja_JP": "1.69 GB",
360
- "pt_BR": "2.33 GB",
361
359
  }
362
360
 
363
361
  LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys())
@@ -31,7 +31,10 @@ from data_designer.engine.compiler import compile_data_designer_config
31
31
  from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage
32
32
  from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError
33
33
  from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig
34
- from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor
34
+ from data_designer.engine.dataset_builders.utils.concurrency import (
35
+ MAX_CONCURRENCY_PER_NON_LLM_GENERATOR,
36
+ ConcurrentThreadExecutor,
37
+ )
35
38
  from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs
36
39
  from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
37
40
  from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler
@@ -199,7 +202,7 @@ class ColumnWiseDatasetBuilder:
199
202
  self.batch_manager.add_records(df.to_dict(orient="records"))
200
203
 
201
204
  def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None:
202
- max_workers = self._resource_provider.run_config.non_inference_max_parallel_workers
205
+ max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR
203
206
  if isinstance(generator, ColumnGeneratorWithModel):
204
207
  max_workers = generator.inference_parameters.max_parallel_requests
205
208
  self._fan_out_with_threads(generator, max_workers=max_workers)
@@ -16,6 +16,9 @@ from data_designer.engine.errors import DataDesignerRuntimeError, ErrorTrap
16
16
 
17
17
  logger = logging.getLogger(__name__)
18
18
 
19
+ # Constants
20
+ MAX_CONCURRENCY_PER_NON_LLM_GENERATOR = 4
21
+
19
22
 
20
23
  class ExecutorResults(BaseModel):
21
24
  failure_threshold: float = 0.0 # Error rate threshold
@@ -40,16 +40,13 @@ PII_FIELDS = [
40
40
  "state",
41
41
  "email_address",
42
42
  "phone_number",
43
- # Brazil-specific fields
44
- "race",
45
43
  # Japan-specific fields
46
44
  "area",
47
45
  "prefecture",
48
46
  "zone",
49
- # Brazil and India shared fields
50
- "religion",
51
47
  # India-specific fields
52
48
  "district",
49
+ "religion",
53
50
  "education_degree",
54
51
  "first_language",
55
52
  "second_language",
@@ -81,10 +78,9 @@ PERSONA_FIELDS = [
81
78
  # Japan-specific persona fields
82
79
  "aspects",
83
80
  "digital_skills",
84
- # Brazil and India shared persona fields
85
- "religious_persona",
86
- "religious_background",
87
81
  # India-specific persona fields
88
82
  "linguistic_persona",
83
+ "religious_persona",
89
84
  "linguistic_background",
85
+ "religious_background",
90
86
  ]
@@ -12,9 +12,9 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder
12
12
  from data_designer.config.data_designer_config import DataDesignerConfig
13
13
  from data_designer.config.default_model_settings import (
14
14
  get_default_model_configs,
15
+ get_default_model_providers_missing_api_keys,
15
16
  get_default_provider_name,
16
17
  get_default_providers,
17
- get_providers_with_missing_api_keys,
18
18
  )
19
19
  from data_designer.config.interface import DataDesignerInterface
20
20
  from data_designer.config.models import (
@@ -28,6 +28,7 @@ from data_designer.config.utils.constants import (
28
28
  MANAGED_ASSETS_PATH,
29
29
  MODEL_CONFIGS_FILE_PATH,
30
30
  MODEL_PROVIDERS_FILE_PATH,
31
+ PREDEFINED_PROVIDERS,
31
32
  )
32
33
  from data_designer.config.utils.info import InfoType, InterfaceInfo
33
34
  from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig
@@ -316,8 +317,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
316
317
 
317
318
  Args:
318
319
  run_config: A RunConfig instance containing runtime settings such as
319
- early shutdown behavior, batch sizing via `buffer_size`, and non-inference worker
320
- concurrency via `non_inference_max_parallel_workers`. Import RunConfig from
320
+ early shutdown behavior and batch sizing via `buffer_size`. Import RunConfig from
321
321
  data_designer.essentials.
322
322
 
323
323
  Example:
@@ -334,11 +334,8 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
334
334
  def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
335
335
  if model_providers is None:
336
336
  model_providers = get_default_providers()
337
- # Check which providers have missing API keys (from YAML file or env vars)
338
- providers_with_missing_keys = get_providers_with_missing_api_keys(model_providers)
339
-
340
- if len(providers_with_missing_keys) == len(model_providers):
341
- # All providers have missing API keys
337
+ missing_api_keys = get_default_model_providers_missing_api_keys()
338
+ if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
342
339
  logger.warning(
343
340
  "🚨 You are trying to use a default model provider but your API keys are missing."
344
341
  "\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.3.8
3
+ Version: 0.3.8rc1
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -1,5 +1,5 @@
1
1
  data_designer/__init__.py,sha256=iLr6FpW41-DFbGexuXCJ6gN1xBMNUZ2jfj9XxySmQhk,502
2
- data_designer/_version.py,sha256=iWEYFk8Ve-UDYmpiH5KfwfwhQTYtkxZlaGPwKE2xDy0,704
2
+ data_designer/_version.py,sha256=NQlgQnitduzFb4wWkqWxIRTi5E5gw_xAEogbRRBMSzA,714
3
3
  data_designer/errors.py,sha256=r1pBvmvRBAsPmb7oF_veubhkxZ2uPo9cGEDwykLziX4,220
4
4
  data_designer/lazy_heavy_imports.py,sha256=wULSEPQRUOZXvOnb0tdf6wNbRBpaaczYfAjY-pstCBM,1512
5
5
  data_designer/logging.py,sha256=gRi9BOqm95UC1-u4pn6n-G4EySy9HhwKVyKLRO4aqm4,5382
@@ -42,14 +42,14 @@ data_designer/config/config_builder.py,sha256=vuPibkodbJxbCXdaI1tt1Uyo1SVCnAOfLB
42
42
  data_designer/config/data_designer_config.py,sha256=qOojviug05vHR2S4800sjd4OmxhSVi6kB8SAFXLlPog,1891
43
43
  data_designer/config/dataset_builders.py,sha256=jdCujJYFlKAiSkPNX2Qeyrs683GrRcCDv_m8ZZhtg64,368
44
44
  data_designer/config/dataset_metadata.py,sha256=UTlEgnHWgjwPuc7bP95T7gaKmcr7pIhFMy9vvbUwMV4,647
45
- data_designer/config/default_model_settings.py,sha256=c-llH2otfG0tMCMsxoz3ZcS1nFxIQQPfRedFXAydDbc,4868
45
+ data_designer/config/default_model_settings.py,sha256=d9ZuTDGMtS1rZpIDqoSQjCiD5tcHrUOr22X0-mGQspc,4497
46
46
  data_designer/config/errors.py,sha256=JhvUYecfLmP0gZjQzqA3OmfaSs9TRlC5E-ubnV_-3gs,560
47
47
  data_designer/config/exports.py,sha256=lNwteK4djETKXrMKh5PPeHeZvPAZ5RpnJt2otpoaUz0,4756
48
48
  data_designer/config/interface.py,sha256=ikmpm_KwencTpM-yg0auo7XMgcmMSa67S75IqdpFLfk,1676
49
49
  data_designer/config/models.py,sha256=OekrXEVnI9WdHzEVk-8fO0NtxLZtjKVtCL03RY8qwYs,15457
50
50
  data_designer/config/preview_results.py,sha256=WnPlDcHElIHNfjV_P-nLu_Dpul8D3Eyb5qyi3E173Gs,1744
51
51
  data_designer/config/processors.py,sha256=lnyUZA1EhO9NWjjVFFioYxSgeYpoAaM1J7UzwOYkvms,6028
52
- data_designer/config/run_config.py,sha256=oJ163DpHXu9PzST5Hn9px-bIP9DYjIkCO7UGB93J7bI,2663
52
+ data_designer/config/run_config.py,sha256=5TA1PSmZ3Ca5V0GA1KTds3xrEwGPFZY9C35Vf_1cAs0,2429
53
53
  data_designer/config/sampler_constraints.py,sha256=tQI1XLF5bS4TnyKMLo0nArvefnXI8dWCzov38r4qNCQ,1197
54
54
  data_designer/config/sampler_params.py,sha256=Gio-53vjSYOdPhF2CEq4HSWCXCaZMy4WpGPbuFVcWOM,27965
55
55
  data_designer/config/seed.py,sha256=eShSqOcSUzfCEZBnqY-rB0qZpRGxjeOE3fSaJAwacec,4668
@@ -63,7 +63,7 @@ data_designer/config/analysis/dataset_profiler.py,sha256=-5eX55IXivwUBMg2pI-d_3e
63
63
  data_designer/config/analysis/utils/errors.py,sha256=pvmdQ_YuIlWW4NFw-cX_rOoQf-GG8y_FiQzNctB__DQ,331
64
64
  data_designer/config/analysis/utils/reporting.py,sha256=teTzd1OHtpI4vbIinGOGsKXyNldO3F5eqbNdAztF0_s,7066
65
65
  data_designer/config/utils/code_lang.py,sha256=EqMJh1GL5ysUZIoyqx_6vmqenUKHm4J-RQtKXiA4EPg,2354
66
- data_designer/config/utils/constants.py,sha256=eqDQ57b8B0v5qRSO0He45LEjSxtfxlsPtHRvBu1xkw0,8973
66
+ data_designer/config/utils/constants.py,sha256=KU4ZCIe18gXdBp2N_BgZlRW90FIqjFPYmJtqgVY3Ink,8925
67
67
  data_designer/config/utils/errors.py,sha256=HCjer0YrF0bMn5j8gmgWaLb0395LAr_hxMD1ftOsOc8,520
68
68
  data_designer/config/utils/info.py,sha256=yOa4U8kI_CY4OfCKZxCm2okU8klAiThvyjKM5tG-F0A,3469
69
69
  data_designer/config/utils/io_helpers.py,sha256=kzvOR7QgqijkqU-O2enIlpCWwHvzc3oRaEl4Lsjh1Do,8466
@@ -101,11 +101,11 @@ data_designer/engine/column_generators/utils/generator_classification.py,sha256=
101
101
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=gESiqMrQzbbcFpZas0sAAAkrH2DL0Z4Nq5ywBO-pQ6k,2141
102
102
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=LATVAlDYwL7HyM7Nogd6n9XTTk-j9s64o4z0LpKHMhQ,4819
103
103
  data_designer/engine/dataset_builders/artifact_storage.py,sha256=CKpTBtJTde7OQvsFZQa1v1autVz5yUxlBHkIKeATFnE,10999
104
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=lzCSk3dFmdZvKLPAVIRNp9oJQsiilthHRW7mB4dUUB4,15716
104
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=_3_JJJG-tA9qLhNiEKbHxl1EHYBbMVAGUtaAdqO_wsc,15736
105
105
  data_designer/engine/dataset_builders/errors.py,sha256=gLXtPcGSMBG10PzQ85dOXskdA0mKbBQrHa_VtP9sbVY,400
106
106
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=U4Pg0ETCBq5phRhb2zt8IFa4fRx-aTMakomKOBnrs0U,1660
107
107
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
108
- data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Lga_xd8i3ZAPqJlKCB4GHG7uxWxws1m-UGAz9UeqU_8,8283
108
+ data_designer/engine/dataset_builders/utils/concurrency.py,sha256=Q0ro9UY-3-FFzfi3MZ29nMTSiDZgg1Um6y_HQFztDhk,8338
109
109
  data_designer/engine/dataset_builders/utils/config_compiler.py,sha256=NGI6U0vgG88d5YKj7oW_SIJ4-_fhA6VFhPbjqGRHea4,2441
110
110
  data_designer/engine/dataset_builders/utils/dag.py,sha256=RIEI75OtiphkuDl1vfI_MQC1xMiiIg29s-0C_fNZkWQ,2613
111
111
  data_designer/engine/dataset_builders/utils/dataset_batch_manager.py,sha256=IfWd_HcfEzIPhgFp2dJaxNIKRlrPsHqYATFXauvCfaw,8133
@@ -164,7 +164,7 @@ data_designer/engine/sampling_gen/data_sources/base.py,sha256=zUG5XTplD5pgHh4ytC
164
164
  data_designer/engine/sampling_gen/data_sources/errors.py,sha256=_9rbwUpaz0Pd2Ods4AVDQ7Uq4JvPyfHhTp51BdtJDto,367
165
165
  data_designer/engine/sampling_gen/data_sources/sources.py,sha256=53KVPp7REjNKA0rajGmT_tBkxwQqwrcIKhcijBGcfcs,13647
166
166
  data_designer/engine/sampling_gen/entities/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
167
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=r8qXWe8EquJognihPNGzma4fFuSQAAZHlkYVsGmcX2w,2006
167
+ data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=0_eUTtrWFGxTfTfqlz9ig9bJEtYeckb50J7w5LhYTr8,1883
168
168
  data_designer/engine/sampling_gen/entities/email_address_utils.py,sha256=THfD7muq5tMHkRWOATN-N3iSFgkKjT4e8hKquDFMTlU,5272
169
169
  data_designer/engine/sampling_gen/entities/errors.py,sha256=SbtwwG6JgoY4k6pq2-y-lD60nX_pqjf5QftmwgXt0us,352
170
170
  data_designer/engine/sampling_gen/entities/national_id_utils.py,sha256=XUFB6RhfLGFQUNyy0B6BSgtrG9NdEnIjfSALBwJplho,2652
@@ -179,7 +179,7 @@ data_designer/engine/validators/remote.py,sha256=rythhIrH2GvqncMQeF3FiJa9Om0KZWe
179
179
  data_designer/engine/validators/sql.py,sha256=AMaEdA-gj9j0zwVp809x3ycKltd51wVEhI8mMYGyxd4,2408
180
180
  data_designer/essentials/__init__.py,sha256=dIGYH9s0_VQJ1lG8S-ElZiISz59LHo9v7Y5upizcA1M,1135
181
181
  data_designer/interface/__init__.py,sha256=XLO09Ei8g0lU7hYlzKCvhvQhLFBe5CBwE4v2PqK9xWY,142
182
- data_designer/interface/data_designer.py,sha256=nqsl2Et1wSF3TQzYx3gC1drJzrKlm4FtRE7QJYilbys,17544
182
+ data_designer/interface/data_designer.py,sha256=aX3Etg1qGpjivQQdplQ0Oi3aB7_6jzClk_4yrpcE5fY,17322
183
183
  data_designer/interface/errors.py,sha256=Ft9GMeIrOHJv_PC_1rU6hWcNyq1GHdsFYZSc9HnUrxU,606
184
184
  data_designer/interface/results.py,sha256=3fGwlhif4ufqUGh-EgsGccrob4S6a7WZ6BgFiszTo_A,3871
185
185
  data_designer/plugins/__init__.py,sha256=qe1alcTEtnMSMdzknjb57vvjqKgFE5cEHXxBj8tPWMI,275
@@ -189,8 +189,8 @@ data_designer/plugins/registry.py,sha256=Cnt33Q25o9bS2v2YDbV3QPM57VNrtIBKAb4ERQR
189
189
  data_designer/plugins/testing/__init__.py,sha256=yyxrrH_i3q0Xb56QO9Ma35WtHlQ5PJF1b2pQoKa16xU,296
190
190
  data_designer/plugins/testing/stubs.py,sha256=9tUF209ayZR6f0Q1LsRDW4kEOTgPoIxV8jlq4QoWuW0,3498
191
191
  data_designer/plugins/testing/utils.py,sha256=a9LEgK827cnIzHEkgXOdgywrKDLBE36cyttrpG1ctT4,973
192
- data_designer-0.3.8.dist-info/METADATA,sha256=n9jXs34c2_rOL-Tme5Y6xrb4fMo0-GNNEVHlCl5WfLY,8119
193
- data_designer-0.3.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
194
- data_designer-0.3.8.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
195
- data_designer-0.3.8.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
196
- data_designer-0.3.8.dist-info/RECORD,,
192
+ data_designer-0.3.8rc1.dist-info/METADATA,sha256=YrRgO4uKxDznFADcGj2TqIucRNvbYIGk5_4R9Pqq2Qc,8122
193
+ data_designer-0.3.8rc1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
194
+ data_designer-0.3.8rc1.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
195
+ data_designer-0.3.8rc1.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
196
+ data_designer-0.3.8rc1.dist-info/RECORD,,