data-designer 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.0'
32
- __version_tuple__ = version_tuple = (0, 1, 0)
31
+ __version__ = version = '0.1.1'
32
+ __version_tuple__ = version_tuple = (0, 1, 1)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -124,7 +124,7 @@ def _fetch_seed_dataset_column_names_from_datastore(
124
124
  raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
125
125
 
126
126
  datastore_settings = resolve_datastore_settings(datastore_settings)
127
- fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token)
127
+ fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
128
128
 
129
129
  with fs.open(f"datasets/{repo_id}/{filename}") as f:
130
130
  return get_file_column_names(f, file_type)
@@ -4,6 +4,7 @@
4
4
 
5
5
  from functools import lru_cache
6
6
  import logging
7
+ import os
7
8
  from pathlib import Path
8
9
  from typing import Any, Literal, Optional
9
10
 
@@ -15,7 +16,6 @@ from .utils.constants import (
15
16
  PREDEFINED_PROVIDERS,
16
17
  PREDEFINED_PROVIDERS_MODEL_MAP,
17
18
  )
18
- from .utils.info import ConfigBuilderInfo, InfoType, InterfaceInfo
19
19
  from .utils.io_helpers import load_config_file, save_config_file
20
20
 
21
21
  logger = logging.getLogger(__name__)
@@ -75,7 +75,15 @@ def get_default_model_configs() -> list[ModelConfig]:
75
75
  config_dict = load_config_file(MODEL_CONFIGS_FILE_PATH)
76
76
  if "model_configs" in config_dict:
77
77
  return [ModelConfig.model_validate(mc) for mc in config_dict["model_configs"]]
78
- raise FileNotFoundError(f"Default model configs file not found at {str(MODEL_CONFIGS_FILE_PATH)!r}")
78
+ return []
79
+
80
+
81
+ def get_defaul_model_providers_missing_api_keys() -> list[str]:
82
+ missing_api_keys = []
83
+ for predefined_provider in PREDEFINED_PROVIDERS:
84
+ if os.environ.get(predefined_provider["api_key"]) is None:
85
+ missing_api_keys.append(predefined_provider["api_key"])
86
+ return missing_api_keys
79
87
 
80
88
 
81
89
  def get_default_providers() -> list[ModelProvider]:
@@ -91,21 +99,17 @@ def get_default_provider_name() -> Optional[str]:
91
99
 
92
100
  def resolve_seed_default_model_settings() -> None:
93
101
  if not MODEL_CONFIGS_FILE_PATH.exists():
94
- logger.info(
102
+ logger.debug(
95
103
  f"🍾 Default model configs were not found, so writing the following to {str(MODEL_CONFIGS_FILE_PATH)!r}"
96
104
  )
97
- config_builder_info = ConfigBuilderInfo(model_configs=get_builtin_model_configs())
98
- config_builder_info.display(info_type=InfoType.MODEL_CONFIGS)
99
105
  save_config_file(
100
106
  MODEL_CONFIGS_FILE_PATH, {"model_configs": [mc.model_dump() for mc in get_builtin_model_configs()]}
101
107
  )
102
108
 
103
109
  if not MODEL_PROVIDERS_FILE_PATH.exists():
104
- logger.info(
110
+ logger.debug(
105
111
  f"🪄 Default model providers were not found, so writing the following to {str(MODEL_PROVIDERS_FILE_PATH)!r}"
106
112
  )
107
- interface_info = InterfaceInfo(model_providers=get_builtin_model_providers())
108
- interface_info.display(info_type=InfoType.MODEL_PROVIDERS)
109
113
  save_config_file(
110
114
  MODEL_PROVIDERS_FILE_PATH, {"providers": [p.model_dump() for p in get_builtin_model_providers()]}
111
115
  )
@@ -42,11 +42,29 @@ class HfHubSeedDatasetDataStore(SeedDatasetDataStore):
42
42
 
43
43
  def __init__(self, endpoint: str, token: str | None):
44
44
  self.hfapi = HfApi(endpoint=endpoint, token=token)
45
- self.hffs = HfFileSystem(endpoint=endpoint, token=token)
45
+ self.endpoint = endpoint
46
+ self.token = token
46
47
 
47
48
  def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
49
+ """Create a DuckDB connection with a fresh HfFileSystem registered.
50
+
51
+ Creates a new HfFileSystem instance for each connection to ensure file metadata
52
+ is fetched fresh from the datastore, avoiding cache-related issues when reading
53
+ recently updated parquet files.
54
+
55
+ Returns:
56
+ A DuckDB connection with the HfFileSystem registered for hf:// URI support.
57
+ """
58
+ # Use skip_instance_cache to avoid fsspec-level caching
59
+ hffs = HfFileSystem(endpoint=self.endpoint, token=self.token, skip_instance_cache=True)
60
+
61
+ # Clear all internal caches to avoid stale metadata issues
62
+ # HfFileSystem caches file metadata (size, etc.) which can become stale when files are re-uploaded
63
+ if hasattr(hffs, "dircache"):
64
+ hffs.dircache.clear()
65
+
48
66
  conn = duckdb.connect()
49
- conn.register_filesystem(self.hffs)
67
+ conn.register_filesystem(hffs)
50
68
  return conn
51
69
 
52
70
  def get_dataset_uri(self, file_id: str) -> str:
@@ -9,6 +9,7 @@ import pandas as pd
9
9
  from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
10
10
  from data_designer.config.config_builder import DataDesignerConfigBuilder
11
11
  from data_designer.config.default_model_settings import (
12
+ get_defaul_model_providers_missing_api_keys,
12
13
  get_default_model_configs,
13
14
  get_default_provider_name,
14
15
  get_default_providers,
@@ -26,8 +27,9 @@ from data_designer.config.utils.constants import (
26
27
  MANAGED_ASSETS_PATH,
27
28
  MODEL_CONFIGS_FILE_PATH,
28
29
  MODEL_PROVIDERS_FILE_PATH,
30
+ PREDEFINED_PROVIDERS,
29
31
  )
30
- from data_designer.config.utils.info import InterfaceInfo
32
+ from data_designer.config.utils.info import InfoType, InterfaceInfo
31
33
  from data_designer.config.utils.io_helpers import write_seed_dataset
32
34
  from data_designer.config.utils.misc import can_run_data_designer_locally
33
35
  from data_designer.engine.analysis.dataset_profiler import (
@@ -103,7 +105,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
103
105
  self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
104
106
  self._buffer_size = DEFAULT_BUFFER_SIZE
105
107
  self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
106
- self._model_providers = model_providers or self.get_default_model_providers()
108
+ self._model_providers = self._resolve_model_providers(model_providers)
107
109
  self._model_provider_registry = resolve_model_provider_registry(
108
110
  self._model_providers, get_default_provider_name()
109
111
  )
@@ -151,7 +153,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
151
153
  Returns:
152
154
  InterfaceInfo object with information about the Data Designer interface.
153
155
  """
154
- return InterfaceInfo(model_providers=self._model_providers)
156
+ return self._get_interface_info(self._model_providers)
155
157
 
156
158
  def create(
157
159
  self,
@@ -307,6 +309,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
307
309
  raise InvalidBufferValueError("Buffer size must be greater than 0.")
308
310
  self._buffer_size = buffer_size
309
311
 
312
+ def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
313
+ if model_providers is None:
314
+ if can_run_data_designer_locally():
315
+ model_providers = get_default_providers()
316
+ missing_api_keys = get_defaul_model_providers_missing_api_keys()
317
+ if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
318
+ logger.warning(
319
+ "🚨 You are trying to use a default model provider but your API keys are missing."
320
+ "\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
321
+ "\n\t\t\tAlternatively, you can provide your own model providers during Data Designer object initialization."
322
+ "\n\t\t\tSee https://nvidia-nemo.github.io/DataDesigner/models/model-providers/ for more information."
323
+ )
324
+ self._get_interface_info(model_providers).display(InfoType.MODEL_PROVIDERS)
325
+ return model_providers
326
+ return model_providers or []
327
+
310
328
  def _create_dataset_builder(
311
329
  self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
312
330
  ) -> ColumnWiseDatasetBuilder:
@@ -349,3 +367,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
349
367
  )
350
368
  ),
351
369
  )
370
+
371
+ def _get_interface_info(self, model_providers: list[ModelProvider]) -> InterfaceInfo:
372
+ return InterfaceInfo(model_providers=model_providers)
@@ -1,18 +1,18 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: General framework for synthetic data generation
5
+ License-Expression: Apache-2.0
5
6
  License-File: LICENSE
6
7
  Classifier: Development Status :: 4 - Beta
7
8
  Classifier: Intended Audience :: Developers
8
9
  Classifier: Intended Audience :: Science/Research
9
- Classifier: License :: Other/Proprietary License
10
+ Classifier: License :: OSI Approved :: Apache Software License
10
11
  Classifier: Programming Language :: Python :: 3.10
11
12
  Classifier: Programming Language :: Python :: 3.11
12
13
  Classifier: Programming Language :: Python :: 3.12
13
14
  Classifier: Programming Language :: Python :: 3.13
14
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
- Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces
16
16
  Classifier: Topic :: Software Development
17
17
  Requires-Python: >=3.10
18
18
  Requires-Dist: anyascii<1.0,>=0.3.3
@@ -51,7 +51,7 @@ Description-Content-Type: text/markdown
51
51
 
52
52
  [![CI](https://github.com/NVIDIA-NeMo/DataDesigner/actions/workflows/ci.yml/badge.svg)](https://github.com/NVIDIA-NeMo/DataDesigner/actions/workflows/ci.yml)
53
53
  [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
54
- [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![NeMo Microservices](https://img.shields.io/badge/NeMo-Microservices-76b900)](https://docs.nvidia.com/nemo/microservices/latest/index.html)
54
+ [![Python 3.10 - 3.13](https://img.shields.io/badge/🐍_Python-3.10_|_3.11_|_3.12_|_3.13-blue.svg)](https://www.python.org/downloads/) [![NeMo Microservices](https://img.shields.io/badge/NeMo-Microservices-76b900)](https://docs.nvidia.com/nemo/microservices/latest/index.html) [![Code](https://img.shields.io/badge/Code-Documentation-8A2BE2.svg)](https://nvidia-nemo.github.io/DataDesigner/)
55
55
 
56
56
  **Generate high-quality synthetic datasets from scratch or using your own seed data.**
57
57
 
@@ -147,10 +147,12 @@ preview.display_sample_record()
147
147
 
148
148
  ### 📚 Learn more
149
149
 
150
- - **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner)** – Detailed walkthrough with more examples
151
- - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/1-the-basics/)** – Step-by-step interactive tutorials
150
+ - **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
151
+ - **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/intro/)** – Step-by-step interactive tutorials
152
152
  - **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
153
+ - **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
153
154
  - **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
155
+ - **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
154
156
 
155
157
  ### 🔧 Configure models via CLI
156
158
 
@@ -162,12 +164,27 @@ data-designer config list # View current settings
162
164
 
163
165
  ### 🤝 Get involved
164
166
 
165
- - **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING.md)** – Help improve Data Designer
166
- - **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or request features
167
- - **[GitHub Discussions](https://github.com/NVIDIA-NeMo/DataDesigner/discussions)** – Ask questions and share ideas
167
+ - **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING)** – Help improve Data Designer
168
+ - **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or make a feature request
168
169
 
169
170
  ---
170
171
 
171
172
  ## License
172
173
 
173
174
  Apache License 2.0 – see [LICENSE](LICENSE) for details.
175
+
176
+ ---
177
+
178
+ ## Citation
179
+
180
+ If you use NeMo Data Designer in your research, please cite it using the following BibTeX entry:
181
+
182
+ ```bibtex
183
+ @misc{nemo-data-designer,
184
+ author = {The NeMo Data Designer Team},
185
+ title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data},
186
+ howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}},
187
+ year = {2025},
188
+ note = {GitHub Repository},
189
+ }
190
+ ```
@@ -1,5 +1,5 @@
1
1
  data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
2
- data_designer/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
2
+ data_designer/_version.py,sha256=m8HxkqoKGw_wAJtc4ZokpJKNLXqp4zwnNhbnfDtro7w,704
3
3
  data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
4
4
  data_designer/logging.py,sha256=O6LlQRj4IdkvEEYiMkKfMb_ZDgN1YpkGQUCqcp7nY6w,5354
5
5
  data_designer/plugin_manager.py,sha256=jWoo80x0oCiOIJMA43t-vK-_hVv9_xt4WhBcurYoDqw,3098
@@ -36,8 +36,8 @@ data_designer/config/column_types.py,sha256=V0Ijwb-asYOX-GQyG9W-X_A-FIbFSajKuus5
36
36
  data_designer/config/config_builder.py,sha256=NlAe6cwN6IAE90A8uPLsOdABmmYyUt6UnGYZwgmf_xE,27288
37
37
  data_designer/config/data_designer_config.py,sha256=cvIXMVQzYn9vC4GINPz972pDBmt-HrV5dvw1568LVmE,1719
38
38
  data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
39
- data_designer/config/datastore.py,sha256=oPC4jeupalPexhe8K2BkMSlPvDaOZWAyoDuaq9m-Uo4,6272
40
- data_designer/config/default_model_settings.py,sha256=TMnxGQNAE7ipTmPF1R0qJBEUX199FWdTnjNiy5oR1Bo,4668
39
+ data_designer/config/datastore.py,sha256=okuwUz-M5bSThvp_a9erKRoG4ej0bey1HUQBA7hgL98,6298
40
+ data_designer/config/default_model_settings.py,sha256=b_oWsD350rb43009kcRxuPNgCZegB_noohURR4n1ZR0,4516
41
41
  data_designer/config/errors.py,sha256=XneHH6tKHG2sZ71HzmPr7k3UBZ_psnSANknT30n-aa8,449
42
42
  data_designer/config/interface.py,sha256=2_tHvxtKAv0C5L7K4ztm-Xa1A-u9Njlwo2drdPa2qmk,1499
43
43
  data_designer/config/models.py,sha256=5Cy55BnKYyr-I1UHLUTqZxe6Ca9uVQWpUiwt9X0ZlrU,7521
@@ -133,7 +133,7 @@ data_designer/engine/resources/managed_dataset_generator.py,sha256=KXrWdgod-NFaC
133
133
  data_designer/engine/resources/managed_dataset_repository.py,sha256=lqVxuoCxc07QTrhnAR1mgDiHFkzjjkx2IwcrxrdbloY,7547
134
134
  data_designer/engine/resources/managed_storage.py,sha256=jRnGeCTGlu6FxC6tOCssPiSpbHEf0mbqFfm3mM0utdA,2079
135
135
  data_designer/engine/resources/resource_provider.py,sha256=CbB2D538ECGkvyHF1V63_TDn-wStCoklV7bF0y4mabY,1859
136
- data_designer/engine/resources/seed_dataset_data_store.py,sha256=uD8g_7dmVvGmOIG5NMnkMok_0zSdEHVQ1kQcfFqWIG4,2226
136
+ data_designer/engine/resources/seed_dataset_data_store.py,sha256=dM2HgfyUgbF7MidN8dn5S-LAR0GVPJfjqXpDPTP2XoA,3035
137
137
  data_designer/engine/sampling_gen/column.py,sha256=gDIPth7vK2797rGtLhf_kVGMAC-khefKHodeeDoqV-I,3946
138
138
  data_designer/engine/sampling_gen/constraints.py,sha256=RyhRF9KeUOwEiHr_TN3QwLWOVLTpuCFpCI_3Qr-9Whs,3028
139
139
  data_designer/engine/sampling_gen/errors.py,sha256=UBZBtosD07EisCdeo8r-Uq4h0QL3tYS1qwtEmca8_jM,828
@@ -163,15 +163,15 @@ data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkB
163
163
  data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
164
164
  data_designer/essentials/__init__.py,sha256=zrDZ7hahOmOhCPdfoj0z9ALN10lXIesfwd2qXRqTcdY,4125
165
165
  data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
166
- data_designer/interface/data_designer.py,sha256=yh_lqEvL0LoqXX-KYDflVjVp8yGFkhSUe_yzZxtV__Q,14904
166
+ data_designer/interface/data_designer.py,sha256=MSzT9OFd3V6saZID0vfQxx0oB6Fth8GmEFcnmFNXOVo,16271
167
167
  data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
168
168
  data_designer/interface/results.py,sha256=qFxa8SuCXeADiRpaCMBwJcExkJBCfUPeGCdcJSTjoTc,2111
169
169
  data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
170
170
  data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
171
171
  data_designer/plugins/plugin.py,sha256=7ErdUyrTdOb5PCBE3msdhTOrvQpldjOQw90-Bu4Bosc,2522
172
172
  data_designer/plugins/registry.py,sha256=iPDTh4duV1cKt7H1fXkj1bKLG6SyUKmzQ9xh-vjEoaM,3018
173
- data_designer-0.1.0.dist-info/METADATA,sha256=pW_EXcja79dhuYz8nL5RuenZqpBSEnS8r85TY6B87dc,5918
174
- data_designer-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
175
- data_designer-0.1.0.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
176
- data_designer-0.1.0.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
177
- data_designer-0.1.0.dist-info/RECORD,,
173
+ data_designer-0.1.1.dist-info/METADATA,sha256=MeR9kVPEkyXH8I-qiYdZpTiZ1yM2FWWi3PiPYsGsX9c,6698
174
+ data_designer-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
175
+ data_designer-0.1.1.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
176
+ data_designer-0.1.1.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
177
+ data_designer-0.1.1.dist-info/RECORD,,