data-designer 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/column_configs.py +29 -4
- data_designer/config/datastore.py +70 -34
- data_designer/config/default_model_settings.py +12 -8
- data_designer/config/sampler_params.py +16 -2
- data_designer/engine/resources/seed_dataset_data_store.py +20 -2
- data_designer/interface/data_designer.py +24 -3
- {data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/METADATA +27 -13
- {data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/RECORD +12 -12
- {data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/WHEEL +0 -0
- {data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/licenses/LICENSE +0 -0
data_designer/_version.py
CHANGED

```diff
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
 
 __commit_id__ = commit_id = None
```
data_designer/config/column_configs.py
CHANGED

```diff
@@ -2,9 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from abc import ABC
-from typing import Literal, Optional, Type, Union
+from typing import Annotated, Literal, Optional, Type, Union
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Discriminator, Field, model_validator
 from typing_extensions import Self
 
 from .base import ConfigBase
@@ -89,11 +89,36 @@ class SamplerColumnConfig(SingleColumnConfig):
     """
 
     sampler_type: SamplerType
-    params: SamplerParamsT
-    conditional_params: dict[str, SamplerParamsT] = {}
+    params: Annotated[SamplerParamsT, Discriminator("sampler_type")]
+    conditional_params: dict[str, Annotated[SamplerParamsT, Discriminator("sampler_type")]] = {}
     convert_to: Optional[str] = None
     column_type: Literal["sampler"] = "sampler"
 
+    @model_validator(mode="before")
+    @classmethod
+    def inject_sampler_type_into_params(cls, data: dict) -> dict:
+        """Inject sampler_type into params dict to enable discriminated union resolution.
+
+        This allows users to pass params as a simple dict without the sampler_type field,
+        which will be automatically added based on the outer sampler_type field.
+        """
+        if isinstance(data, dict):
+            sampler_type = data.get("sampler_type")
+            params = data.get("params")
+
+            # If params is a dict and doesn't have sampler_type, inject it
+            if sampler_type and isinstance(params, dict) and "sampler_type" not in params:
+                data["params"] = {"sampler_type": sampler_type, **params}
+
+            # Handle conditional_params similarly
+            conditional_params = data.get("conditional_params")
+            if conditional_params and isinstance(conditional_params, dict):
+                for condition, cond_params in conditional_params.items():
+                    if isinstance(cond_params, dict) and "sampler_type" not in cond_params:
+                        data["conditional_params"][condition] = {"sampler_type": sampler_type, **cond_params}
+
+        return data
+
 
 class LLMTextColumnConfig(SingleColumnConfig):
     """Configuration for text generation columns using Large Language Models.
```
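The change above swaps a plain union for a pydantic discriminated union keyed on `sampler_type`, with a before-validator that copies the outer `sampler_type` into the params dict so the discriminator can resolve it. A minimal self-contained sketch of the same pattern, assuming pydantic v2; the two params classes below are illustrative stand-ins, not the package's own models:

```python
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Discriminator, model_validator


class UniformParams(BaseModel):
    sampler_type: Literal["uniform"] = "uniform"
    low: float
    high: float


class GaussianParams(BaseModel):
    sampler_type: Literal["gaussian"] = "gaussian"
    mean: float
    stddev: float


SamplerParamsT = Union[UniformParams, GaussianParams]


class SamplerColumn(BaseModel):
    sampler_type: str
    params: Annotated[SamplerParamsT, Discriminator("sampler_type")]

    @model_validator(mode="before")
    @classmethod
    def inject_sampler_type(cls, data: dict) -> dict:
        # Copy the outer sampler_type into params so the discriminator can resolve it.
        if isinstance(data, dict):
            sampler_type = data.get("sampler_type")
            params = data.get("params")
            if sampler_type and isinstance(params, dict) and "sampler_type" not in params:
                data["params"] = {"sampler_type": sampler_type, **params}
        return data


# The params dict needs no redundant sampler_type field:
column = SamplerColumn(sampler_type="gaussian", params={"mean": 0.0, "stddev": 1.0})
assert isinstance(column.params, GaussianParams)
```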
data_designer/config/datastore.py
CHANGED

```diff
@@ -31,34 +31,37 @@ class DatastoreSettings(BaseModel):
     token: Optional[str] = Field(default=None, description="If needed, token to use for authentication.")
 
 
-def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
-    """
-
-
-
-
-        raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
-    logger.debug(f"0️⃣ Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
-    file_path = matching_files[0]
+def get_file_column_names(file_reference: Union[str, Path, HfFileSystem], file_type: str) -> list[str]:
+    """Get column names from a dataset file.
+
+    Args:
+        file_reference: Path to the dataset file, or an HfFileSystem object.
+        file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
 
+    Raises:
+        InvalidFilePathError: If the file type is not supported.
+
+    Returns:
+        List of column names.
+    """
     if file_type == "parquet":
         try:
-            schema = pq.read_schema(file_path)
+            schema = pq.read_schema(file_reference)
             if hasattr(schema, "names"):
                 return schema.names
             else:
                 return [field.name for field in schema]
         except Exception as e:
-            logger.warning(f"Failed to process parquet file {file_path}: {e}")
+            logger.warning(f"Failed to process parquet file {file_reference}: {e}")
             return []
     elif file_type in ["json", "jsonl"]:
-        return pd.read_json(file_path, orient="records", lines=True, nrows=1).columns.tolist()
+        return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
     elif file_type == "csv":
        try:
-            df = pd.read_csv(file_path, nrows=1)
+            df = pd.read_csv(file_reference, nrows=1)
             return df.columns.tolist()
         except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
-            logger.warning(f"Failed to process CSV file {file_path}: {e}")
+            logger.warning(f"Failed to process CSV file {file_reference}: {e}")
             return []
     else:
         raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
@@ -66,12 +69,36 @@ def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
 
 def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
     if hasattr(seed_dataset_reference, "datastore_settings"):
-        return
+        return fetch_seed_dataset_column_names_from_datastore(
             seed_dataset_reference.repo_id,
             seed_dataset_reference.filename,
             seed_dataset_reference.datastore_settings,
         )
-    return
+    return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
+
+
+def fetch_seed_dataset_column_names_from_datastore(
+    repo_id: str,
+    filename: str,
+    datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
+) -> list[str]:
+    file_type = filename.split(".")[-1]
+    if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
+        raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
+
+    datastore_settings = resolve_datastore_settings(datastore_settings)
+    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
+
+    file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
+
+    with fs.open(file_path) as f:
+        return get_file_column_names(f, file_type)
+
+
+def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
+    dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
+    dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
+    return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
 
 
 def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
@@ -114,25 +141,34 @@ def upload_to_hf_hub(
     return f"{repo_id}/{filename}"
 
 
-def
-
-
-
-
-    file_type = filename.split(".")[-1]
-    if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
-        raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
-
-    datastore_settings = resolve_datastore_settings(datastore_settings)
-    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token)
-
-    with fs.open(f"datasets/{repo_id}/{filename}") as f:
-        return get_file_column_names(f, file_type)
-
+def _extract_single_file_path_from_glob_pattern_if_present(
+    file_path: str | Path,
+    fs: HfFileSystem | None = None,
+) -> Path:
+    file_path = Path(file_path)
 
-
-
-
+    # no glob pattern
+    if "*" not in str(file_path):
+        return file_path
+
+    # glob pattern with HfFileSystem
+    if fs is not None:
+        file_to_check = None
+        file_extension = file_path.name.split(".")[-1]
+        for file in fs.ls(str(file_path.parent)):
+            filename = file["name"]
+            if filename.endswith(f".{file_extension}"):
+                file_to_check = filename
+        if file_to_check is None:
+            raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
+        logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
+        return Path(file_to_check)
+
+    # glob pattern with local file system
+    if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
+        raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
+    logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
+    return matching_files[0]
 
 
 def _validate_dataset_path(dataset_path: Union[str, Path], allow_glob_pattern: bool = False) -> Path:
```
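For context, the reworked `get_file_column_names` only ever touches file metadata or the first record, never the full dataset. A rough sketch of the same header-only reads, assuming plain local paths (the real function also accepts open `HfFileSystem` file handles, and the function name below is illustrative):

```python
import pandas as pd
import pyarrow.parquet as pq


def peek_columns(path: str) -> list[str]:
    """Return column names without loading the dataset."""
    if path.endswith(".parquet"):
        return list(pq.read_schema(path).names)  # schema only, no row data
    if path.endswith((".json", ".jsonl")):
        # nrows requires lines=True (JSON Lines); this reads a single record
        return pd.read_json(path, orient="records", lines=True, nrows=1).columns.tolist()
    if path.endswith(".csv"):
        return pd.read_csv(path, nrows=1).columns.tolist()  # header plus one row
    raise ValueError(f"Unsupported file type: {path!r}")
```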
data_designer/config/default_model_settings.py
CHANGED

```diff
@@ -4,6 +4,7 @@
 
 from functools import lru_cache
 import logging
+import os
 from pathlib import Path
 from typing import Any, Literal, Optional
 
@@ -15,7 +16,6 @@ from .utils.constants import (
     PREDEFINED_PROVIDERS,
     PREDEFINED_PROVIDERS_MODEL_MAP,
 )
-from .utils.info import ConfigBuilderInfo, InfoType, InterfaceInfo
 from .utils.io_helpers import load_config_file, save_config_file
 
 logger = logging.getLogger(__name__)
@@ -75,7 +75,15 @@ def get_default_model_configs() -> list[ModelConfig]:
     config_dict = load_config_file(MODEL_CONFIGS_FILE_PATH)
     if "model_configs" in config_dict:
         return [ModelConfig.model_validate(mc) for mc in config_dict["model_configs"]]
-
+    return []
+
+
+def get_default_model_providers_missing_api_keys() -> list[str]:
+    missing_api_keys = []
+    for predefined_provider in PREDEFINED_PROVIDERS:
+        if os.environ.get(predefined_provider["api_key"]) is None:
+            missing_api_keys.append(predefined_provider["api_key"])
+    return missing_api_keys
 
 
 def get_default_providers() -> list[ModelProvider]:
@@ -91,21 +99,17 @@ def get_default_provider_name() -> Optional[str]:
 
 def resolve_seed_default_model_settings() -> None:
     if not MODEL_CONFIGS_FILE_PATH.exists():
-        logger.
+        logger.debug(
             f"🍾 Default model configs were not found, so writing the following to {str(MODEL_CONFIGS_FILE_PATH)!r}"
         )
-        config_builder_info = ConfigBuilderInfo(model_configs=get_builtin_model_configs())
-        config_builder_info.display(info_type=InfoType.MODEL_CONFIGS)
         save_config_file(
             MODEL_CONFIGS_FILE_PATH, {"model_configs": [mc.model_dump() for mc in get_builtin_model_configs()]}
         )
 
     if not MODEL_PROVIDERS_FILE_PATH.exists():
-        logger.
+        logger.debug(
             f"🪄 Default model providers were not found, so writing the following to {str(MODEL_PROVIDERS_FILE_PATH)!r}"
         )
-        interface_info = InterfaceInfo(model_providers=get_builtin_model_providers())
-        interface_info.display(info_type=InfoType.MODEL_PROVIDERS)
         save_config_file(
             MODEL_PROVIDERS_FILE_PATH, {"providers": [p.model_dump() for p in get_builtin_model_providers()]}
         )
```
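The new `get_default_model_providers_missing_api_keys` simply scans the predefined providers for unset environment variables. A sketch under the assumption that each `PREDEFINED_PROVIDERS` entry stores its API-key environment-variable name under `"api_key"`; the sample entries below are hypothetical, though the env-var names appear in the package README:

```python
import os

# Hypothetical sample data mirroring the assumed PREDEFINED_PROVIDERS shape.
PREDEFINED_PROVIDERS = [
    {"name": "nvidia", "api_key": "NVIDIA_API_KEY"},
    {"name": "openai", "api_key": "OPENAI_API_KEY"},
]

# Collect the env-var names that are not set in the current environment.
missing = [p["api_key"] for p in PREDEFINED_PROVIDERS if os.environ.get(p["api_key"]) is None]
print(missing)  # e.g. ["NVIDIA_API_KEY"] when only OPENAI_API_KEY is set
```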
data_designer/config/sampler_params.py
CHANGED

```diff
@@ -66,6 +66,7 @@ class CategorySamplerParams(ConfigBase):
             "Larger values will be sampled with higher probability."
         ),
     )
+    sampler_type: Literal[SamplerType.CATEGORY] = SamplerType.CATEGORY
 
     @model_validator(mode="after")
     def _normalize_weights_if_needed(self) -> Self:
@@ -106,6 +107,7 @@ class DatetimeSamplerParams(ConfigBase):
         default="D",
         description="Sampling units, e.g. the smallest possible time interval between samples.",
     )
+    sampler_type: Literal[SamplerType.DATETIME] = SamplerType.DATETIME
 
     @field_validator("start", "end")
     @classmethod
@@ -136,6 +138,7 @@ class SubcategorySamplerParams(ConfigBase):
         ...,
         description="Mapping from each value of parent category to a list of subcategory values.",
     )
+    sampler_type: Literal[SamplerType.SUBCATEGORY] = SamplerType.SUBCATEGORY
 
 
 class TimeDeltaSamplerParams(ConfigBase):
@@ -187,6 +190,7 @@ class TimeDeltaSamplerParams(ConfigBase):
         default="D",
         description="Sampling units, e.g. the smallest possible time interval between samples.",
     )
+    sampler_type: Literal[SamplerType.TIMEDELTA] = SamplerType.TIMEDELTA
 
     @model_validator(mode="after")
     def _validate_min_less_than_max(self) -> Self:
@@ -219,6 +223,7 @@ class UUIDSamplerParams(ConfigBase):
         default=False,
         description="If true, all letters in the UUID will be capitalized.",
     )
+    sampler_type: Literal[SamplerType.UUID] = SamplerType.UUID
 
     @property
     def last_index(self) -> int:
@@ -257,6 +262,7 @@ class ScipySamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.SCIPY] = SamplerType.SCIPY
 
 
 class BinomialSamplerParams(ConfigBase):
@@ -273,6 +279,7 @@ class BinomialSamplerParams(ConfigBase):
 
     n: int = Field(..., description="Number of trials.")
     p: float = Field(..., description="Probability of success on each trial.", ge=0.0, le=1.0)
+    sampler_type: Literal[SamplerType.BINOMIAL] = SamplerType.BINOMIAL
 
 
 class BernoulliSamplerParams(ConfigBase):
@@ -288,6 +295,7 @@ class BernoulliSamplerParams(ConfigBase):
     """
 
     p: float = Field(..., description="Probability of success.", ge=0.0, le=1.0)
+    sampler_type: Literal[SamplerType.BERNOULLI] = SamplerType.BERNOULLI
 
 
 class BernoulliMixtureSamplerParams(ConfigBase):
@@ -327,6 +335,7 @@ class BernoulliMixtureSamplerParams(ConfigBase):
         ...,
         description="Parameters of the scipy.stats distribution given in `dist_name`.",
     )
+    sampler_type: Literal[SamplerType.BERNOULLI_MIXTURE] = SamplerType.BERNOULLI_MIXTURE
 
 
 class GaussianSamplerParams(ConfigBase):
@@ -350,6 +359,7 @@ class GaussianSamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.GAUSSIAN] = SamplerType.GAUSSIAN
 
 
 class PoissonSamplerParams(ConfigBase):
@@ -369,6 +379,7 @@ class PoissonSamplerParams(ConfigBase):
     """
 
     mean: float = Field(..., description="Mean number of events in a fixed interval.")
+    sampler_type: Literal[SamplerType.POISSON] = SamplerType.POISSON
 
 
 class UniformSamplerParams(ConfigBase):
@@ -390,6 +401,7 @@ class UniformSamplerParams(ConfigBase):
     decimal_places: Optional[int] = Field(
         default=None, description="Number of decimal places to round the sampled values to."
     )
+    sampler_type: Literal[SamplerType.UNIFORM] = SamplerType.UNIFORM
 
 
 #########################################
@@ -470,11 +482,12 @@ class PersonSamplerParams(ConfigBase):
         default=False,
         description="If True, then append synthetic persona columns to each generated person.",
     )
+    sampler_type: Literal[SamplerType.PERSON] = SamplerType.PERSON
 
     @property
     def generator_kwargs(self) -> list[str]:
         """Keyword arguments to pass to the person generator."""
-        return [f for f in list(PersonSamplerParams.model_fields) if f
+        return [f for f in list(PersonSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
 
     @property
     def people_gen_key(self) -> str:
@@ -533,11 +546,12 @@ class PersonFromFakerSamplerParams(ConfigBase):
         min_length=2,
         max_length=2,
     )
+    sampler_type: Literal[SamplerType.PERSON_FROM_FAKER] = SamplerType.PERSON_FROM_FAKER
 
     @property
     def generator_kwargs(self) -> list[str]:
         """Keyword arguments to pass to the person generator."""
-        return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f
+        return [f for f in list(PersonFromFakerSamplerParams.model_fields) if f not in ("locale", "sampler_type")]
 
     @property
     def people_gen_key(self) -> str:
```
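Each params class now carries a `Literal`-typed `sampler_type` field with a default, which acts as the discriminator tag for the union used in `SamplerColumnConfig`. Because the value is fixed per class, callers never have to set it. A small sketch with an illustrative stand-in class (the real classes tag with `SamplerType` enum members rather than plain strings):

```python
from typing import Literal

from pydantic import BaseModel


class GaussianSamplerParams(BaseModel):  # illustrative stand-in
    mean: float = 0.0
    stddev: float = 1.0
    # Fixed per class: serves as the discriminator tag and needs no caller input.
    sampler_type: Literal["gaussian"] = "gaussian"


p = GaussianSamplerParams(mean=2.5)
print(p.sampler_type)  # "gaussian" — set automatically, never passed by the caller
```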
data_designer/engine/resources/seed_dataset_data_store.py
CHANGED

```diff
@@ -42,11 +42,29 @@ class HfHubSeedDatasetDataStore(SeedDatasetDataStore):
 
     def __init__(self, endpoint: str, token: str | None):
         self.hfapi = HfApi(endpoint=endpoint, token=token)
-        self.
+        self.endpoint = endpoint
+        self.token = token
 
     def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
+        """Create a DuckDB connection with a fresh HfFileSystem registered.
+
+        Creates a new HfFileSystem instance for each connection to ensure file metadata
+        is fetched fresh from the datastore, avoiding cache-related issues when reading
+        recently updated parquet files.
+
+        Returns:
+            A DuckDB connection with the HfFileSystem registered for hf:// URI support.
+        """
+        # Use skip_instance_cache to avoid fsspec-level caching
+        hffs = HfFileSystem(endpoint=self.endpoint, token=self.token, skip_instance_cache=True)
+
+        # Clear all internal caches to avoid stale metadata issues
+        # HfFileSystem caches file metadata (size, etc.) which can become stale when files are re-uploaded
+        if hasattr(hffs, "dircache"):
+            hffs.dircache.clear()
+
         conn = duckdb.connect()
-        conn.register_filesystem(
+        conn.register_filesystem(hffs)
         return conn
 
     def get_dataset_uri(self, file_id: str) -> str:
```
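The pattern above, in isolation: build a fresh `HfFileSystem` per connection with fsspec instance caching disabled, drop its directory cache, and register it so DuckDB can read `hf://` URIs. A condensed sketch; the real method also guards the `dircache` access with `hasattr`, and the function name here is illustrative:

```python
import duckdb
from huggingface_hub import HfFileSystem


def fresh_connection(endpoint: str, token: str | None) -> duckdb.DuckDBPyConnection:
    # New filesystem instance per connection; skip_instance_cache bypasses
    # fsspec's instance cache so file metadata is fetched fresh.
    fs = HfFileSystem(endpoint=endpoint, token=token, skip_instance_cache=True)
    fs.dircache.clear()  # drop any cached directory/file metadata
    conn = duckdb.connect()
    conn.register_filesystem(fs)  # enables SQL reads from hf:// URIs
    return conn
```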
data_designer/interface/data_designer.py
CHANGED

```diff
@@ -10,6 +10,7 @@ from data_designer.config.analysis.dataset_profiler import DatasetProfilerResult
 from data_designer.config.config_builder import DataDesignerConfigBuilder
 from data_designer.config.default_model_settings import (
     get_default_model_configs,
+    get_default_model_providers_missing_api_keys,
     get_default_provider_name,
     get_default_providers,
     resolve_seed_default_model_settings,
@@ -26,8 +27,9 @@ from data_designer.config.utils.constants import (
     MANAGED_ASSETS_PATH,
     MODEL_CONFIGS_FILE_PATH,
     MODEL_PROVIDERS_FILE_PATH,
+    PREDEFINED_PROVIDERS,
 )
-from data_designer.config.utils.info import InterfaceInfo
+from data_designer.config.utils.info import InfoType, InterfaceInfo
 from data_designer.config.utils.io_helpers import write_seed_dataset
 from data_designer.config.utils.misc import can_run_data_designer_locally
 from data_designer.engine.analysis.dataset_profiler import (
@@ -103,7 +105,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
         self._artifact_path = Path(artifact_path) if artifact_path is not None else Path.cwd() / "artifacts"
         self._buffer_size = DEFAULT_BUFFER_SIZE
         self._managed_assets_path = Path(managed_assets_path or MANAGED_ASSETS_PATH)
-        self._model_providers =
+        self._model_providers = self._resolve_model_providers(model_providers)
         self._model_provider_registry = resolve_model_provider_registry(
             self._model_providers, get_default_provider_name()
         )
@@ -151,7 +153,7 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
         Returns:
             InterfaceInfo object with information about the Data Designer interface.
         """
-        return
+        return self._get_interface_info(self._model_providers)
 
     def create(
         self,
@@ -307,6 +309,22 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
             raise InvalidBufferValueError("Buffer size must be greater than 0.")
         self._buffer_size = buffer_size
 
+    def _resolve_model_providers(self, model_providers: list[ModelProvider] | None) -> list[ModelProvider]:
+        if model_providers is None:
+            if can_run_data_designer_locally():
+                model_providers = get_default_providers()
+                missing_api_keys = get_default_model_providers_missing_api_keys()
+                if len(missing_api_keys) == len(PREDEFINED_PROVIDERS):
+                    logger.warning(
+                        "🚨 You are trying to use a default model provider but your API keys are missing."
+                        "\n\t\t\tSet the API key for the default providers you intend to use and re-initialize the Data Designer object."
+                        "\n\t\t\tAlternatively, you can provide your own model providers during Data Designer object initialization."
+                        "\n\t\t\tSee https://nvidia-nemo.github.io/DataDesigner/models/model-providers/ for more information."
+                    )
+                self._get_interface_info(model_providers).display(InfoType.MODEL_PROVIDERS)
+                return model_providers
+        return model_providers or []
+
     def _create_dataset_builder(
         self, config_builder: DataDesignerConfigBuilder, resource_provider: ResourceProvider
     ) -> ColumnWiseDatasetBuilder:
@@ -349,3 +367,6 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
             )
         ),
     )
+
+    def _get_interface_info(self, model_providers: list[ModelProvider]) -> InterfaceInfo:
+        return InterfaceInfo(model_providers=model_providers)
```
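From the caller's side, the effect of `_resolve_model_providers` is that constructing `DataDesigner` without providers now falls back to the defaults and warns when none of the predefined API keys are set. A usage sketch, assuming only the import path and parameter name visible in this diff; other constructor arguments may apply:

```python
from data_designer.interface.data_designer import DataDesigner

# model_providers=None triggers the new fallback: if local execution is
# possible, the default providers are used, and a warning is logged when
# every predefined provider's API key is missing from the environment.
dd = DataDesigner(model_providers=None)
```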
{data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/METADATA
CHANGED

````diff
@@ -1,18 +1,18 @@
 Metadata-Version: 2.4
 Name: data-designer
-Version: 0.1.0
+Version: 0.1.2
 Summary: General framework for synthetic data generation
+License-Expression: Apache-2.0
 License-File: LICENSE
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
-Classifier: License ::
+Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces
 Classifier: Topic :: Software Development
 Requires-Python: >=3.10
 Requires-Dist: anyascii<1.0,>=0.3.3
@@ -51,7 +51,7 @@ Description-Content-Type: text/markdown
 
 [](https://github.com/NVIDIA-NeMo/DataDesigner/actions/workflows/ci.yml)
 [](https://opensource.org/licenses/Apache-2.0)
-[](https://www.python.org/downloads/) [](https://docs.nvidia.com/nemo/microservices/latest/index.html) [](https://nvidia-nemo.github.io/DataDesigner/)
 
 **Generate high-quality synthetic datasets from scratch or using your own seed data.**
 
@@ -97,8 +97,7 @@ export NVIDIA_API_KEY="your-api-key-here"
 export OPENAI_API_KEY="your-openai-api-key-here"
 ```
 
-### 3.
-
+### 3. Start generating data!
 ```python
 from data_designer.essentials import (
     CategorySamplerParams,
@@ -139,18 +138,18 @@ preview = data_designer.preview(config_builder=config_builder)
 preview.display_sample_record()
 ```
 
-**That's it!** You've created a dataset.
-
 ---
 
 ## What's next?
 
 ### 📚 Learn more
 
-- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner)** – Detailed walkthrough with more examples
-- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/
+- **[Quick Start Guide](https://nvidia-nemo.github.io/DataDesigner/quick-start/)** – Detailed walkthrough with more examples
+- **[Tutorial Notebooks](https://nvidia-nemo.github.io/DataDesigner/notebooks/)** – Step-by-step interactive tutorials
 - **[Column Types](https://nvidia-nemo.github.io/DataDesigner/concepts/columns/)** – Explore samplers, LLM columns, validators, and more
+- **[Validators](https://nvidia-nemo.github.io/DataDesigner/concepts/validators/)** – Learn how to validate generated data with Python, SQL, and remote validators
 - **[Model Configuration](https://nvidia-nemo.github.io/DataDesigner/models/model-configs/)** – Configure custom models and providers
+- **[Person Sampling](https://nvidia-nemo.github.io/DataDesigner/concepts/person_sampling/)** – Learn how to sample realistic person data with demographic attributes
 
 ### 🔧 Configure models via CLI
 
@@ -162,12 +161,27 @@ data-designer config list # View current settings
 
 ### 🤝 Get involved
 
-- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING
-- **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or request
-- **[GitHub Discussions](https://github.com/NVIDIA-NeMo/DataDesigner/discussions)** – Ask questions and share ideas
+- **[Contributing Guide](https://nvidia-nemo.github.io/DataDesigner/CONTRIBUTING)** – Help improve Data Designer
+- **[GitHub Issues](https://github.com/NVIDIA-NeMo/DataDesigner/issues)** – Report bugs or make a feature request
 
 ---
 
 ## License
 
 Apache License 2.0 – see [LICENSE](LICENSE) for details.
+
+---
+
+## Citation
+
+If you use NeMo Data Designer in your research, please cite it using the following BibTeX entry:
+
+```bibtex
+@misc{nemo-data-designer,
+  author = {The NeMo Data Designer Team},
+  title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data},
+  howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}},
+  year = {2025},
+  note = {GitHub Repository},
+}
+```
````
{data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/RECORD
CHANGED

```diff
@@ -1,5 +1,5 @@
 data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
-data_designer/_version.py,sha256=
+data_designer/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
 data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
 data_designer/logging.py,sha256=O6LlQRj4IdkvEEYiMkKfMb_ZDgN1YpkGQUCqcp7nY6w,5354
 data_designer/plugin_manager.py,sha256=jWoo80x0oCiOIJMA43t-vK-_hVv9_xt4WhBcurYoDqw,3098
@@ -31,20 +31,20 @@ data_designer/cli/services/model_service.py,sha256=Fn3c0qMZqFAEqzBr0haLjp-nLKAkk
 data_designer/cli/services/provider_service.py,sha256=pdD2_C4yK0YBabcuan95H86UreZJ5zWFGI3Ue99mXXo,3916
 data_designer/config/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
 data_designer/config/base.py,sha256=xCbvwxXKRityWqeGP4zTXVuPHAOoUdpuQr8_t8vY8f8,2423
-data_designer/config/column_configs.py,sha256=
+data_designer/config/column_configs.py,sha256=ixpanQApbn4LUyW7E4IJefXQG6c0eYbGxF-GGwV1xCg,18000
 data_designer/config/column_types.py,sha256=V0Ijwb-asYOX-GQyG9W-X_A-FIbFSajKuus58sG8CSM,6774
 data_designer/config/config_builder.py,sha256=NlAe6cwN6IAE90A8uPLsOdABmmYyUt6UnGYZwgmf_xE,27288
 data_designer/config/data_designer_config.py,sha256=cvIXMVQzYn9vC4GINPz972pDBmt-HrV5dvw1568LVmE,1719
 data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
-data_designer/config/datastore.py,sha256=
-data_designer/config/default_model_settings.py,sha256=
+data_designer/config/datastore.py,sha256=Ra6MsPCK6Q1Y8JbTQGRrKtyceig1s41ishyKSZoxgno,7572
+data_designer/config/default_model_settings.py,sha256=aMud_RrRStHnDSbwLxU3BnmIu08YtB1-EG6UUY9NedI,4517
 data_designer/config/errors.py,sha256=XneHH6tKHG2sZ71HzmPr7k3UBZ_psnSANknT30n-aa8,449
 data_designer/config/interface.py,sha256=2_tHvxtKAv0C5L7K4ztm-Xa1A-u9Njlwo2drdPa2qmk,1499
 data_designer/config/models.py,sha256=5Cy55BnKYyr-I1UHLUTqZxe6Ca9uVQWpUiwt9X0ZlrU,7521
 data_designer/config/preview_results.py,sha256=H6ETFI6L1TW8MEC9KYsJ1tXGIC5cloCggBCCZd6jiEE,1087
 data_designer/config/processors.py,sha256=qOF_plBoh6UEFNwUpyDgkqIuSDUaSM2S7k-kSAEB5p8,1328
 data_designer/config/sampler_constraints.py,sha256=4JxP-nge5KstqtctJnVg5RLM1w9mA7qFi_BjgTJl9CE,1167
-data_designer/config/sampler_params.py,sha256=
+data_designer/config/sampler_params.py,sha256=NCm2uWEzFHjz8ZzSmiKcVp5jI5okp53tq9l-bWBm4FQ,26821
 data_designer/config/seed.py,sha256=g-iUToYSIFuTv3sbwSG_dF-9RwC8r8AvCD-vS8c_jDg,5487
 data_designer/config/validator_params.py,sha256=sNxFIF2bk_N4jJD-aMH1N5MQynDip08AoMI1ajxtRdc,3909
 data_designer/config/analysis/column_profilers.py,sha256=Qss9gr7oHNcjijW_MMIX9JkFX-V9v5vPwYWCnxLjMDY,2749
@@ -133,7 +133,7 @@ data_designer/engine/resources/managed_dataset_generator.py,sha256=KXrWdgod-NFaC
 data_designer/engine/resources/managed_dataset_repository.py,sha256=lqVxuoCxc07QTrhnAR1mgDiHFkzjjkx2IwcrxrdbloY,7547
 data_designer/engine/resources/managed_storage.py,sha256=jRnGeCTGlu6FxC6tOCssPiSpbHEf0mbqFfm3mM0utdA,2079
 data_designer/engine/resources/resource_provider.py,sha256=CbB2D538ECGkvyHF1V63_TDn-wStCoklV7bF0y4mabY,1859
-data_designer/engine/resources/seed_dataset_data_store.py,sha256=
+data_designer/engine/resources/seed_dataset_data_store.py,sha256=dM2HgfyUgbF7MidN8dn5S-LAR0GVPJfjqXpDPTP2XoA,3035
 data_designer/engine/sampling_gen/column.py,sha256=gDIPth7vK2797rGtLhf_kVGMAC-khefKHodeeDoqV-I,3946
 data_designer/engine/sampling_gen/constraints.py,sha256=RyhRF9KeUOwEiHr_TN3QwLWOVLTpuCFpCI_3Qr-9Whs,3028
 data_designer/engine/sampling_gen/errors.py,sha256=UBZBtosD07EisCdeo8r-Uq4h0QL3tYS1qwtEmca8_jM,828
@@ -163,15 +163,15 @@ data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkB
 data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
 data_designer/essentials/__init__.py,sha256=zrDZ7hahOmOhCPdfoj0z9ALN10lXIesfwd2qXRqTcdY,4125
 data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
-data_designer/interface/data_designer.py,sha256=
+data_designer/interface/data_designer.py,sha256=EzOT_kkWXm9-1Zgbj4RvBfV6_r5ABR7mOuNwbgvKKLQ,16273
 data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
 data_designer/interface/results.py,sha256=qFxa8SuCXeADiRpaCMBwJcExkJBCfUPeGCdcJSTjoTc,2111
 data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
 data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
 data_designer/plugins/plugin.py,sha256=7ErdUyrTdOb5PCBE3msdhTOrvQpldjOQw90-Bu4Bosc,2522
 data_designer/plugins/registry.py,sha256=iPDTh4duV1cKt7H1fXkj1bKLG6SyUKmzQ9xh-vjEoaM,3018
-data_designer-0.1.0.dist-info/METADATA,sha256=
-data_designer-0.1.0.dist-info/WHEEL,sha256=
-data_designer-0.1.0.dist-info/entry_points.txt,sha256=
-data_designer-0.1.0.dist-info/licenses/LICENSE,sha256=
-data_designer-0.1.0.dist-info/RECORD,,
+data_designer-0.1.2.dist-info/METADATA,sha256=PjPyL9UQ0Ys4XPqRuruAjuUJ6XPMDf1n1bz17wwoct4,6644
+data_designer-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+data_designer-0.1.2.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
+data_designer-0.1.2.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
+data_designer-0.1.2.dist-info/RECORD,,
```
{data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/WHEEL
File without changes

{data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/entry_points.txt
File without changes

{data_designer-0.1.0.dist-info → data_designer-0.1.2.dist-info}/licenses/LICENSE
File without changes