PyPI - data-designer - Versions diffs - 0.1.0__py3-none-any.whl - Mend

data-designer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (177) hide show

data_designer/__init__.py +15 -0
data_designer/_version.py +34 -0
data_designer/cli/README.md +236 -0
data_designer/cli/__init__.py +6 -0
data_designer/cli/commands/__init__.py +2 -0
data_designer/cli/commands/list.py +130 -0
data_designer/cli/commands/models.py +10 -0
data_designer/cli/commands/providers.py +11 -0
data_designer/cli/commands/reset.py +100 -0
data_designer/cli/controllers/__init__.py +7 -0
data_designer/cli/controllers/model_controller.py +246 -0
data_designer/cli/controllers/provider_controller.py +317 -0
data_designer/cli/forms/__init__.py +20 -0
data_designer/cli/forms/builder.py +51 -0
data_designer/cli/forms/field.py +180 -0
data_designer/cli/forms/form.py +59 -0
data_designer/cli/forms/model_builder.py +125 -0
data_designer/cli/forms/provider_builder.py +76 -0
data_designer/cli/main.py +44 -0
data_designer/cli/repositories/__init__.py +8 -0
data_designer/cli/repositories/base.py +39 -0
data_designer/cli/repositories/model_repository.py +42 -0
data_designer/cli/repositories/provider_repository.py +43 -0
data_designer/cli/services/__init__.py +7 -0
data_designer/cli/services/model_service.py +116 -0
data_designer/cli/services/provider_service.py +111 -0
data_designer/cli/ui.py +448 -0
data_designer/cli/utils.py +47 -0
data_designer/config/__init__.py +2 -0
data_designer/config/analysis/column_profilers.py +89 -0
data_designer/config/analysis/column_statistics.py +274 -0
data_designer/config/analysis/dataset_profiler.py +60 -0
data_designer/config/analysis/utils/errors.py +8 -0
data_designer/config/analysis/utils/reporting.py +188 -0
data_designer/config/base.py +68 -0
data_designer/config/column_configs.py +354 -0
data_designer/config/column_types.py +168 -0
data_designer/config/config_builder.py +660 -0
data_designer/config/data_designer_config.py +40 -0
data_designer/config/dataset_builders.py +11 -0
data_designer/config/datastore.py +151 -0
data_designer/config/default_model_settings.py +123 -0
data_designer/config/errors.py +19 -0
data_designer/config/interface.py +54 -0
data_designer/config/models.py +231 -0
data_designer/config/preview_results.py +32 -0
data_designer/config/processors.py +41 -0
data_designer/config/sampler_constraints.py +51 -0
data_designer/config/sampler_params.py +604 -0
data_designer/config/seed.py +145 -0
data_designer/config/utils/code_lang.py +83 -0
data_designer/config/utils/constants.py +313 -0
data_designer/config/utils/errors.py +19 -0
data_designer/config/utils/info.py +88 -0
data_designer/config/utils/io_helpers.py +273 -0
data_designer/config/utils/misc.py +81 -0
data_designer/config/utils/numerical_helpers.py +28 -0
data_designer/config/utils/type_helpers.py +100 -0
data_designer/config/utils/validation.py +336 -0
data_designer/config/utils/visualization.py +427 -0
data_designer/config/validator_params.py +96 -0
data_designer/engine/__init__.py +2 -0
data_designer/engine/analysis/column_profilers/base.py +55 -0
data_designer/engine/analysis/column_profilers/judge_score_profiler.py +160 -0
data_designer/engine/analysis/column_profilers/registry.py +20 -0
data_designer/engine/analysis/column_statistics.py +142 -0
data_designer/engine/analysis/dataset_profiler.py +125 -0
data_designer/engine/analysis/errors.py +7 -0
data_designer/engine/analysis/utils/column_statistics_calculations.py +209 -0
data_designer/engine/analysis/utils/judge_score_processing.py +128 -0
data_designer/engine/column_generators/__init__.py +2 -0
data_designer/engine/column_generators/generators/__init__.py +2 -0
data_designer/engine/column_generators/generators/base.py +61 -0
data_designer/engine/column_generators/generators/expression.py +63 -0
data_designer/engine/column_generators/generators/llm_generators.py +172 -0
data_designer/engine/column_generators/generators/samplers.py +75 -0
data_designer/engine/column_generators/generators/seed_dataset.py +149 -0
data_designer/engine/column_generators/generators/validation.py +147 -0
data_designer/engine/column_generators/registry.py +56 -0
data_designer/engine/column_generators/utils/errors.py +13 -0
data_designer/engine/column_generators/utils/judge_score_factory.py +57 -0
data_designer/engine/column_generators/utils/prompt_renderer.py +98 -0
data_designer/engine/configurable_task.py +82 -0
data_designer/engine/dataset_builders/artifact_storage.py +181 -0
data_designer/engine/dataset_builders/column_wise_builder.py +287 -0
data_designer/engine/dataset_builders/errors.py +13 -0
data_designer/engine/dataset_builders/multi_column_configs.py +44 -0
data_designer/engine/dataset_builders/utils/__init__.py +2 -0
data_designer/engine/dataset_builders/utils/concurrency.py +184 -0
data_designer/engine/dataset_builders/utils/config_compiler.py +60 -0
data_designer/engine/dataset_builders/utils/dag.py +56 -0
data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +190 -0
data_designer/engine/dataset_builders/utils/errors.py +13 -0
data_designer/engine/errors.py +49 -0
data_designer/engine/model_provider.py +75 -0
data_designer/engine/models/__init__.py +2 -0
data_designer/engine/models/errors.py +308 -0
data_designer/engine/models/facade.py +225 -0
data_designer/engine/models/litellm_overrides.py +162 -0
data_designer/engine/models/parsers/__init__.py +2 -0
data_designer/engine/models/parsers/errors.py +34 -0
data_designer/engine/models/parsers/parser.py +236 -0
data_designer/engine/models/parsers/postprocessors.py +93 -0
data_designer/engine/models/parsers/tag_parsers.py +60 -0
data_designer/engine/models/parsers/types.py +82 -0
data_designer/engine/models/recipes/base.py +79 -0
data_designer/engine/models/recipes/response_recipes.py +291 -0
data_designer/engine/models/registry.py +118 -0
data_designer/engine/models/usage.py +75 -0
data_designer/engine/models/utils.py +38 -0
data_designer/engine/processing/ginja/__init__.py +2 -0
data_designer/engine/processing/ginja/ast.py +64 -0
data_designer/engine/processing/ginja/environment.py +461 -0
data_designer/engine/processing/ginja/exceptions.py +54 -0
data_designer/engine/processing/ginja/record.py +30 -0
data_designer/engine/processing/gsonschema/__init__.py +2 -0
data_designer/engine/processing/gsonschema/exceptions.py +8 -0
data_designer/engine/processing/gsonschema/schema_transformers.py +81 -0
data_designer/engine/processing/gsonschema/types.py +8 -0
data_designer/engine/processing/gsonschema/validators.py +143 -0
data_designer/engine/processing/processors/base.py +15 -0
data_designer/engine/processing/processors/drop_columns.py +46 -0
data_designer/engine/processing/processors/registry.py +20 -0
data_designer/engine/processing/utils.py +120 -0
data_designer/engine/registry/base.py +97 -0
data_designer/engine/registry/data_designer_registry.py +37 -0
data_designer/engine/registry/errors.py +10 -0
data_designer/engine/resources/managed_dataset_generator.py +35 -0
data_designer/engine/resources/managed_dataset_repository.py +194 -0
data_designer/engine/resources/managed_storage.py +63 -0
data_designer/engine/resources/resource_provider.py +46 -0
data_designer/engine/resources/seed_dataset_data_store.py +66 -0
data_designer/engine/sampling_gen/column.py +89 -0
data_designer/engine/sampling_gen/constraints.py +95 -0
data_designer/engine/sampling_gen/data_sources/base.py +214 -0
data_designer/engine/sampling_gen/data_sources/errors.py +10 -0
data_designer/engine/sampling_gen/data_sources/sources.py +342 -0
data_designer/engine/sampling_gen/entities/__init__.py +2 -0
data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +64 -0
data_designer/engine/sampling_gen/entities/email_address_utils.py +169 -0
data_designer/engine/sampling_gen/entities/errors.py +8 -0
data_designer/engine/sampling_gen/entities/national_id_utils.py +100 -0
data_designer/engine/sampling_gen/entities/person.py +142 -0
data_designer/engine/sampling_gen/entities/phone_number.py +122 -0
data_designer/engine/sampling_gen/errors.py +24 -0
data_designer/engine/sampling_gen/generator.py +121 -0
data_designer/engine/sampling_gen/jinja_utils.py +60 -0
data_designer/engine/sampling_gen/people_gen.py +203 -0
data_designer/engine/sampling_gen/person_constants.py +54 -0
data_designer/engine/sampling_gen/schema.py +143 -0
data_designer/engine/sampling_gen/schema_builder.py +59 -0
data_designer/engine/sampling_gen/utils.py +40 -0
data_designer/engine/secret_resolver.py +80 -0
data_designer/engine/validators/__init__.py +17 -0
data_designer/engine/validators/base.py +36 -0
data_designer/engine/validators/local_callable.py +34 -0
data_designer/engine/validators/python.py +245 -0
data_designer/engine/validators/remote.py +83 -0
data_designer/engine/validators/sql.py +60 -0
data_designer/errors.py +5 -0
data_designer/essentials/__init__.py +137 -0
data_designer/interface/__init__.py +2 -0
data_designer/interface/data_designer.py +351 -0
data_designer/interface/errors.py +16 -0
data_designer/interface/results.py +55 -0
data_designer/logging.py +161 -0
data_designer/plugin_manager.py +83 -0
data_designer/plugins/__init__.py +6 -0
data_designer/plugins/errors.py +10 -0
data_designer/plugins/plugin.py +69 -0
data_designer/plugins/registry.py +86 -0
data_designer-0.1.0.dist-info/METADATA +173 -0
data_designer-0.1.0.dist-info/RECORD +177 -0
data_designer-0.1.0.dist-info/WHEEL +4 -0
data_designer-0.1.0.dist-info/entry_points.txt +2 -0
data_designer-0.1.0.dist-info/licenses/LICENSE +201 -0

data_designer/engine/sampling_gen/entities/email_address_utils.py ADDED Viewed

@@ -0,0 +1,169 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from datetime import date
+import random
+import re
+import anyascii
+def get_email_address(
+    first_name: str,
+    middle_name: str | None,
+    last_name: str,
+    age: int,
+    birth_date: date,
+) -> str:
+    """
+    Assemble a synthetic email address of the form "<basename><suffix>@<domain>".
+    The domain is sampled from the person's age, the basename from the name
+    parts, and the suffix from the birth date.
+    """
+    # Tuple elements are evaluated left to right: domain, basename, suffix.
+    email_domain, local_name, local_suffix = (
+        get_email_domain_by_age(age),
+        get_email_basename_by_name(first_name, middle_name, last_name),
+        get_email_suffix_by_birth_date(birth_date),
+    )
+    return f"{local_name}{local_suffix}@{email_domain}"
+def get_email_domain_by_age(age: int) -> str:
+    """
+    Sample a free email domain, weighted by provider size within the
+    person's age bracket (under 30, 30 to 49, 50 and over).
+    """
+    # Common free email domains
+    # Source: https://www.sellcell.com/blog/most-popular-email-provider-by-number-of-users/
+    # Split heuristically into age demographics
+    # Also adjusted to maintain the approximate 38/27/35 split between these groups
+    weights_under_30 = {
+        "gmail.com": 710,  # gmail.com total: 1500
+        "icloud.com": 300,  # icloud.com total: 850
+        "outlook.com": 50,  # outlook.com total: 200
+        "hotmail.com": 40,  # hotmail.com total: 200
+        "yahoo.com": 35,  # yahoo.com total: 230
+        "protonmail.com": 20,  # protonmail.com total: 50
+        "zoho.com": 3,  # zoho.com total: 15
+        "gmx.com": 3,  # gmx.com total: 11
+        "aol.com": 0.1,  # aol.com total: 1.5
+    }
+    weights_30_to_49 = {
+        "gmail.com": 360,
+        "icloud.com": 270,
+        "outlook.com": 60,
+        "hotmail.com": 50,
+        "yahoo.com": 60,
+        "protonmail.com": 18,
+        "zoho.com": 7,
+        "gmx.com": 4,
+        "aol.com": 0.3,
+    }
+    weights_50_and_over = {
+        "gmail.com": 430,
+        "icloud.com": 280,
+        "outlook.com": 90,
+        "hotmail.com": 110,
+        "yahoo.com": 135,
+        "protonmail.com": 12,
+        "zoho.com": 5,
+        "gmx.com": 4,
+        "aol.com": 1.1,
+    }
+    # Pick the bracket first so the sampling call is written only once.
+    if age < 30:
+        bracket = weights_under_30
+    elif age < 50:
+        bracket = weights_30_to_49
+    else:
+        bracket = weights_50_and_over
+    (picked,) = random.choices(list(bracket), weights=list(bracket.values()), k=1)
+    return picked
+def _normalize_name_part(name: str) -> str:
+    """Transliterate to ASCII, lowercase, and drop every character outside a-z and 0-9."""
+    return re.sub(r"[^a-z0-9]", "", anyascii.anyascii(name).lower())
+def get_email_basename_by_name(first_name: str, middle_name: str | None, last_name: str) -> str:
+    """
+    Get an email address basename heuristically dependent on first and last name.
+    Patterns include:
+        - firstname.lastname
+        - firstnamelastname
+        - firstinitiallastname
+        - firstname_lastname
+        - lastnamefirstinitial
+        - firstnamelastinitial
+        - firstnamemiddlename
+        - firstnamemiddleinitiallastname
+        - firstnamemiddlenamelastname
+    The three middle-name patterns are only offered when the middle name is
+    present and still non-empty after normalization.
+    Args:
+        first_name: Given name; must contain at least one alphanumeric character.
+        middle_name: Optional middle name.
+        last_name: Family name; must contain at least one alphanumeric character.
+    Returns:
+        One username pattern, sampled according to the pattern weights.
+    """
+    # Normalize names (lowercase, remove spaces and special chars)
+    first = _normalize_name_part(first_name)
+    last = _normalize_name_part(last_name)
+    assert len(first) > 0 and len(last) > 0, (
+        "Both first and last name must be non-empty, after removing non-alphanumeric."
+    )
+    first_initial = first[0]
+    last_initial = last[0]
+    # Generate username patterns
+    username_patterns = [
+        f"{first}.{last}",
+        f"{first}{last}",
+        f"{first_initial}{last}",
+        f"{first}_{last}",
+        f"{last}{first_initial}",
+        f"{first}{last_initial}",
+    ]
+    # Higher probability for more common patterns
+    pattern_weights = [0.3, 0.2, 0.15, 0.1, 0.15, 0.1]
+    # A middle name such as "-" normalizes to "", which has no initial to index;
+    # treat it like a missing middle name instead of raising IndexError.
+    middle = _normalize_name_part(middle_name) if middle_name else ""
+    if middle:
+        middle_initial = middle[0]
+        username_patterns.extend(
+            [
+                f"{first}{middle}",
+                f"{first}{middle_initial}{last}",
+                f"{first}{middle}{last}",
+            ]
+        )
+        pattern_weights = [0.25, 0.17, 0.12, 0.08, 0.12, 0.08, 0.06, 0.06, 0.06]
+    return random.choices(username_patterns, weights=pattern_weights, k=1)[0]
+def get_email_suffix_by_birth_date(birth_date: date) -> str:
+    """
+    Sample the trailing part of an email username from the birth date.
+    Suffixes include:
+        - Empty
+        - Random 1-2 digit number
+        - Last 2 digits of birth year
+        - Full birth year
+        - Birth day
+    """
+    two_digit_year = birth_date.year % 100
+    # Drawn unconditionally, before the weighted pick, even when another suffix wins.
+    random_number = random.randint(1, 99)
+    # Candidate suffixes paired positionally with their weights (the first is empty).
+    candidates = (
+        "",
+        str(random_number),
+        f"{two_digit_year:02d}",
+        str(birth_date.year),
+        str(birth_date.day),
+    )
+    candidate_weights = (0.4, 0.3, 0.1, 0.1, 0.1)
+    (chosen,) = random.choices(candidates, weights=candidate_weights, k=1)
+    return chosen

data_designer/engine/sampling_gen/entities/errors.py ADDED Viewed

@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from data_designer.errors import DataDesignerError
+class MissingPersonFieldsError(DataDesignerError):
+    """Raised when a person record lacks one or more required fields."""

data_designer/engine/sampling_gen/entities/national_id_utils.py ADDED Viewed

@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from datetime import date
+import random
+# Cutoff compared against the birth date in generate_ssn: earlier birth dates get a
+# state-derived area number, later ones a random area number.
+SSN_RANDOMIZATION_DATE = date(2011, 6, 25)
+# Area number mapping by state code (pre-2011)
+# Each value is an inclusive [low, high] range of area numbers.
+# NOTE(review): some ranges overlap (NM/AZ, VI/PR, PR/GU/AS) - confirm this is intended.
+STATE_TO_AREA_SSN = {
+    "NH": [1, 3],
+    "ME": [4, 7],
+    "VT": [8, 9],
+    "MA": [10, 34],
+    "RI": [35, 39],
+    "CT": [40, 49],
+    "NY": [50, 134],
+    "NJ": [135, 158],
+    "PA": [159, 211],
+    "MD": [212, 220],
+    "DE": [221, 222],
+    "VA": [223, 231],
+    "WV": [232, 236],
+    "NC": [237, 246],
+    "SC": [247, 251],
+    "GA": [252, 260],
+    "FL": [261, 267],
+    "OH": [268, 302],
+    "IN": [303, 317],
+    "IL": [318, 361],
+    "MI": [362, 386],
+    "WI": [387, 399],
+    "KY": [400, 407],
+    "TN": [408, 415],
+    "AL": [416, 424],
+    "MS": [425, 428],
+    "AR": [429, 432],
+    "LA": [433, 439],
+    "OK": [440, 448],
+    "TX": [449, 467],
+    "MN": [468, 477],
+    "IA": [478, 485],
+    "MO": [486, 500],
+    "ND": [501, 502],
+    "SD": [503, 504],
+    "NE": [505, 508],
+    "KS": [509, 515],
+    "MT": [516, 517],
+    "ID": [518, 519],
+    "WY": [520, 520],
+    "CO": [521, 524],
+    "NM": [525, 527],
+    "AZ": [526, 527],
+    "UT": [528, 529],
+    "NV": [530, 530],
+    "WA": [531, 539],
+    "OR": [540, 544],
+    "CA": [545, 573],
+    "AK": [574, 574],
+    "HI": [575, 576],
+    "DC": [577, 579],
+    "VI": [580, 580],
+    "PR": [580, 599],
+    "GU": [586, 586],
+    "AS": [586, 586],
+}
+def generate_ssn(state: str, birth_date: date) -> str:
+    """
+    Generate a synthetic SSN based on state and birth date.
+    For birth dates before June 25, 2011 (SSN_RANDOMIZATION_DATE), the first
+    three digits (area number) come from the range mapped to `state` with a
+    70% chance. In the remaining 30% they come from the range of a uniformly
+    chosen entry of STATE_TO_AREA_SSN, modelling a person born in a different
+    state. A state without a mapping, or a birth date on or after the cutoff,
+    uses the full 001-899 range. Area number 666 is never produced.
+    Args:
+        state (str): Two-letter state code (e.g., "NY", "CA")
+        birth_date (date): Date of birth
+    Returns:
+        str: A formatted synthetic SSN in the format "XXX-XX-XXXX"
+    """
+    if birth_date < SSN_RANDOMIZATION_DATE:
+        if random.random() < 0.3:
+            # Maybe born in a different state
+            area_range = random.choice(list(STATE_TO_AREA_SSN.values()))
+        else:
+            # Without this else the state lookup always overwrote the branch above.
+            area_range = STATE_TO_AREA_SSN.get(state, [1, 899])
+    else:
+        area_range = [1, 899]
+    area = 666
+    while area == 666:
+        # Unallowed area code
+        area = random.randint(area_range[0], area_range[1])
+    # Group number
+    group = random.randint(1, 99)
+    # Serial number
+    serial = random.randint(1, 9999)
+    return f"{area:03d}-{group:02d}-{serial:04d}"

data_designer/engine/sampling_gen/entities/person.py ADDED Viewed

@@ -0,0 +1,142 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from datetime import date, timedelta
+import random
+from typing import Any, Literal, TypeAlias
+from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
+from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
+from data_designer.engine.resources.managed_dataset_repository import load_managed_dataset_repository
+from data_designer.engine.resources.managed_storage import ManagedBlobStorage
+from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import (
+    PERSONA_FIELDS,
+    PII_FIELDS,
+    REQUIRED_FIELDS,
+)
+from data_designer.engine.sampling_gen.entities.email_address_utils import get_email_address
+from data_designer.engine.sampling_gen.entities.errors import MissingPersonFieldsError
+from data_designer.engine.sampling_gen.entities.national_id_utils import generate_ssn
+from data_designer.engine.sampling_gen.entities.phone_number import PhoneNumber
+from data_designer.engine.sampling_gen.errors import DatasetNotAvailableForLocaleError
+# Type alias restricting a sex value to one of the two literal strings below.
+SexT: TypeAlias = Literal["Male", "Female"]
+def _years_before(anchor: date, years: int) -> date:
+    """Return `anchor` moved back by `years` calendar years, mapping Feb 29 to Feb 28 when needed."""
+    try:
+        return anchor.replace(year=anchor.year - years)
+    except ValueError:
+        # Only the leap-day case is recoverable; any other failure (e.g. a year
+        # out of range) is re-raised unchanged.
+        if (anchor.month, anchor.day) != (2, 29):
+            raise
+        return anchor.replace(year=anchor.year - years, day=28)
+def convert_age_to_birth_date(age: int) -> date:
+    """
+    Sample a birth date for which a person is exactly `age` years old today.
+    The date is drawn uniformly from the window that starts the day after
+    "today minus (age + 1) years" and ends on "today minus age years".
+    Args:
+        age: Age in completed years.
+    Returns:
+        A random birth date consistent with `age` as of date.today().
+    """
+    today = date.today()
+    start_date = _years_before(today, age + 1)
+    end_date = _years_before(today, age)
+    days_between = (end_date - start_date).days
+    # Start at 1: someone born exactly on start_date turns age + 1 today.
+    random_days = random.randint(1, days_between)
+    return start_date + timedelta(days=random_days)
+def generate_email_address(
+    first_name: str,
+    middle_name: str | None,
+    last_name: str,
+    age: int,
+    birth_date: date,
+) -> str | None:
+    """
+    Produce an email address for an adult, or None for anyone under 18.
+    Adults are delegated to get_email_address with the same arguments.
+    """
+    if not age < 18:
+        return get_email_address(
+            first_name=first_name,
+            middle_name=middle_name,
+            last_name=last_name,
+            age=age,
+            birth_date=birth_date,
+        )
+    # Children do not get an email address.
+    return None
+def get_national_id(locale: str | None, region: str | None, birth_date: date) -> str | None:
+    """Return a synthetic SSN for an en_US record with a known region, otherwise None."""
+    if locale != "en_US" or region is None:
+        return None
+    return generate_ssn(state=region, birth_date=birth_date)
+def generate_phone_number(locale: str, age: int, postcode: str | None, style: str = "dash") -> str | None:
+    """
+    Produce a formatted phone number whose area code is tied to the postcode.
+    Returns None when the locale is not en_US, the person is under 18, or no
+    postcode is available.
+    """
+    if locale != "en_US" or age < 18 or postcode is None:
+        return None
+    roll = random.random()
+    if roll >= 0.9:
+        # Random (population-weighted) area code 10% of the time
+        return PhoneNumber.generate().format(style=style)
+    if roll < 0.6:
+        # Exact match to postcode 60% of the time
+        zip_prefix = postcode
+    elif roll < 0.8:
+        # Nearby postcodes 20% of the time
+        zip_prefix = postcode[:4]
+    else:
+        # More distant postcodes 10% of the time
+        zip_prefix = postcode[:3]
+    return PhoneNumber.from_zip_prefix(zip_prefix).format(style=style)
+def generate_and_insert_derived_fields(person_record: dict[str, Any]) -> dict[str, str | None]:
+    """
+    Add the derived fields to a person record and return the selected output fields.
+    The record is updated in place with birth_date, phone_number, email_address
+    and national_id. For en_US records "region" is renamed to "state" when no
+    "state" key exists yet. The returned dict holds the PII fields, then the
+    state and derived contact fields, then the persona fields.
+    Raises:
+        MissingPersonFieldsError: If a required field is absent from the record.
+    """
+    _verify_required_fields(person_record)
+    age = person_record.get("age")
+    locale = person_record.get("locale")
+    birth_date = convert_age_to_birth_date(age)
+    # Note: All data must be serializable to JSON.
+    derived: dict[str, Any] = {"birth_date": birth_date.isoformat()}
+    # Generation order is phone, email, national id; each step draws random numbers.
+    derived["phone_number"] = generate_phone_number(
+        locale=locale,
+        age=age,
+        postcode=person_record.get("postcode"),
+    )
+    derived["email_address"] = generate_email_address(
+        first_name=person_record.get("first_name"),
+        middle_name=person_record.get("middle_name"),
+        last_name=person_record.get("last_name"),
+        age=age,
+        birth_date=birth_date,
+    )
+    derived["national_id"] = get_national_id(
+        locale=locale,
+        region=person_record.get("region"),
+        birth_date=birth_date,
+    )
+    person_record.update(derived)
+    if locale == "en_US" and "region" in person_record and "state" not in person_record:
+        person_record["state"] = person_record.pop("region")
+    # A key matched by more than one group keeps the position of its first match.
+    selected: dict[str, Any] = {}
+    field_groups = (
+        PII_FIELDS,
+        ["state", "phone_number", "email_address", "national_id"],
+        PERSONA_FIELDS,
+    )
+    for group in field_groups:
+        for key, value in person_record.items():
+            if key in group:
+                selected[key] = value
+    return selected
+def load_person_data_sampler(blob_storage: ManagedBlobStorage, locale: str) -> ManagedDatasetGenerator:
+    """
+    Build a ManagedDatasetGenerator for the managed dataset named after `locale`.
+    Raises:
+        DatasetNotAvailableForLocaleError: If `locale` is not in LOCALES_WITH_MANAGED_DATASETS.
+    """
+    if locale in LOCALES_WITH_MANAGED_DATASETS:
+        repository = load_managed_dataset_repository(blob_storage, [locale])
+        return ManagedDatasetGenerator(managed_datasets=repository, dataset_name=locale)
+    raise DatasetNotAvailableForLocaleError(f"Locale {locale} is not supported by the managed dataset generator.")
+def _verify_required_fields(person_record: dict[str, Any]) -> None:
+    """Raise MissingPersonFieldsError unless every required field is a key of the record."""
+    absent = REQUIRED_FIELDS - set(person_record)
+    if not absent:
+        return
+    raise MissingPersonFieldsError(f"Person data is missing the following required fields: {absent}")

data_designer/engine/sampling_gen/entities/phone_number.py ADDED Viewed

@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+import random
+from typing import Optional
+import pandas as pd
+from pydantic import BaseModel, Field, field_validator
+# Lookup tables read once, at import time, from the parquet asset stored in the
+# "assets" directory next to this module.
+ZIP_AREA_CODE_DATA = pd.read_parquet(Path(__file__).parent / "assets" / "zip_area_code_map.parquet")
+# Maps each "zipcode" value to its "area_code" value.
+# NOTE(review): if a zipcode occurs in several rows, dict() keeps only the last one - confirm rows are unique.
+ZIPCODE_AREA_CODE_MAP = dict(zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["area_code"]))
+# Maps each "zipcode" value to its "count" value; get_area_code uses these as sampling weights.
+ZIPCODE_POPULATION_MAP = dict(zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["count"]))
+def get_area_code(zip_prefix: Optional[str] = None) -> str:
+    """
+    Sample an area code for the given ZIP code prefix, population-weighted.
+    Args:
+        zip_prefix: The prefix of a ZIP code, 5 digits or fewer. If None, sample from all ZIP codes.
+    Returns:
+        A sampled area code matching the prefix, population-weighted.
+    Raises:
+        ValueError: If a 5-character ZIP code is unknown, or no ZIP code starts with the prefix.
+    """
+    if zip_prefix is not None and len(zip_prefix) == 5:
+        # A full ZIP code is a direct lookup; no sampling is involved.
+        try:
+            return str(ZIPCODE_AREA_CODE_MAP[zip_prefix])
+        except KeyError as err:
+            raise ValueError(f"ZIP code {zip_prefix} not found.") from err
+    if zip_prefix is None:
+        candidates = ZIPCODE_POPULATION_MAP
+    else:
+        candidates = {z: c for z, c in ZIPCODE_POPULATION_MAP.items() if z.startswith(zip_prefix)}
+        # Check before sampling: unpacking zip(*[]) used to fail first with an
+        # unrelated "not enough values to unpack" error.
+        if not candidates:
+            raise ValueError(f"No ZIP codes found with prefix {zip_prefix}.")
+    zipcode = random.choices(list(candidates), weights=list(candidates.values()), k=1)[0]
+    return str(ZIPCODE_AREA_CODE_MAP[zipcode])
+class PhoneNumber(BaseModel):
+    """
+    A phone number object that supports various formatting styles.
+    All four parts are stored as digit-only strings; the country code is at
+    most three digits long and defaults to "1".
+    """
+    country_code: str = Field(default="1")
+    area_code: str
+    prefix: str  # First part of the local number
+    line_number: str  # Second part of the local number
+    @field_validator("country_code", "area_code", "prefix", "line_number")
+    @classmethod
+    def validate_digits(cls, v):
+        """Reject any part that is not made up solely of digits."""
+        if not v.isdigit():
+            raise ValueError("Must contain only digits")
+        return v
+    @field_validator("country_code")
+    @classmethod
+    def validate_country_code_length(cls, v):
+        """Reject country codes longer than three digits."""
+        max_length = 3
+        if len(v) > max_length:
+            raise ValueError(f"Country code {v} is longer than {max_length} digits")
+        return v
+    def format(self, style: str = "dash") -> str:
+        """
+        Format the phone number according to the specified style.
+        Args:
+            style: One of "dash", "parentheses", "dot", "space", "no_separation",
+                  "international_plus", "international"
+        Returns:
+            Formatted phone number string
+        Raises:
+            ValueError: If `style` is not one of the supported names.
+        """
+        if style == "dash":
+            formatted = f"{self.area_code}-{self.prefix}-{self.line_number}"
+        elif style == "parentheses":
+            formatted = f"({self.area_code}) {self.prefix}-{self.line_number}"
+        elif style == "dot":
+            formatted = f"{self.area_code}.{self.prefix}.{self.line_number}"
+        elif style == "space":
+            formatted = f"{self.area_code} {self.prefix} {self.line_number}"
+        elif style == "no_separation":
+            formatted = f"{self.area_code}{self.prefix}{self.line_number}"
+        elif style == "international_plus":
+            cc = self.country_code or "1"  # Default to US/Canada
+            formatted = f"+{cc} {self.area_code} {self.prefix} {self.line_number}"
+        elif style == "international":
+            cc = int(self.country_code or 1)  # Default to US/Canada
+            formatted = f"{cc:03d} {self.area_code} {self.prefix} {self.line_number}"
+        else:
+            raise ValueError(f"Unsupported format style: {style}")
+        return formatted
+    @classmethod
+    def from_area_code(cls, area_code: str) -> "PhoneNumber":
+        """Create a phone number with the given area code and a random local number."""
+        # random.randint includes both endpoints, so the upper bounds are 999 and
+        # 9999; the old bounds could yield a 4-digit prefix or a 5-digit line number.
+        prefix = str(random.randint(200, 999))
+        line_number = str(random.randint(0, 9999)).zfill(4)
+        # Use cls so that subclasses construct instances of their own type.
+        return cls(area_code=area_code, prefix=prefix, line_number=line_number)
+    @classmethod
+    def from_zip_prefix(cls, zip_prefix: str) -> "PhoneNumber":
+        """Create a phone number from the given ZIP code prefix."""
+        area_code = get_area_code(zip_prefix)
+        return cls.from_area_code(area_code)
+    @classmethod
+    def generate(cls) -> "PhoneNumber":
+        """Create a random valid US phone number."""
+        area_code = get_area_code()
+        return cls.from_area_code(area_code)
+    def __str__(self) -> str:
+        return self.format("dash")
+    def __repr__(self) -> str:
+        return f"PhoneNumber({str(self)})"

data_designer/engine/sampling_gen/errors.py ADDED Viewed

@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from data_designer.engine.errors import DataDesignerError
+class SamplingGenError(DataDesignerError):
+    """Root of the exception hierarchy for the sampling_gen library."""
+class RejectionSamplingError(SamplingGenError):
+    """Signals a failure during rejection sampling."""
+class DataConversionError(SamplingGenError):
+    """Signals a failure during data conversion."""
+class DatasetNotAvailableForLocaleError(SamplingGenError):
+    """Signals that no dataset is available for the requested locale."""
+class ManagedDatasetGeneratorError(SamplingGenError):
+    """Signals a failure in the managed dataset generator."""

data_designer/engine/sampling_gen/generator.py ADDED Viewed

@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+from collections.abc import Callable
+from typing import TYPE_CHECKING
+import networkx as nx
+import numpy as np
+import pandas as pd
+from data_designer.engine.sampling_gen.data_sources.base import RadomStateT
+from data_designer.engine.sampling_gen.errors import RejectionSamplingError
+from data_designer.engine.sampling_gen.jinja_utils import JinjaDataFrame
+from data_designer.engine.sampling_gen.people_gen import create_people_gen_resource
+from data_designer.engine.sampling_gen.schema import DataSchema
+from data_designer.engine.sampling_gen.utils import check_random_state
+if TYPE_CHECKING:
+    from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig
+    from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
+    from data_designer.engine.sampling_gen.column import ConditionalDataColumn
+class DatasetGenerator:
+    """Generates synthetic datasets based on the given schema definition.
+    This object generates synthetic data based on the schema using sampling-based
+    methods (implemented as "data sources"), including handling conditional generation
+    and enforcing constraints through rejection sampling.
+    Args:
+        sampler_columns: Sampler columns to generate. Only read when `schema` is None;
+            it then supplies the columns, the constraints and max_rejections_factor.
+        random_state: Random number generator or seed for reproducibility.
+        person_generator_loader: A function that loads a person generator. If None,
+            person generation will not be supported.
+        schema: Keyword-only. A ready-made DataSchema; when given it is used as-is
+            instead of building one from `sampler_columns`.
+        max_rejections_factor: Keyword-only. Only used when `schema` is given;
+            otherwise the value comes from `sampler_columns`.
+    Note:
+        The generator leverages the schema's DAG to topologically sort the columns
+        and uses rejection sampling to satisfy constraints. If constraints are too strict,
+        generation may fail with a RejectionSamplingError.
+    """
+    def __init__(
+        self,
+        sampler_columns: SamplerMultiColumnConfig | None,
+        random_state: RadomStateT | None = None,
+        person_generator_loader: Callable[[bool], ManagedDatasetGenerator] | None = None,
+        *,
+        schema: DataSchema | None = None,
+        max_rejections_factor: int = 5,
+    ):
+        # This is temporary while we need the legacy and refactored code to coexist.
+        if schema is not None:
+            self.schema = schema
+            self.max_rejections_factor = max_rejections_factor
+        else:
+            # NOTE(review): this branch dereferences sampler_columns, so passing None
+            # for both sampler_columns and schema fails here with AttributeError.
+            self.schema = DataSchema(
+                columns=[column.model_dump() for column in sampler_columns.columns],
+                constraints=sampler_columns.constraints,
+            )
+            self.max_rejections_factor = sampler_columns.max_rejections_factor
+        self.rng = check_random_state(random_state)
+        self._dag = self.schema.dag.to_networkx()
+        # Keyword arguments passed to every get_sampler / get_default_sampler call below.
+        self._shared_sampler_kwargs = {
+            "random_state": self.rng,
+            "people_gen_resource": create_people_gen_resource(self.schema, person_generator_loader),
+        }
+    def _round_if_needed(self, column: ConditionalDataColumn, df: pd.DataFrame) -> pd.DataFrame:
+        """Round the column's values when its params define a non-None `decimal_places`.
+        The column is reassigned on the passed DataFrame, which is also returned.
+        """
+        if hasattr(column.params, "decimal_places") and column.params.decimal_places is not None:
+            df[column.name] = df[column.name].round(column.params.decimal_places)
+        return df
+    def _run_rejection_sampling(self, df: pd.DataFrame, column: ConditionalDataColumn) -> pd.DataFrame:
+        """Sample `column` into `df`, resampling until every row passes the column's constraints.
+        Raises:
+            RejectionSamplingError: If the number of sampling passes exceeds
+                max_rejections_factor * len(df).
+        """
+        name = column.name
+        num_iterations = 0
+        num_samples = len(df)
+        # True for every row that still needs a value for this column.
+        needs_samples = np.ones(num_samples, dtype=bool)
+        while needs_samples.any():
+            for condition in column.conditions:
+                # Only rows still flagged in needs_samples are candidates for this condition.
+                index = JinjaDataFrame(condition).select_index(df[needs_samples])
+                src = column.get_sampler(condition, **self._shared_sampler_kwargs)
+                df = src.inject_data_column(df, name, index)
+            df[name] = column.get_default_sampler(**self._shared_sampler_kwargs).preproc(df[name], column.convert_to)
+            # Check all constraints on the column.
+            batch_mask = np.ones(num_samples, dtype=bool)
+            for constraint in self.schema.get_constraint_checkers(name):
+                batch_mask &= constraint.check(df)
+            # Rows that pass every constraint are done; the rest are resampled on the next pass.
+            needs_samples[batch_mask] = False
+            # Counts passes of this loop, not individual rejected rows.
+            num_iterations += 1
+            if num_iterations > self.max_rejections_factor * num_samples:
+                raise RejectionSamplingError(
+                    "Exceeded the maximum number of rejections (max_rejections_factor * "
+                    f"num_samples = {self.max_rejections_factor * num_samples}) while "
+                    f"sampling `{column.name}`. Please consider adjusting the constraints "
+                    "and/or column's generation configuration."
+                )
+        return df
+    def generate(self, num_samples: int) -> pd.DataFrame:
+        """Generate a DataFrame with `num_samples` rows and the schema's columns.
+        Columns are sampled in a topological order of the schema's DAG, then
+        post-processed and rounded per column. The result's columns are ordered
+        as in schema.column_names.
+        """
+        dataset = pd.DataFrame(index=range(num_samples))
+        for column_name in nx.topological_sort(self._dag):
+            column = self.schema.get_column(column_name)
+            dataset = self._run_rejection_sampling(dataset, column)
+        # Second pass: post-process and round each column once sampling is complete.
+        for column in self.schema.columns:
+            dataset[column.name] = column.get_default_sampler(**self._shared_sampler_kwargs).postproc(
+                dataset[column.name], column.convert_to
+            )
+            dataset = self._round_if_needed(column, dataset)
+        return dataset[self.schema.column_names]