data-designer-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/engine/__init__.py +2 -0
- data_designer/engine/_version.py +34 -0
- data_designer/engine/analysis/column_profilers/base.py +49 -0
- data_designer/engine/analysis/column_profilers/judge_score_profiler.py +153 -0
- data_designer/engine/analysis/column_profilers/registry.py +22 -0
- data_designer/engine/analysis/column_statistics.py +145 -0
- data_designer/engine/analysis/dataset_profiler.py +149 -0
- data_designer/engine/analysis/errors.py +9 -0
- data_designer/engine/analysis/utils/column_statistics_calculations.py +234 -0
- data_designer/engine/analysis/utils/judge_score_processing.py +132 -0
- data_designer/engine/column_generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/__init__.py +2 -0
- data_designer/engine/column_generators/generators/base.py +122 -0
- data_designer/engine/column_generators/generators/embedding.py +35 -0
- data_designer/engine/column_generators/generators/expression.py +55 -0
- data_designer/engine/column_generators/generators/llm_completion.py +116 -0
- data_designer/engine/column_generators/generators/samplers.py +69 -0
- data_designer/engine/column_generators/generators/seed_dataset.py +144 -0
- data_designer/engine/column_generators/generators/validation.py +140 -0
- data_designer/engine/column_generators/registry.py +60 -0
- data_designer/engine/column_generators/utils/errors.py +15 -0
- data_designer/engine/column_generators/utils/generator_classification.py +43 -0
- data_designer/engine/column_generators/utils/judge_score_factory.py +58 -0
- data_designer/engine/column_generators/utils/prompt_renderer.py +100 -0
- data_designer/engine/compiler.py +97 -0
- data_designer/engine/configurable_task.py +71 -0
- data_designer/engine/dataset_builders/artifact_storage.py +283 -0
- data_designer/engine/dataset_builders/column_wise_builder.py +354 -0
- data_designer/engine/dataset_builders/errors.py +15 -0
- data_designer/engine/dataset_builders/multi_column_configs.py +46 -0
- data_designer/engine/dataset_builders/utils/__init__.py +2 -0
- data_designer/engine/dataset_builders/utils/concurrency.py +212 -0
- data_designer/engine/dataset_builders/utils/config_compiler.py +62 -0
- data_designer/engine/dataset_builders/utils/dag.py +62 -0
- data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +200 -0
- data_designer/engine/dataset_builders/utils/errors.py +15 -0
- data_designer/engine/dataset_builders/utils/progress_tracker.py +122 -0
- data_designer/engine/errors.py +51 -0
- data_designer/engine/model_provider.py +77 -0
- data_designer/engine/models/__init__.py +2 -0
- data_designer/engine/models/errors.py +300 -0
- data_designer/engine/models/facade.py +284 -0
- data_designer/engine/models/factory.py +42 -0
- data_designer/engine/models/litellm_overrides.py +179 -0
- data_designer/engine/models/parsers/__init__.py +2 -0
- data_designer/engine/models/parsers/errors.py +34 -0
- data_designer/engine/models/parsers/parser.py +235 -0
- data_designer/engine/models/parsers/postprocessors.py +93 -0
- data_designer/engine/models/parsers/tag_parsers.py +62 -0
- data_designer/engine/models/parsers/types.py +84 -0
- data_designer/engine/models/recipes/base.py +81 -0
- data_designer/engine/models/recipes/response_recipes.py +293 -0
- data_designer/engine/models/registry.py +151 -0
- data_designer/engine/models/telemetry.py +362 -0
- data_designer/engine/models/usage.py +73 -0
- data_designer/engine/models/utils.py +101 -0
- data_designer/engine/processing/ginja/__init__.py +2 -0
- data_designer/engine/processing/ginja/ast.py +65 -0
- data_designer/engine/processing/ginja/environment.py +463 -0
- data_designer/engine/processing/ginja/exceptions.py +56 -0
- data_designer/engine/processing/ginja/record.py +32 -0
- data_designer/engine/processing/gsonschema/__init__.py +2 -0
- data_designer/engine/processing/gsonschema/exceptions.py +15 -0
- data_designer/engine/processing/gsonschema/schema_transformers.py +83 -0
- data_designer/engine/processing/gsonschema/types.py +10 -0
- data_designer/engine/processing/gsonschema/validators.py +202 -0
- data_designer/engine/processing/processors/base.py +13 -0
- data_designer/engine/processing/processors/drop_columns.py +42 -0
- data_designer/engine/processing/processors/registry.py +25 -0
- data_designer/engine/processing/processors/schema_transform.py +71 -0
- data_designer/engine/processing/utils.py +169 -0
- data_designer/engine/registry/base.py +99 -0
- data_designer/engine/registry/data_designer_registry.py +39 -0
- data_designer/engine/registry/errors.py +12 -0
- data_designer/engine/resources/managed_dataset_generator.py +39 -0
- data_designer/engine/resources/managed_dataset_repository.py +197 -0
- data_designer/engine/resources/managed_storage.py +65 -0
- data_designer/engine/resources/resource_provider.py +77 -0
- data_designer/engine/resources/seed_reader.py +154 -0
- data_designer/engine/sampling_gen/column.py +91 -0
- data_designer/engine/sampling_gen/constraints.py +100 -0
- data_designer/engine/sampling_gen/data_sources/base.py +217 -0
- data_designer/engine/sampling_gen/data_sources/errors.py +12 -0
- data_designer/engine/sampling_gen/data_sources/sources.py +347 -0
- data_designer/engine/sampling_gen/entities/__init__.py +2 -0
- data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +90 -0
- data_designer/engine/sampling_gen/entities/email_address_utils.py +171 -0
- data_designer/engine/sampling_gen/entities/errors.py +10 -0
- data_designer/engine/sampling_gen/entities/national_id_utils.py +102 -0
- data_designer/engine/sampling_gen/entities/person.py +144 -0
- data_designer/engine/sampling_gen/entities/phone_number.py +128 -0
- data_designer/engine/sampling_gen/errors.py +26 -0
- data_designer/engine/sampling_gen/generator.py +122 -0
- data_designer/engine/sampling_gen/jinja_utils.py +64 -0
- data_designer/engine/sampling_gen/people_gen.py +199 -0
- data_designer/engine/sampling_gen/person_constants.py +56 -0
- data_designer/engine/sampling_gen/schema.py +147 -0
- data_designer/engine/sampling_gen/schema_builder.py +61 -0
- data_designer/engine/sampling_gen/utils.py +46 -0
- data_designer/engine/secret_resolver.py +82 -0
- data_designer/engine/testing/__init__.py +12 -0
- data_designer/engine/testing/stubs.py +133 -0
- data_designer/engine/testing/utils.py +20 -0
- data_designer/engine/validation.py +367 -0
- data_designer/engine/validators/__init__.py +19 -0
- data_designer/engine/validators/base.py +38 -0
- data_designer/engine/validators/local_callable.py +39 -0
- data_designer/engine/validators/python.py +254 -0
- data_designer/engine/validators/remote.py +89 -0
- data_designer/engine/validators/sql.py +65 -0
- data_designer_engine-0.4.0.dist-info/METADATA +50 -0
- data_designer_engine-0.4.0.dist-info/RECORD +114 -0
- data_designer_engine-0.4.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
This file contains all possible fields that:
|
|
6
|
+
|
|
7
|
+
1. Exist in a managed PII + persona dataset
|
|
8
|
+
2. Are included in the final generated dataset
|
|
9
|
+
|
|
10
|
+
Do not add any other code or logic in this file.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
# Keys that every person record must provide. person.py's _verify_required_fields
# raises MissingPersonFieldsError when any of them is absent from a record.
REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"}
|
|
16
|
+
|
|
17
|
+
# Names of the PII / demographic fields, grouped by the locale that provides them.
# person.py's generate_and_insert_derived_fields keeps a record key in its result
# only when it is listed here or in PERSONA_FIELDS.
PII_FIELDS = [
    # Core demographic fields
    "uuid",
    "first_name",
    "middle_name",
    "last_name",
    "sex",
    "age",
    "birth_date",
    "marital_status",
    "postcode",
    "city",
    "region",
    "country",
    "locale",
    "bachelors_field",
    "education_level",
    "occupation",
    "national_id",
    # US-specific fields
    "street_name",
    "street_number",
    "unit",
    "state",
    "email_address",
    "phone_number",
    # Brazil-specific fields
    "race",
    # Japan-specific fields
    "area",
    "prefecture",
    "zone",
    # Brazil and India shared fields
    "religion",
    # India-specific fields
    "district",
    "education_degree",
    "first_language",
    "second_language",
    "third_language",
]
|
|
58
|
+
|
|
59
|
+
# Names of the persona (free-text / trait) fields, grouped by the locale that
# provides them. person.py's generate_and_insert_derived_fields keeps a record key
# in its result only when it is listed here or in PII_FIELDS.
PERSONA_FIELDS = [
    # Core persona fields
    "persona",
    "career_goals_and_ambitions",
    "arts_persona",
    "culinary_persona",
    "cultural_background",
    "detailed_persona",
    "finance_persona",
    "healthcare_persona",
    "hobbies_and_interests_list",
    "hobbies_and_interests",
    "professional_persona",
    "skills_and_expertise_list",
    "skills_and_expertise",
    "sports_persona",
    "travel_persona",
    "openness",
    "conscientiousness",
    "extraversion",
    "agreeableness",
    "neuroticism",
    # Japan-specific persona fields
    "aspects",
    "digital_skills",
    # Brazil and India shared persona fields
    "religious_persona",
    "religious_background",
    # India-specific persona fields
    "linguistic_persona",
    "linguistic_background",
]
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import random
|
|
7
|
+
import re
|
|
8
|
+
from datetime import date
|
|
9
|
+
|
|
10
|
+
import anyascii
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_email_address(
    first_name: str,
    middle_name: str | None,
    last_name: str,
    age: int,
    birth_date: date,
) -> str:
    """
    Build a synthetic email address from a person's name, age, and birth date.

    The result has the shape ``<basename><suffix>@<domain>``: the domain is
    sampled from the person's age band, the basename is derived from the names,
    and the (possibly empty) suffix is derived from the birth date.
    """
    # The helpers are called in the order domain, basename, suffix; each one
    # consumes random numbers, so the order is kept fixed for seeded runs.
    email_domain = get_email_domain_by_age(age)
    local_part = get_email_basename_by_name(first_name, middle_name, last_name)
    local_part += get_email_suffix_by_birth_date(birth_date)

    return local_part + "@" + email_domain
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_email_domain_by_age(age: int) -> str:
    """
    Sample a free email domain, weighted by provider popularity within the
    caller's age band (under 30, 30 to 49, 50 and over).
    """
    # Common free email domains
    # Source: https://www.sellcell.com/blog/most-popular-email-provider-by-number-of-users/
    # Split heuristically into age demographics
    # Also adjusted to maintain the approximate 38/27/35 split between these groups
    weights_under_30 = {
        "gmail.com": 710,  # gmail.com total: 1500
        "icloud.com": 300,  # icloud.com total: 850
        "outlook.com": 50,  # outlook.com total: 200
        "hotmail.com": 40,  # hotmail.com total: 200
        "yahoo.com": 35,  # yahoo.com total: 230
        "protonmail.com": 20,  # protonmail.com total: 50
        "zoho.com": 3,  # zoho.com total: 15
        "gmx.com": 3,  # gmx.com total: 11
        "aol.com": 0.1,  # aol.com total: 1.5
    }
    weights_30_to_49 = {
        "gmail.com": 360,
        "icloud.com": 270,
        "outlook.com": 60,
        "hotmail.com": 50,
        "yahoo.com": 60,
        "protonmail.com": 18,
        "zoho.com": 7,
        "gmx.com": 4,
        "aol.com": 0.3,
    }
    weights_50_plus = {
        "gmail.com": 430,
        "icloud.com": 280,
        "outlook.com": 90,
        "hotmail.com": 110,
        "yahoo.com": 135,
        "protonmail.com": 12,
        "zoho.com": 5,
        "gmx.com": 4,
        "aol.com": 1.1,
    }

    # Choose the weight table for the age band, then do one weighted draw.
    if age < 30:
        table = weights_under_30
    elif age < 50:
        table = weights_30_to_49
    else:
        table = weights_50_plus

    (domain,) = random.choices(list(table), weights=list(table.values()), k=1)
    return domain
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def get_email_basename_by_name(first_name: str, middle_name: str | None, last_name: str) -> str:
    """
    Get an email address basename heuristically dependent on the person's names.

    Every name is transliterated to ASCII, lowercased, and stripped of all
    characters outside ``[a-z0-9]`` before it is used.

    Patterns include:
    - firstname.lastname
    - firstnamelastname
    - firstinitiallastname
    - firstname_lastname
    - lastnamefirstinitial
    - firstnamelastinitial
    - firstnamemiddlename
    - firstnamemiddleinitiallastname
    - firstnamemiddlenamelastname

    The three middle-name patterns are offered only when ``middle_name`` still
    has at least one character left after normalization; otherwise the function
    behaves as if no middle name had been given.

    Args:
        first_name: Given name; must keep at least one alphanumeric character.
        middle_name: Optional middle name.
        last_name: Family name; must keep at least one alphanumeric character.

    Returns:
        One basename pattern, sampled with fixed per-pattern weights.

    Raises:
        AssertionError: If the first or last name is empty after normalization.
    """

    def normalize(name: str) -> str:
        # Normalize names (lowercase, remove spaces and special chars)
        return re.sub(r"[^a-z0-9]", "", anyascii.anyascii(name).lower())

    first = normalize(first_name)
    last = normalize(last_name)
    # Kept as an assert so existing callers still observe AssertionError.
    assert len(first) > 0 and len(last) > 0, (
        "Both first and last name must be non-empty, after removing non-alphanumeric."
    )
    first_initial = first[0]
    last_initial = last[0]

    # Generate username patterns
    username_patterns = [
        f"{first}.{last}",
        f"{first}{last}",
        f"{first_initial}{last}",
        f"{first}_{last}",
        f"{last}{first_initial}",
        f"{first}{last_initial}",
    ]
    # Higher probability for more common patterns
    pattern_weights = [0.3, 0.2, 0.15, 0.1, 0.15, 0.1]

    # A middle name made only of punctuation (e.g. "-") normalizes to "";
    # indexing it would raise IndexError, so require a non-empty result.
    middle = normalize(middle_name) if middle_name else ""
    if middle:
        middle_initial = middle[0]
        username_patterns.extend(
            [
                f"{first}{middle}",
                f"{first}{middle_initial}{last}",
                f"{first}{middle}{last}",
            ]
        )
        pattern_weights = [0.25, 0.17, 0.12, 0.08, 0.12, 0.08, 0.06, 0.06, 0.06]

    return random.choices(username_patterns, weights=pattern_weights, k=1)[0]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_email_suffix_by_birth_date(birth_date: date) -> str:
    """
    Sample the trailing part of an email basename from a person's birth date.

    Suffixes include:
    - Empty
    - Random 1-2 digit number
    - Last 2 digits of birth year
    - Full birth year
    - Birth day
    """
    year = birth_date.year

    # The random-number candidate is always drawn, before the weighted pick,
    # even when another candidate ends up being selected.
    random_number = random.randint(1, 99)

    # Candidates in the same order as the weights below (could be empty).
    candidates = (
        "",
        str(random_number),
        f"{year % 100:02d}",
        str(year),
        str(birth_date.day),
    )
    candidate_weights = [0.4, 0.3, 0.1, 0.1, 0.1]

    (suffix,) = random.choices(candidates, weights=candidate_weights, k=1)
    return suffix
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from data_designer.errors import DataDesignerError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MissingPersonFieldsError(DataDesignerError):
    """Raised when a person record lacks one or more required fields."""
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import random
|
|
7
|
+
from datetime import date
|
|
8
|
+
|
|
9
|
+
# Births before this date get a state-based area number in generate_ssn;
# births on or after it draw the area number from the whole allowed range.
SSN_RANDOMIZATION_DATE = date(2011, 6, 25)

# Area number mapping by state code (pre-2011)
# Each value is an inclusive [low, high] range of area numbers.
STATE_TO_AREA_SSN = {
    "NH": [1, 3],
    "ME": [4, 7],
    "VT": [8, 9],
    "MA": [10, 34],
    "RI": [35, 39],
    "CT": [40, 49],
    "NY": [50, 134],
    "NJ": [135, 158],
    "PA": [159, 211],
    "MD": [212, 220],
    "DE": [221, 222],
    "VA": [223, 231],
    "WV": [232, 236],
    "NC": [237, 246],
    "SC": [247, 251],
    "GA": [252, 260],
    "FL": [261, 267],
    "OH": [268, 302],
    "IN": [303, 317],
    "IL": [318, 361],
    "MI": [362, 386],
    "WI": [387, 399],
    "KY": [400, 407],
    "TN": [408, 415],
    "AL": [416, 424],
    "MS": [425, 428],
    "AR": [429, 432],
    "LA": [433, 439],
    "OK": [440, 448],
    "TX": [449, 467],
    "MN": [468, 477],
    "IA": [478, 485],
    "MO": [486, 500],
    "ND": [501, 502],
    "SD": [503, 504],
    "NE": [505, 508],
    "KS": [509, 515],
    "MT": [516, 517],
    "ID": [518, 519],
    "WY": [520, 520],
    "CO": [521, 524],
    # 525 only: 526-527 belong to Arizona, so the range must not extend into them.
    "NM": [525, 525],
    "AZ": [526, 527],
    "UT": [528, 529],
    "NV": [530, 530],
    "WA": [531, 539],
    "OR": [540, 544],
    "CA": [545, 573],
    "AK": [574, 574],
    "HI": [575, 576],
    "DC": [577, 579],
    "VI": [580, 580],
    "PR": [580, 599],
    "GU": [586, 586],
    "AS": [586, 586],
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def generate_ssn(state: str, birth_date: date) -> str:
    """
    Generate a synthetic SSN based on state and birth date.

    For people born before ``SSN_RANDOMIZATION_DATE`` (June 25, 2011) the
    first three digits (the area number) are drawn from the range mapped to
    ``state`` about 70% of the time, and from the range of a randomly chosen
    state otherwise. A ``state`` that is not in ``STATE_TO_AREA_SSN``, and any
    birth on or after that date, uses the whole 001-899 range. Area number 666
    is never produced.

    Args:
        state (str): Two-letter state code (e.g., "NY", "CA")
        birth_date (date): Date of birth

    Returns:
        str: A formatted synthetic SSN in the format "XXX-XX-XXXX"

    """
    if birth_date < SSN_RANDOMIZATION_DATE:
        # NOTE(review): the previous docstring quoted an 80% state match while
        # the threshold here is 0.3 (i.e. 70%) -- confirm the intended rate.
        if random.random() < 0.3:
            # Maybe born in a different state
            area_range = random.choice(list(STATE_TO_AREA_SSN.values()))
        else:
            # Without this else the random pick above was always overwritten.
            area_range = STATE_TO_AREA_SSN.get(state, [1, 899])
    else:
        area_range = [1, 899]

    # Redraw until the area number is not the unallowed 666.
    area = 666
    while area == 666:
        area = random.randint(area_range[0], area_range[1])
    # Group number
    group = random.randint(1, 99)
    # Serial number
    serial = random.randint(1, 9999)
    return f"{area:03d}-{group:02d}-{serial:04d}"
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import random
|
|
7
|
+
from datetime import date, timedelta
|
|
8
|
+
from typing import Any, Literal, TypeAlias
|
|
9
|
+
|
|
10
|
+
from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
|
|
11
|
+
from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
|
|
12
|
+
from data_designer.engine.resources.managed_dataset_repository import load_managed_dataset_repository
|
|
13
|
+
from data_designer.engine.resources.managed_storage import ManagedBlobStorage
|
|
14
|
+
from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import (
|
|
15
|
+
PERSONA_FIELDS,
|
|
16
|
+
PII_FIELDS,
|
|
17
|
+
REQUIRED_FIELDS,
|
|
18
|
+
)
|
|
19
|
+
from data_designer.engine.sampling_gen.entities.email_address_utils import get_email_address
|
|
20
|
+
from data_designer.engine.sampling_gen.entities.errors import MissingPersonFieldsError
|
|
21
|
+
from data_designer.engine.sampling_gen.entities.national_id_utils import generate_ssn
|
|
22
|
+
from data_designer.engine.sampling_gen.entities.phone_number import PhoneNumber
|
|
23
|
+
from data_designer.engine.sampling_gen.errors import DatasetNotAvailableForLocaleError
|
|
24
|
+
|
|
25
|
+
# Type alias restricting a value to the two literal strings "Male" and "Female".
SexT: TypeAlias = Literal["Male", "Female"]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def convert_age_to_birth_date(age: int) -> date:
|
|
29
|
+
today = date.today()
|
|
30
|
+
start_date = today.replace(year=today.year - age - 1)
|
|
31
|
+
end_date = today.replace(year=today.year - age)
|
|
32
|
+
days_between = (end_date - start_date).days
|
|
33
|
+
random_days = random.randint(0, days_between)
|
|
34
|
+
birthdate = start_date + timedelta(days=random_days)
|
|
35
|
+
return birthdate
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def generate_email_address(
|
|
39
|
+
first_name: str,
|
|
40
|
+
middle_name: str | None,
|
|
41
|
+
last_name: str,
|
|
42
|
+
age: int,
|
|
43
|
+
birth_date: date,
|
|
44
|
+
) -> str | None:
|
|
45
|
+
"""
|
|
46
|
+
Generate an email address based on the person's attributes.
|
|
47
|
+
Email address is None for children. Uses common free email domains.
|
|
48
|
+
"""
|
|
49
|
+
if age < 18:
|
|
50
|
+
return None
|
|
51
|
+
return get_email_address(
|
|
52
|
+
first_name=first_name,
|
|
53
|
+
middle_name=middle_name,
|
|
54
|
+
last_name=last_name,
|
|
55
|
+
age=age,
|
|
56
|
+
birth_date=birth_date,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_national_id(locale: str | None, region: str | None, birth_date: date) -> str | None:
|
|
61
|
+
if locale != "en_US":
|
|
62
|
+
return None
|
|
63
|
+
if region is None:
|
|
64
|
+
return None
|
|
65
|
+
return generate_ssn(state=region, birth_date=birth_date)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def generate_phone_number(locale: str, age: int, postcode: str | None, style: str = "dash") -> str | None:
|
|
69
|
+
"""
|
|
70
|
+
Generate a phone number correlated with location (postcode).
|
|
71
|
+
Phone number is None for children.
|
|
72
|
+
"""
|
|
73
|
+
if locale != "en_US":
|
|
74
|
+
return None
|
|
75
|
+
if age < 18:
|
|
76
|
+
return None
|
|
77
|
+
if postcode is None:
|
|
78
|
+
return None
|
|
79
|
+
locality_var = random.random()
|
|
80
|
+
if locality_var < 0.6:
|
|
81
|
+
# Exact match to postcode 60% of the time
|
|
82
|
+
return PhoneNumber.from_zip_prefix(postcode).format(style=style)
|
|
83
|
+
elif locality_var < 0.8:
|
|
84
|
+
# Nearby postcodes 20% of the time
|
|
85
|
+
return PhoneNumber.from_zip_prefix(postcode[:4]).format(style=style)
|
|
86
|
+
elif locality_var < 0.9:
|
|
87
|
+
# More distant postcodes 10% of the time
|
|
88
|
+
return PhoneNumber.from_zip_prefix(postcode[:3]).format(style=style)
|
|
89
|
+
# Random (population-weighted) area code 10% of the time
|
|
90
|
+
return PhoneNumber.generate().format(style=style)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def generate_and_insert_derived_fields(person_record: dict[str, Any]) -> dict[str, str | None]:
    """
    Add the derived fields to a person record and return its known fields.

    ``person_record`` is modified in place: ``birth_date``, ``phone_number``,
    ``email_address`` and ``national_id`` are set, and for ``en_US`` records a
    ``region`` key is renamed to ``state`` when no ``state`` key exists yet.

    Returns:
        A new dict holding only the keys listed in ``PII_FIELDS`` or
        ``PERSONA_FIELDS`` (plus the derived contact/ID keys), PII keys first.

    Raises:
        MissingPersonFieldsError: If a required field is missing.
    """
    _verify_required_fields(person_record)

    age = person_record.get("age")
    locale = person_record.get("locale")
    birth_date = convert_age_to_birth_date(age)

    # Every derived value is computed before the record is touched, in the
    # order phone number, email address, national id.
    phone_number = generate_phone_number(
        locale=locale,
        age=age,
        postcode=person_record.get("postcode"),
    )
    email_address = generate_email_address(
        first_name=person_record.get("first_name"),
        middle_name=person_record.get("middle_name"),
        last_name=person_record.get("last_name"),
        age=age,
        birth_date=birth_date,
    )
    national_id = get_national_id(
        locale=locale,
        region=person_record.get("region"),
        birth_date=birth_date,
    )

    # Note: All data must be serializable to JSON.
    person_record.update(
        birth_date=birth_date.isoformat(),
        phone_number=phone_number,
        email_address=email_address,
        national_id=national_id,
    )

    if locale == "en_US" and "region" in person_record and "state" not in person_record:
        person_record["state"] = person_record.pop("region")

    # Merge three filtered views of the record, in this order; a key seen in
    # an earlier view keeps its original position.
    selected: dict[str, Any] = {}
    for allowed in (
        PII_FIELDS,
        ["state", "phone_number", "email_address", "national_id"],
        PERSONA_FIELDS,
    ):
        selected.update((key, value) for key, value in person_record.items() if key in allowed)
    return selected
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def load_person_data_sampler(blob_storage: ManagedBlobStorage, locale: str) -> ManagedDatasetGenerator:
    """
    Create a ``ManagedDatasetGenerator`` for the managed dataset of ``locale``.

    Raises:
        DatasetNotAvailableForLocaleError: If ``locale`` is not listed in
            ``LOCALES_WITH_MANAGED_DATASETS``.
    """
    if locale in LOCALES_WITH_MANAGED_DATASETS:
        repository = load_managed_dataset_repository(blob_storage, [locale])
        return ManagedDatasetGenerator(managed_datasets=repository, dataset_name=locale)

    raise DatasetNotAvailableForLocaleError(f"Locale {locale} is not supported by the managed dataset generator.")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _verify_required_fields(person_record: dict[str, Any]) -> None:
    """Raise MissingPersonFieldsError unless every required field is a key of the record."""
    present_fields = set(person_record.keys())
    missing_fields = REQUIRED_FIELDS - present_fields
    if not missing_fields:
        return
    raise MissingPersonFieldsError(f"Person data is missing the following required fields: {missing_fields}")
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import random
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field, field_validator
|
|
11
|
+
|
|
12
|
+
from data_designer.lazy_heavy_imports import pd
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
# Table loaded from the parquet asset that ships next to this module; the code
# below reads its "zipcode", "area_code" and "count" columns.
ZIP_AREA_CODE_DATA = pd.read_parquet(Path(__file__).parent.joinpath("assets", "zip_area_code_map.parquet"))
# ZIP code -> area code.
ZIPCODE_AREA_CODE_MAP = {
    zipcode: area_code for zipcode, area_code in zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["area_code"])
}
# ZIP code -> "count" column, used by get_area_code as the sampling weight.
ZIPCODE_POPULATION_MAP = {
    zipcode: count for zipcode, count in zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["count"])
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_area_code(zip_prefix: str | None = None) -> str:
    """
    Sample an area code for the given ZIP code prefix, population-weighted.

    Args:
        zip_prefix: The prefix of a ZIP code, 5 digits or fewer. If None, sample from all ZIP codes.

    Returns:
        A sampled area code matching the prefix, population-weighted. A full
        5-character ZIP code is looked up directly instead of being sampled.

    Raises:
        ValueError: If a 5-character ZIP code is unknown, or no known ZIP code
            starts with ``zip_prefix``.
    """
    if zip_prefix is None:
        zipcodes, weights = zip(*ZIPCODE_POPULATION_MAP.items())
        zipcode = random.choices(zipcodes, weights=weights, k=1)[0]
        return str(ZIPCODE_AREA_CODE_MAP[zipcode])
    if len(zip_prefix) == 5:
        try:
            return str(ZIPCODE_AREA_CODE_MAP[zip_prefix])
        except KeyError as err:
            raise ValueError(f"ZIP code {zip_prefix} not found.") from err
    matching_zipcodes = [[z, c] for z, c in ZIPCODE_POPULATION_MAP.items() if z.startswith(zip_prefix)]
    # Check before unpacking: zip(*[]) yields nothing, so the two-name unpack
    # below would fail with an unrelated "not enough values" ValueError.
    if not matching_zipcodes:
        raise ValueError(f"No ZIP codes found with prefix {zip_prefix}.")
    zipcodes, weights = zip(*matching_zipcodes)
    zipcode = random.choices(zipcodes, weights=weights, k=1)[0]
    return str(ZIPCODE_AREA_CODE_MAP[zipcode])
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class PhoneNumber(BaseModel):
    """
    A phone number object that supports various formatting styles

    All four parts are stored as digit-only strings; see ``format`` for the
    supported output styles.
    """

    # Calling code, at most 3 digits; defaults to "1".
    country_code: str = Field(default="1")
    area_code: str
    prefix: str  # First part of the local number
    line_number: str  # Second part of the local number

    @field_validator("country_code", "area_code", "prefix", "line_number")
    @classmethod
    def validate_digits(cls, v):
        """Reject any part that contains a non-digit character."""
        if not v.isdigit():
            raise ValueError("Must contain only digits")
        return v

    @field_validator("country_code")
    @classmethod
    def validate_country_code_length(cls, v):
        """Reject country codes longer than three digits."""
        max_length = 3
        if len(v) > max_length:
            raise ValueError(f"Country code {v} is longer than {max_length} digits")
        return v

    def format(self, style: str = "dash") -> str:
        """
        Format the phone number according to the specified style.

        Args:
            style: One of "dash", "parentheses", "dot", "space", "no_separation",
                "international_plus", "international"

        Returns:
            Formatted phone number string

        Raises:
            ValueError: If ``style`` is not one of the supported styles.
        """
        if style == "dash":
            formatted = f"{self.area_code}-{self.prefix}-{self.line_number}"
        elif style == "parentheses":
            formatted = f"({self.area_code}) {self.prefix}-{self.line_number}"
        elif style == "dot":
            formatted = f"{self.area_code}.{self.prefix}.{self.line_number}"
        elif style == "space":
            formatted = f"{self.area_code} {self.prefix} {self.line_number}"
        elif style == "no_separation":
            formatted = f"{self.area_code}{self.prefix}{self.line_number}"
        elif style == "international_plus":
            cc = self.country_code or "1"  # Default to US/Canada
            formatted = f"+{cc} {self.area_code} {self.prefix} {self.line_number}"
        elif style == "international":
            cc = int(self.country_code or 1)  # Default to US/Canada
            # Country code is zero-padded to three digits in this style.
            formatted = f"{cc:03d} {self.area_code} {self.prefix} {self.line_number}"
        else:
            raise ValueError(f"Unsupported format style: {style}")

        return formatted

    @classmethod
    def from_area_code(cls, area_code: str) -> "PhoneNumber":
        """Create a phone number with the given area code and a random local part."""
        # random.randint includes both endpoints, so the upper bounds must be
        # 999 and 9999 to keep the prefix at 3 digits and the line at 4.
        prefix = str(random.randint(200, 999))
        line_number = str(random.randint(0, 9999)).zfill(4)
        return cls(area_code=area_code, prefix=prefix, line_number=line_number)

    @classmethod
    def from_zip_prefix(cls, zip_prefix: str) -> "PhoneNumber":
        """Create a phone number from the given ZIP code prefix."""
        area_code = get_area_code(zip_prefix)
        return cls.from_area_code(area_code)

    @classmethod
    def generate(cls) -> "PhoneNumber":
        """Create a random valid US phone number."""
        area_code = get_area_code()
        return cls.from_area_code(area_code)

    def __str__(self) -> str:
        return self.format("dash")

    def __repr__(self) -> str:
        return f"PhoneNumber({str(self)})"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from data_designer.engine.errors import DataDesignerError
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SamplingGenError(DataDesignerError):
    """Root of the exception hierarchy for the sampling_gen library."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RejectionSamplingError(SamplingGenError):
    """Raised for failures related to rejection sampling."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DataConversionError(SamplingGenError):
    """Raised for failures related to data conversion."""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DatasetNotAvailableForLocaleError(SamplingGenError):
    """Raised when no dataset is available for the requested locale."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ManagedDatasetGeneratorError(SamplingGenError):
    """Raised for failures related to the managed dataset generator."""
|