data-designer 0.3.8rc1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. data_designer/cli/commands/__init__.py +1 -1
  2. data_designer/interface/__init__.py +21 -1
  3. data_designer/{_version.py → interface/_version.py} +2 -2
  4. data_designer/interface/data_designer.py +8 -11
  5. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/METADATA +10 -42
  6. data_designer-0.4.0.dist-info/RECORD +39 -0
  7. data_designer/__init__.py +0 -17
  8. data_designer/config/__init__.py +0 -2
  9. data_designer/config/analysis/__init__.py +0 -2
  10. data_designer/config/analysis/column_profilers.py +0 -159
  11. data_designer/config/analysis/column_statistics.py +0 -421
  12. data_designer/config/analysis/dataset_profiler.py +0 -84
  13. data_designer/config/analysis/utils/errors.py +0 -10
  14. data_designer/config/analysis/utils/reporting.py +0 -192
  15. data_designer/config/base.py +0 -69
  16. data_designer/config/column_configs.py +0 -470
  17. data_designer/config/column_types.py +0 -141
  18. data_designer/config/config_builder.py +0 -595
  19. data_designer/config/data_designer_config.py +0 -40
  20. data_designer/config/dataset_builders.py +0 -13
  21. data_designer/config/dataset_metadata.py +0 -18
  22. data_designer/config/default_model_settings.py +0 -121
  23. data_designer/config/errors.py +0 -24
  24. data_designer/config/exports.py +0 -145
  25. data_designer/config/interface.py +0 -55
  26. data_designer/config/models.py +0 -455
  27. data_designer/config/preview_results.py +0 -41
  28. data_designer/config/processors.py +0 -148
  29. data_designer/config/run_config.py +0 -48
  30. data_designer/config/sampler_constraints.py +0 -52
  31. data_designer/config/sampler_params.py +0 -639
  32. data_designer/config/seed.py +0 -116
  33. data_designer/config/seed_source.py +0 -84
  34. data_designer/config/seed_source_types.py +0 -19
  35. data_designer/config/utils/code_lang.py +0 -82
  36. data_designer/config/utils/constants.py +0 -363
  37. data_designer/config/utils/errors.py +0 -21
  38. data_designer/config/utils/info.py +0 -94
  39. data_designer/config/utils/io_helpers.py +0 -258
  40. data_designer/config/utils/misc.py +0 -78
  41. data_designer/config/utils/numerical_helpers.py +0 -30
  42. data_designer/config/utils/type_helpers.py +0 -106
  43. data_designer/config/utils/visualization.py +0 -482
  44. data_designer/config/validator_params.py +0 -94
  45. data_designer/engine/__init__.py +0 -2
  46. data_designer/engine/analysis/column_profilers/base.py +0 -49
  47. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +0 -153
  48. data_designer/engine/analysis/column_profilers/registry.py +0 -22
  49. data_designer/engine/analysis/column_statistics.py +0 -145
  50. data_designer/engine/analysis/dataset_profiler.py +0 -149
  51. data_designer/engine/analysis/errors.py +0 -9
  52. data_designer/engine/analysis/utils/column_statistics_calculations.py +0 -234
  53. data_designer/engine/analysis/utils/judge_score_processing.py +0 -132
  54. data_designer/engine/column_generators/__init__.py +0 -2
  55. data_designer/engine/column_generators/generators/__init__.py +0 -2
  56. data_designer/engine/column_generators/generators/base.py +0 -122
  57. data_designer/engine/column_generators/generators/embedding.py +0 -35
  58. data_designer/engine/column_generators/generators/expression.py +0 -55
  59. data_designer/engine/column_generators/generators/llm_completion.py +0 -113
  60. data_designer/engine/column_generators/generators/samplers.py +0 -69
  61. data_designer/engine/column_generators/generators/seed_dataset.py +0 -144
  62. data_designer/engine/column_generators/generators/validation.py +0 -140
  63. data_designer/engine/column_generators/registry.py +0 -60
  64. data_designer/engine/column_generators/utils/errors.py +0 -15
  65. data_designer/engine/column_generators/utils/generator_classification.py +0 -43
  66. data_designer/engine/column_generators/utils/judge_score_factory.py +0 -58
  67. data_designer/engine/column_generators/utils/prompt_renderer.py +0 -100
  68. data_designer/engine/compiler.py +0 -97
  69. data_designer/engine/configurable_task.py +0 -71
  70. data_designer/engine/dataset_builders/artifact_storage.py +0 -283
  71. data_designer/engine/dataset_builders/column_wise_builder.py +0 -338
  72. data_designer/engine/dataset_builders/errors.py +0 -15
  73. data_designer/engine/dataset_builders/multi_column_configs.py +0 -46
  74. data_designer/engine/dataset_builders/utils/__init__.py +0 -2
  75. data_designer/engine/dataset_builders/utils/concurrency.py +0 -215
  76. data_designer/engine/dataset_builders/utils/config_compiler.py +0 -62
  77. data_designer/engine/dataset_builders/utils/dag.py +0 -62
  78. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +0 -200
  79. data_designer/engine/dataset_builders/utils/errors.py +0 -15
  80. data_designer/engine/errors.py +0 -51
  81. data_designer/engine/model_provider.py +0 -77
  82. data_designer/engine/models/__init__.py +0 -2
  83. data_designer/engine/models/errors.py +0 -300
  84. data_designer/engine/models/facade.py +0 -287
  85. data_designer/engine/models/factory.py +0 -42
  86. data_designer/engine/models/litellm_overrides.py +0 -179
  87. data_designer/engine/models/parsers/__init__.py +0 -2
  88. data_designer/engine/models/parsers/errors.py +0 -34
  89. data_designer/engine/models/parsers/parser.py +0 -235
  90. data_designer/engine/models/parsers/postprocessors.py +0 -93
  91. data_designer/engine/models/parsers/tag_parsers.py +0 -62
  92. data_designer/engine/models/parsers/types.py +0 -84
  93. data_designer/engine/models/recipes/base.py +0 -81
  94. data_designer/engine/models/recipes/response_recipes.py +0 -293
  95. data_designer/engine/models/registry.py +0 -146
  96. data_designer/engine/models/telemetry.py +0 -359
  97. data_designer/engine/models/usage.py +0 -73
  98. data_designer/engine/models/utils.py +0 -38
  99. data_designer/engine/processing/ginja/__init__.py +0 -2
  100. data_designer/engine/processing/ginja/ast.py +0 -65
  101. data_designer/engine/processing/ginja/environment.py +0 -463
  102. data_designer/engine/processing/ginja/exceptions.py +0 -56
  103. data_designer/engine/processing/ginja/record.py +0 -32
  104. data_designer/engine/processing/gsonschema/__init__.py +0 -2
  105. data_designer/engine/processing/gsonschema/exceptions.py +0 -15
  106. data_designer/engine/processing/gsonschema/schema_transformers.py +0 -83
  107. data_designer/engine/processing/gsonschema/types.py +0 -10
  108. data_designer/engine/processing/gsonschema/validators.py +0 -202
  109. data_designer/engine/processing/processors/base.py +0 -13
  110. data_designer/engine/processing/processors/drop_columns.py +0 -42
  111. data_designer/engine/processing/processors/registry.py +0 -25
  112. data_designer/engine/processing/processors/schema_transform.py +0 -49
  113. data_designer/engine/processing/utils.py +0 -169
  114. data_designer/engine/registry/base.py +0 -99
  115. data_designer/engine/registry/data_designer_registry.py +0 -39
  116. data_designer/engine/registry/errors.py +0 -12
  117. data_designer/engine/resources/managed_dataset_generator.py +0 -39
  118. data_designer/engine/resources/managed_dataset_repository.py +0 -197
  119. data_designer/engine/resources/managed_storage.py +0 -65
  120. data_designer/engine/resources/resource_provider.py +0 -77
  121. data_designer/engine/resources/seed_reader.py +0 -154
  122. data_designer/engine/sampling_gen/column.py +0 -91
  123. data_designer/engine/sampling_gen/constraints.py +0 -100
  124. data_designer/engine/sampling_gen/data_sources/base.py +0 -217
  125. data_designer/engine/sampling_gen/data_sources/errors.py +0 -12
  126. data_designer/engine/sampling_gen/data_sources/sources.py +0 -347
  127. data_designer/engine/sampling_gen/entities/__init__.py +0 -2
  128. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  129. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +0 -86
  130. data_designer/engine/sampling_gen/entities/email_address_utils.py +0 -171
  131. data_designer/engine/sampling_gen/entities/errors.py +0 -10
  132. data_designer/engine/sampling_gen/entities/national_id_utils.py +0 -102
  133. data_designer/engine/sampling_gen/entities/person.py +0 -144
  134. data_designer/engine/sampling_gen/entities/phone_number.py +0 -128
  135. data_designer/engine/sampling_gen/errors.py +0 -26
  136. data_designer/engine/sampling_gen/generator.py +0 -122
  137. data_designer/engine/sampling_gen/jinja_utils.py +0 -64
  138. data_designer/engine/sampling_gen/people_gen.py +0 -199
  139. data_designer/engine/sampling_gen/person_constants.py +0 -56
  140. data_designer/engine/sampling_gen/schema.py +0 -147
  141. data_designer/engine/sampling_gen/schema_builder.py +0 -61
  142. data_designer/engine/sampling_gen/utils.py +0 -46
  143. data_designer/engine/secret_resolver.py +0 -82
  144. data_designer/engine/validation.py +0 -367
  145. data_designer/engine/validators/__init__.py +0 -19
  146. data_designer/engine/validators/base.py +0 -38
  147. data_designer/engine/validators/local_callable.py +0 -39
  148. data_designer/engine/validators/python.py +0 -254
  149. data_designer/engine/validators/remote.py +0 -89
  150. data_designer/engine/validators/sql.py +0 -65
  151. data_designer/errors.py +0 -7
  152. data_designer/essentials/__init__.py +0 -33
  153. data_designer/lazy_heavy_imports.py +0 -54
  154. data_designer/logging.py +0 -163
  155. data_designer/plugin_manager.py +0 -78
  156. data_designer/plugins/__init__.py +0 -8
  157. data_designer/plugins/errors.py +0 -15
  158. data_designer/plugins/plugin.py +0 -141
  159. data_designer/plugins/registry.py +0 -88
  160. data_designer/plugins/testing/__init__.py +0 -10
  161. data_designer/plugins/testing/stubs.py +0 -116
  162. data_designer/plugins/testing/utils.py +0 -20
  163. data_designer-0.3.8rc1.dist-info/RECORD +0 -196
  164. data_designer-0.3.8rc1.dist-info/licenses/LICENSE +0 -201
  165. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/WHEEL +0 -0
  166. {data_designer-0.3.8rc1.dist-info → data_designer-0.4.0.dist-info}/entry_points.txt +0 -0
@@ -1,86 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- """
5
- This file contains all possible fields that:
6
-
7
- 1. Exist in a managed PII + persona dataset
8
- 2. Are included in the final generated dataset
9
-
10
- Do not add any other code or logic in this file.
11
- """
12
-
13
- from __future__ import annotations
14
-
15
# Minimal set of keys a person record is expected to provide.
REQUIRED_FIELDS = {
    "first_name",
    "last_name",
    "age",
    "locale",
}
16
-
17
# Ordered list of PII column names, grouped by the locale they apply to.
PII_FIELDS = [
    # --- Core demographic fields ---
    "uuid", "first_name", "middle_name", "last_name",
    "sex", "age", "birth_date", "marital_status",
    "postcode", "city", "region", "country", "locale",
    "bachelors_field", "education_level", "occupation",
    "national_id",
    # --- US-specific fields ---
    "street_name", "street_number", "unit", "state",
    "email_address", "phone_number",
    # --- Japan-specific fields ---
    "area", "prefecture", "zone",
    # --- India-specific fields ---
    "district", "religion", "education_degree",
    "first_language", "second_language", "third_language",
]
55
-
56
# Ordered list of persona column names, grouped by the locale they apply to.
PERSONA_FIELDS = [
    # --- Core persona fields ---
    "persona", "career_goals_and_ambitions",
    "arts_persona", "culinary_persona", "cultural_background",
    "detailed_persona", "finance_persona", "healthcare_persona",
    "hobbies_and_interests_list", "hobbies_and_interests",
    "professional_persona",
    "skills_and_expertise_list", "skills_and_expertise",
    "sports_persona", "travel_persona",
    "openness", "conscientiousness", "extraversion",
    "agreeableness", "neuroticism",
    # --- Japan-specific persona fields ---
    "aspects", "digital_skills",
    # --- India-specific persona fields ---
    "linguistic_persona", "religious_persona",
    "linguistic_background", "religious_background",
]
@@ -1,171 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import random
7
- import re
8
- from datetime import date
9
-
10
- import anyascii
11
-
12
-
13
def get_email_address(
    first_name: str,
    middle_name: str | None,
    last_name: str,
    age: int,
    birth_date: date,
) -> str:
    """
    Assemble a synthetic email address from a person's attributes.

    The domain is chosen from the age, the local part from the names, and
    the trailing suffix from the birth date (helpers are called in that order).
    """
    email_domain = get_email_domain_by_age(age)
    local_part = get_email_basename_by_name(first_name, middle_name, last_name)
    local_part += get_email_suffix_by_birth_date(birth_date)

    # <basename><suffix>@<domain>
    return "@".join((local_part, email_domain))
30
-
31
-
32
def get_email_domain_by_age(age: int) -> str:
    """
    Sample a free email domain, weighted by an age-bracket-specific table.

    Brackets are: under 30, 30 to 49, and 50 or older.
    """
    # Common free email domains
    # Source: https://www.sellcell.com/blog/most-popular-email-provider-by-number-of-users/
    # Split heuristically into age demographics
    # Also adjusted to maintain the approximate 38/27/35 split between these groups
    weights_under_30 = {
        "gmail.com": 710,  # gmail.com total: 1500
        "icloud.com": 300,  # icloud.com total: 850
        "outlook.com": 50,  # outlook.com total: 200
        "hotmail.com": 40,  # hotmail.com total: 200
        "yahoo.com": 35,  # yahoo.com total: 230
        "protonmail.com": 20,  # protonmail.com total: 50
        "zoho.com": 3,  # zoho.com total: 15
        "gmx.com": 3,  # gmx.com total: 11
        "aol.com": 0.1,  # aol.com total: 1.5
    }
    weights_30_to_49 = {
        "gmail.com": 360,
        "icloud.com": 270,
        "outlook.com": 60,
        "hotmail.com": 50,
        "yahoo.com": 60,
        "protonmail.com": 18,
        "zoho.com": 7,
        "gmx.com": 4,
        "aol.com": 0.3,
    }
    weights_50_plus = {
        "gmail.com": 430,
        "icloud.com": 280,
        "outlook.com": 90,
        "hotmail.com": 110,
        "yahoo.com": 135,
        "protonmail.com": 12,
        "zoho.com": 5,
        "gmx.com": 4,
        "aol.com": 1.1,
    }

    # Pick the bracket first, then draw once from the selected table.
    if age < 30:
        table = weights_under_30
    elif age < 50:
        table = weights_30_to_49
    else:
        table = weights_50_plus

    (picked,) = random.choices(list(table), weights=list(table.values()), k=1)
    return picked
94
-
95
-
96
def get_email_basename_by_name(first_name: str, middle_name: str | None, last_name: str) -> str:
    """
    Get an email address basename heuristically dependent on the person's name.

    Patterns include:
    - firstname.lastname
    - firstnamelastname
    - firstinitiallastname
    - firstname_lastname
    - lastnamefirstinitial
    - firstnamelastinitial
    - firstnamemiddlename
    - firstnamemiddleinitiallastname
    - firstnamemiddlenamelastname

    The three middle-name patterns are only offered when the middle name is
    still non-empty after normalization.

    Raises:
        AssertionError: If the first or last name is empty after normalization.
    """

    def _normalize(name: str) -> str:
        # Transliterate to ASCII, lowercase, then drop everything except a-z and 0-9.
        return re.sub(r"[^a-z0-9]", "", anyascii.anyascii(name).lower())

    first = _normalize(first_name)
    last = _normalize(last_name)
    assert len(first) > 0 and len(last) > 0, (
        "Both first and last name must be non-empty, after removing non-alphanumeric."
    )
    first_initial = first[0]
    last_initial = last[0]

    # Generate username patterns
    username_patterns = [
        f"{first}.{last}",
        f"{first}{last}",
        f"{first_initial}{last}",
        f"{first}_{last}",
        f"{last}{first_initial}",
        f"{first}{last_initial}",
    ]
    # Higher probability for more common patterns
    pattern_weights = [0.3, 0.2, 0.15, 0.1, 0.15, 0.1]

    # A middle name made only of punctuation (e.g. "-") normalizes to "", so
    # check the normalized value before indexing into it.
    middle = _normalize(middle_name) if middle_name else ""
    if middle:
        middle_initial = middle[0]
        username_patterns.extend(
            [
                f"{first}{middle}",
                f"{first}{middle_initial}{last}",
                f"{first}{middle}{last}",
            ]
        )
        pattern_weights = [0.25, 0.17, 0.12, 0.08, 0.12, 0.08, 0.06, 0.06, 0.06]

    return random.choices(username_patterns, weights=pattern_weights, k=1)[0]
144
-
145
-
146
def get_email_suffix_by_birth_date(birth_date: date) -> str:
    """
    Pick an email-address suffix heuristically derived from the birth date.

    Candidate suffixes and their weights:
    - empty string (0.4)
    - random number between 1 and 99 (0.3)
    - last two digits of the birth year, zero-padded (0.1)
    - full birth year (0.1)
    - day of month of the birth date (0.1)
    """
    year = birth_date.year
    # The random number is drawn up front, whether or not it ends up selected.
    candidates = (
        ("", 0.4),
        (str(random.randint(1, 99)), 0.3),
        (f"{year % 100:02d}", 0.1),
        (str(year), 0.1),
        (str(birth_date.day), 0.1),
    )
    suffixes, weights = zip(*candidates)
    return random.choices(suffixes, weights=weights, k=1)[0]
@@ -1,10 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from data_designer.errors import DataDesignerError
7
-
8
-
9
class MissingPersonFieldsError(DataDesignerError):
    """Raised when a person record lacks one or more required fields."""
@@ -1,102 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import random
7
- from datetime import date
8
-
9
# Birth dates before this date get a state-derived area number in generate_ssn.
SSN_RANDOMIZATION_DATE = date(2011, 6, 25)

# Area number mapping by state code (pre-2011); values are inclusive [low, high] ranges.
STATE_TO_AREA_SSN = {
    "NH": [1, 3],
    "ME": [4, 7],
    "VT": [8, 9],
    "MA": [10, 34],
    "RI": [35, 39],
    "CT": [40, 49],
    "NY": [50, 134],
    "NJ": [135, 158],
    "PA": [159, 211],
    "MD": [212, 220],
    "DE": [221, 222],
    "VA": [223, 231],
    "WV": [232, 236],
    "NC": [237, 246],
    "SC": [247, 251],
    "GA": [252, 260],
    "FL": [261, 267],
    "OH": [268, 302],
    "IN": [303, 317],
    "IL": [318, 361],
    "MI": [362, 386],
    "WI": [387, 399],
    "KY": [400, 407],
    "TN": [408, 415],
    "AL": [416, 424],
    "MS": [425, 428],
    "AR": [429, 432],
    "LA": [433, 439],
    "OK": [440, 448],
    "TX": [449, 467],
    "MN": [468, 477],
    "IA": [478, 485],
    "MO": [486, 500],
    "ND": [501, 502],
    "SD": [503, 504],
    "NE": [505, 508],
    "KS": [509, 515],
    "MT": [516, 517],
    "ID": [518, 519],
    "WY": [520, 520],
    "CO": [521, 524],
    "NM": [525, 527],
    "AZ": [526, 527],
    "UT": [528, 529],
    "NV": [530, 530],
    "WA": [531, 539],
    "OR": [540, 544],
    "CA": [545, 573],
    "AK": [574, 574],
    "HI": [575, 576],
    "DC": [577, 579],
    "VI": [580, 580],
    "PR": [580, 599],
    "GU": [586, 586],
    "AS": [586, 586],
}


def generate_ssn(state: str, birth_date: date) -> str:
    """
    Generate a synthetic SSN based on state and birth date.

    For birth dates before SSN_RANDOMIZATION_DATE, the first three digits
    (area number) are drawn from the range of the given state with a 70%
    chance, and from the range of a randomly chosen state otherwise (the
    person may have been born elsewhere). States missing from
    STATE_TO_AREA_SSN, and all later birth dates, use the full 1-899 range.
    Area number 666 is never produced.

    Args:
        state (str): Two-letter state code (e.g., "NY", "CA")
        birth_date (date): Date of birth

    Returns:
        str: A formatted synthetic SSN in the format "XXX-XX-XXXX"

    """
    if birth_date < SSN_RANDOMIZATION_DATE:
        if random.random() < 0.3:
            # Maybe born in a different state
            area_range = random.choice(list(STATE_TO_AREA_SSN.values()))
        else:
            # Without this else the random pick above was always overwritten.
            area_range = STATE_TO_AREA_SSN.get(state, [1, 899])
    else:
        area_range = [1, 899]

    low, high = area_range
    area = 666
    while area == 666:
        # Unallowed area code
        area = random.randint(low, high)
    # Group number
    group = random.randint(1, 99)
    # Serial number
    serial = random.randint(1, 9999)
    return f"{area:03d}-{group:02d}-{serial:04d}"
@@ -1,144 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import random
7
- from datetime import date, timedelta
8
- from typing import Any, Literal, TypeAlias
9
-
10
- from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS
11
- from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator
12
- from data_designer.engine.resources.managed_dataset_repository import load_managed_dataset_repository
13
- from data_designer.engine.resources.managed_storage import ManagedBlobStorage
14
- from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import (
15
- PERSONA_FIELDS,
16
- PII_FIELDS,
17
- REQUIRED_FIELDS,
18
- )
19
- from data_designer.engine.sampling_gen.entities.email_address_utils import get_email_address
20
- from data_designer.engine.sampling_gen.entities.errors import MissingPersonFieldsError
21
- from data_designer.engine.sampling_gen.entities.national_id_utils import generate_ssn
22
- from data_designer.engine.sampling_gen.entities.phone_number import PhoneNumber
23
- from data_designer.engine.sampling_gen.errors import DatasetNotAvailableForLocaleError
24
-
25
- SexT: TypeAlias = Literal["Male", "Female"]
26
-
27
-
28
def _years_before(day: date, years: int) -> date:
    """Return `day` shifted back by `years` years, mapping Feb 29 to Feb 28 when needed."""
    try:
        return day.replace(year=day.year - years)
    except ValueError:
        # date.replace rejects Feb 29 in a non-leap target year; any other
        # ValueError (e.g. year out of range) is a genuine error.
        if (day.month, day.day) != (2, 29):
            raise
        return day.replace(year=day.year - years, day=28)


def convert_age_to_birth_date(age: int) -> date:
    """
    Sample a random birth date for a person who is `age` years old today.

    The result is drawn uniformly from the dates after (today - (age + 1) years)
    up to and including (today - age years).

    Args:
        age: Age in completed years.

    Returns:
        A birth date consistent with the given age as of today.
    """
    today = date.today()
    start_date = _years_before(today, age + 1)
    end_date = _years_before(today, age)
    days_between = (end_date - start_date).days
    # Offset 0 would be the date exactly age + 1 years ago, i.e. someone who
    # turns age + 1 today, so the draw starts at 1.
    random_days = random.randint(1, days_between)
    return start_date + timedelta(days=random_days)
36
-
37
-
38
def generate_email_address(
    first_name: str,
    middle_name: str | None,
    last_name: str,
    age: int,
    birth_date: date,
) -> str | None:
    """
    Produce an email address for a person, or None for anyone younger than 18.

    Adults are delegated to get_email_address with the same attributes.
    """
    is_minor = age < 18
    return (
        None
        if is_minor
        else get_email_address(
            first_name=first_name,
            middle_name=middle_name,
            last_name=last_name,
            age=age,
            birth_date=birth_date,
        )
    )
58
-
59
-
60
def get_national_id(locale: str | None, region: str | None, birth_date: date) -> str | None:
    """
    Return a synthetic SSN for en_US records that have a region, otherwise None.

    The region is passed to generate_ssn as the state.
    """
    if locale == "en_US" and region is not None:
        return generate_ssn(state=region, birth_date=birth_date)
    return None
66
-
67
-
68
def generate_phone_number(locale: str, age: int, postcode: str | None, style: str = "dash") -> str | None:
    """
    Generate a phone number whose area code is correlated with the postcode.

    Returns None unless the locale is en_US, the person is at least 18, and a
    postcode is available. Otherwise one random draw picks how closely the
    area code follows the postcode:
    - 60%: the full postcode
    - 20%: the first four characters of the postcode
    - 10%: the first three characters of the postcode
    - 10%: a random (population-weighted) area code
    """
    if locale != "en_US" or age < 18 or postcode is None:
        return None

    draw = random.random()
    if draw >= 0.9:
        number = PhoneNumber.generate()
    else:
        if draw < 0.6:
            zip_prefix = postcode
        elif draw < 0.8:
            zip_prefix = postcode[:4]
        else:
            zip_prefix = postcode[:3]
        number = PhoneNumber.from_zip_prefix(zip_prefix)
    return number.format(style=style)
91
-
92
-
93
def generate_and_insert_derived_fields(person_record: dict[str, Any]) -> dict[str, str | None]:
    """
    Add derived fields to a person record and return only the known fields.

    The record is modified in place: birth_date, phone_number, email_address
    and national_id are written into it, and for en_US records "region" is
    renamed to "state" when no "state" key exists yet.

    Returns:
        A new dict restricted to PII fields, the derived fields, and persona
        fields, in that grouping order.

    Raises:
        MissingPersonFieldsError: If required fields are absent from the record.
    """
    _verify_required_fields(person_record)

    age = person_record.get("age")
    locale = person_record.get("locale")
    birth_date = convert_age_to_birth_date(age)

    # Build every derived value before touching the record, so a failure in
    # one generator leaves the record unchanged.
    # Note: All data must be serializable to JSON.
    derived = {
        "birth_date": birth_date.isoformat(),
        "phone_number": generate_phone_number(
            locale=locale,
            age=age,
            postcode=person_record.get("postcode"),
        ),
        "email_address": generate_email_address(
            first_name=person_record.get("first_name"),
            middle_name=person_record.get("middle_name"),
            last_name=person_record.get("last_name"),
            age=age,
            birth_date=birth_date,
        ),
        "national_id": get_national_id(
            locale=locale,
            region=person_record.get("region"),
            birth_date=birth_date,
        ),
    }
    person_record.update(derived)

    rename_region = locale == "en_US" and "region" in person_record and "state" not in person_record
    if rename_region:
        person_record["state"] = person_record.pop("region")

    selected: dict[str, Any] = {}
    derived_keys = ["state", "phone_number", "email_address", "national_id"]
    for allowed in (PII_FIELDS, derived_keys, PERSONA_FIELDS):
        selected.update((key, value) for key, value in person_record.items() if key in allowed)
    return selected
128
-
129
-
130
def load_person_data_sampler(blob_storage: ManagedBlobStorage, locale: str) -> ManagedDatasetGenerator:
    """
    Build a ManagedDatasetGenerator for the managed dataset of the given locale.

    Raises:
        DatasetNotAvailableForLocaleError: If the locale is not listed in
            LOCALES_WITH_MANAGED_DATASETS.
    """
    if locale in LOCALES_WITH_MANAGED_DATASETS:
        repository = load_managed_dataset_repository(blob_storage, [locale])
        return ManagedDatasetGenerator(managed_datasets=repository, dataset_name=locale)

    raise DatasetNotAvailableForLocaleError(f"Locale {locale} is not supported by the managed dataset generator.")
138
-
139
-
140
def _verify_required_fields(person_record: dict[str, Any]) -> None:
    """Raise MissingPersonFieldsError unless every key in REQUIRED_FIELDS is present."""
    missing_fields = REQUIRED_FIELDS - set(person_record)
    if not missing_fields:
        return
    raise MissingPersonFieldsError(f"Person data is missing the following required fields: {missing_fields}")
@@ -1,128 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- import random
7
- from pathlib import Path
8
- from typing import TYPE_CHECKING
9
-
10
- from pydantic import BaseModel, Field, field_validator
11
-
12
- from data_designer.lazy_heavy_imports import pd
13
-
14
- if TYPE_CHECKING:
15
- import pandas as pd
16
-
17
# Lookup tables loaded once at import time from the parquet asset stored next to this module.
ZIP_AREA_CODE_DATA = pd.read_parquet(Path(__file__).parent.joinpath("assets", "zip_area_code_map.parquet"))
# zipcode -> area_code
ZIPCODE_AREA_CODE_MAP = {
    zipcode: area_code
    for zipcode, area_code in zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["area_code"])
}
# zipcode -> count (used as the sampling weight in get_area_code)
ZIPCODE_POPULATION_MAP = {
    zipcode: count
    for zipcode, count in zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["count"])
}
20
-
21
-
22
def get_area_code(zip_prefix: str | None = None) -> str:
    """
    Sample an area code for the given ZIP code prefix, population-weighted.

    Args:
        zip_prefix: The prefix of a ZIP code, 5 digits or fewer. If None, sample from all ZIP codes.

    Returns:
        A sampled area code matching the prefix, population-weighted.

    Raises:
        ValueError: If a 5-character ZIP code is not in the table, or if no
            ZIP code starts with the given prefix.
    """
    if zip_prefix is None:
        zipcodes, weights = zip(*ZIPCODE_POPULATION_MAP.items())
        zipcode = random.choices(zipcodes, weights=weights, k=1)[0]
        return str(ZIPCODE_AREA_CODE_MAP[zipcode])

    if len(zip_prefix) == 5:
        # A full ZIP code maps to exactly one area code; no sampling needed.
        try:
            return str(ZIPCODE_AREA_CODE_MAP[zip_prefix])
        except KeyError:
            raise ValueError(f"ZIP code {zip_prefix} not found.") from None

    matching_zipcodes = [(z, c) for z, c in ZIPCODE_POPULATION_MAP.items() if z.startswith(zip_prefix)]
    # Check before unpacking: zip(*[]) yields nothing, so unpacking it into two
    # names would raise an unrelated "not enough values to unpack" error.
    if not matching_zipcodes:
        raise ValueError(f"No ZIP codes found with prefix {zip_prefix}.")
    zipcodes, weights = zip(*matching_zipcodes)
    zipcode = random.choices(zipcodes, weights=weights, k=1)[0]
    return str(ZIPCODE_AREA_CODE_MAP[zipcode])
47
-
48
-
49
class PhoneNumber(BaseModel):
    """
    A phone number object that supports various formatting styles.

    All four parts are stored as digit-only strings.
    """

    country_code: str = Field(default="1")
    area_code: str
    prefix: str  # First part of the local number
    line_number: str  # Second part of the local number

    @field_validator("country_code", "area_code", "prefix", "line_number")
    @classmethod
    def validate_digits(cls, v: str) -> str:
        """Reject any part that contains non-digit characters."""
        if not v.isdigit():
            raise ValueError("Must contain only digits")
        return v

    @field_validator("country_code")
    @classmethod
    def validate_country_code_length(cls, v: str) -> str:
        """Reject country codes longer than three digits."""
        max_length = 3
        if len(v) > max_length:
            raise ValueError(f"Country code {v} is longer than {max_length} digits")
        return v

    def format(self, style: str = "dash") -> str:
        """
        Format the phone number according to the specified style.

        Args:
            style: One of "dash", "parentheses", "dot", "space", "no_separation",
                "international_plus", "international"

        Returns:
            Formatted phone number string

        Raises:
            ValueError: If the style is not one of the supported names.
        """
        if style == "dash":
            formatted = f"{self.area_code}-{self.prefix}-{self.line_number}"
        elif style == "parentheses":
            formatted = f"({self.area_code}) {self.prefix}-{self.line_number}"
        elif style == "dot":
            formatted = f"{self.area_code}.{self.prefix}.{self.line_number}"
        elif style == "space":
            formatted = f"{self.area_code} {self.prefix} {self.line_number}"
        elif style == "no_separation":
            formatted = f"{self.area_code}{self.prefix}{self.line_number}"
        elif style == "international_plus":
            cc = self.country_code or "1"  # Default to US/Canada
            formatted = f"+{cc} {self.area_code} {self.prefix} {self.line_number}"
        elif style == "international":
            cc = int(self.country_code or 1)  # Default to US/Canada
            # Country code is zero-padded to three digits in this style.
            formatted = f"{cc:03d} {self.area_code} {self.prefix} {self.line_number}"
        else:
            raise ValueError(f"Unsupported format style: {style}")

        return formatted

    @classmethod
    def from_area_code(cls, area_code: str) -> "PhoneNumber":
        """Create a phone number with the given area code and a random local number."""
        # random.randint includes both endpoints, so the upper bounds must be
        # 999 and 9999 to keep the prefix at 3 digits and the line number at 4.
        prefix = str(random.randint(200, 999))
        line_number = str(random.randint(0, 9999)).zfill(4)
        return cls(area_code=area_code, prefix=prefix, line_number=line_number)

    @classmethod
    def from_zip_prefix(cls, zip_prefix: str) -> "PhoneNumber":
        """Create a phone number from the given ZIP code prefix."""
        area_code = get_area_code(zip_prefix)
        return cls.from_area_code(area_code)

    @classmethod
    def generate(cls) -> "PhoneNumber":
        """Create a random valid US phone number."""
        area_code = get_area_code()
        return cls.from_area_code(area_code)

    def __str__(self) -> str:
        return self.format("dash")

    def __repr__(self) -> str:
        return f"PhoneNumber({str(self)})"
@@ -1,26 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- from __future__ import annotations
5
-
6
- from data_designer.engine.errors import DataDesignerError
7
-
8
-
9
class SamplingGenError(DataDesignerError):
    """Root of the sampling_gen exception hierarchy."""
11
-
12
-
13
class RejectionSamplingError(SamplingGenError):
    """Raised for failures related to rejection sampling."""
15
-
16
-
17
class DataConversionError(SamplingGenError):
    """Raised for failures related to data conversion."""
19
-
20
-
21
class DatasetNotAvailableForLocaleError(SamplingGenError):
    """Raised when no dataset is available for the requested locale."""
23
-
24
-
25
class ManagedDatasetGeneratorError(SamplingGenError):
    """Raised for failures related to the managed dataset generator."""