data-designer 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_designer/_version.py +2 -2
- data_designer/config/sampler_params.py +0 -3
- data_designer/engine/dataset_builders/artifact_storage.py +15 -1
- data_designer/engine/dataset_builders/column_wise_builder.py +2 -2
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +31 -9
- data_designer/interface/data_designer.py +5 -1
- {data_designer-0.1.2.dist-info → data_designer-0.1.3.dist-info}/METADATA +1 -1
- {data_designer-0.1.2.dist-info → data_designer-0.1.3.dist-info}/RECORD +11 -11
- {data_designer-0.1.2.dist-info → data_designer-0.1.3.dist-info}/WHEEL +1 -1
- {data_designer-0.1.2.dist-info → data_designer-0.1.3.dist-info}/entry_points.txt +0 -0
- {data_designer-0.1.2.dist-info → data_designer-0.1.3.dist-info}/licenses/LICENSE +0 -0
data_designer/_version.py
CHANGED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.1.2'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 1, 2)
|
|
31
|
+
__version__ = version = '0.1.3'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 1, 3)
|
|
33
33
|
|
|
34
34
|
__commit_id__ = commit_id = None
|
|
@@ -430,9 +430,6 @@ class PersonSamplerParams(ConfigBase):
|
|
|
430
430
|
age_range: Two-element list [min_age, max_age] specifying the age range to sample from
|
|
431
431
|
(inclusive). Defaults to a standard age range. Both values must be between minimum and
|
|
432
432
|
maximum allowed ages.
|
|
433
|
-
state: Only supported for "en_US" locale. Filters to sample people from specified US state(s).
|
|
434
|
-
Must be provided as two-letter state abbreviations (e.g., "CA", "NY", "TX"). Can be a
|
|
435
|
-
single state or a list of states.
|
|
436
433
|
with_synthetic_personas: If True, appends additional synthetic persona columns including
|
|
437
434
|
personality traits, interests, and background descriptions. Only supported for certain
|
|
438
435
|
locales with managed datasets.
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
2
|
# SPDX-License-Identifier: Apache-2.0
|
|
3
3
|
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from functools import cached_property
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
from pathlib import Path
|
|
@@ -36,9 +38,21 @@ class ArtifactStorage(BaseModel):
|
|
|
36
38
|
def artifact_path_exists(self) -> bool:
|
|
37
39
|
return self.artifact_path.exists()
|
|
38
40
|
|
|
41
|
+
@cached_property
|
|
42
|
+
def resolved_dataset_name(self) -> str:
|
|
43
|
+
dataset_path = self.artifact_path / self.dataset_name
|
|
44
|
+
if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0:
|
|
45
|
+
new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}"
|
|
46
|
+
logger.info(
|
|
47
|
+
f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session"
|
|
48
|
+
f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead."
|
|
49
|
+
)
|
|
50
|
+
return new_dataset_name
|
|
51
|
+
return self.dataset_name
|
|
52
|
+
|
|
39
53
|
@property
|
|
40
54
|
def base_dataset_path(self) -> Path:
|
|
41
|
-
return self.artifact_path / self.dataset_name
|
|
55
|
+
return self.artifact_path / self.resolved_dataset_name
|
|
42
56
|
|
|
43
57
|
@property
|
|
44
58
|
def dropped_columns_dataset_path(self) -> Path:
|
|
@@ -88,8 +88,8 @@ class ColumnWiseDatasetBuilder:
|
|
|
88
88
|
start_time = time.perf_counter()
|
|
89
89
|
|
|
90
90
|
self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
|
|
91
|
-
for batch_idx in range(
|
|
92
|
-
logger.info(f"⏳ Processing batch {batch_idx} of {self.batch_manager.num_batches}")
|
|
91
|
+
for batch_idx in range(self.batch_manager.num_batches):
|
|
92
|
+
logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
|
|
93
93
|
self._run_batch(generators)
|
|
94
94
|
df_batch = self._run_processors(
|
|
95
95
|
stage=BuildStage.POST_BATCH,
|
|
@@ -14,6 +14,7 @@ REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"}
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
PII_FIELDS = [
|
|
17
|
+
# Core demographic fields
|
|
17
18
|
"uuid",
|
|
18
19
|
"first_name",
|
|
19
20
|
"middle_name",
|
|
@@ -22,25 +23,38 @@ PII_FIELDS = [
|
|
|
22
23
|
"age",
|
|
23
24
|
"birth_date",
|
|
24
25
|
"marital_status",
|
|
25
|
-
"street_name",
|
|
26
|
-
"street_number",
|
|
27
|
-
"unit",
|
|
28
26
|
"postcode",
|
|
29
|
-
"region",
|
|
30
27
|
"city",
|
|
31
|
-
"
|
|
28
|
+
"region",
|
|
32
29
|
"country",
|
|
33
|
-
"
|
|
34
|
-
"zone",
|
|
30
|
+
"locale",
|
|
35
31
|
"bachelors_field",
|
|
36
|
-
"education_degree",
|
|
37
32
|
"education_level",
|
|
38
33
|
"occupation",
|
|
39
|
-
"
|
|
34
|
+
"national_id",
|
|
35
|
+
# US-specific fields
|
|
36
|
+
"street_name",
|
|
37
|
+
"street_number",
|
|
38
|
+
"unit",
|
|
39
|
+
"state",
|
|
40
|
+
"email_address",
|
|
41
|
+
"phone_number",
|
|
42
|
+
# Japan-specific fields
|
|
43
|
+
"area",
|
|
44
|
+
"prefecture",
|
|
45
|
+
"zone",
|
|
46
|
+
# India-specific fields
|
|
47
|
+
"district",
|
|
48
|
+
"religion",
|
|
49
|
+
"education_degree",
|
|
50
|
+
"first_language",
|
|
51
|
+
"second_language",
|
|
52
|
+
"third_language",
|
|
40
53
|
]
|
|
41
54
|
|
|
42
55
|
|
|
43
56
|
PERSONA_FIELDS = [
|
|
57
|
+
# Core persona fields
|
|
44
58
|
"persona",
|
|
45
59
|
"career_goals_and_ambitions",
|
|
46
60
|
"arts_persona",
|
|
@@ -61,4 +75,12 @@ PERSONA_FIELDS = [
|
|
|
61
75
|
"extraversion",
|
|
62
76
|
"agreeableness",
|
|
63
77
|
"neuroticism",
|
|
78
|
+
# Japan-specific persona fields
|
|
79
|
+
"aspects",
|
|
80
|
+
"digital_skills",
|
|
81
|
+
# India-specific persona fields
|
|
82
|
+
"linguistic_persona",
|
|
83
|
+
"religious_persona",
|
|
84
|
+
"linguistic_background",
|
|
85
|
+
"religious_background",
|
|
64
86
|
]
|
|
@@ -173,7 +173,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
|
|
|
173
173
|
configuration (columns, constraints, seed data, etc.).
|
|
174
174
|
num_records: Number of records to generate.
|
|
175
175
|
dataset_name: Name of the dataset. This name will be used as the dataset
|
|
176
|
-
folder name in the artifact path directory.
|
|
176
|
+
folder name in the artifact path directory. If a non-empty directory with the
|
|
177
|
+
same name already exists, dataset will be saved to a new directory with
|
|
178
|
+
a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory
|
|
179
|
+
with the same name already exists, the dataset will be saved to a new directory
|
|
180
|
+
with the name "awesome_dataset_01-01-2025_120000".
|
|
177
181
|
|
|
178
182
|
Returns:
|
|
179
183
|
DatasetCreationResults object with methods for loading the generated dataset,
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
|
|
2
|
-
data_designer/_version.py,sha256=
|
|
2
|
+
data_designer/_version.py,sha256=q5nF98G8SoVeJqaknL0xdyxtv0egsqb0fK06_84Izu8,704
|
|
3
3
|
data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
|
|
4
4
|
data_designer/logging.py,sha256=O6LlQRj4IdkvEEYiMkKfMb_ZDgN1YpkGQUCqcp7nY6w,5354
|
|
5
5
|
data_designer/plugin_manager.py,sha256=jWoo80x0oCiOIJMA43t-vK-_hVv9_xt4WhBcurYoDqw,3098
|
|
@@ -44,7 +44,7 @@ data_designer/config/models.py,sha256=5Cy55BnKYyr-I1UHLUTqZxe6Ca9uVQWpUiwt9X0Zlr
|
|
|
44
44
|
data_designer/config/preview_results.py,sha256=H6ETFI6L1TW8MEC9KYsJ1tXGIC5cloCggBCCZd6jiEE,1087
|
|
45
45
|
data_designer/config/processors.py,sha256=qOF_plBoh6UEFNwUpyDgkqIuSDUaSM2S7k-kSAEB5p8,1328
|
|
46
46
|
data_designer/config/sampler_constraints.py,sha256=4JxP-nge5KstqtctJnVg5RLM1w9mA7qFi_BjgTJl9CE,1167
|
|
47
|
-
data_designer/config/sampler_params.py,sha256=
|
|
47
|
+
data_designer/config/sampler_params.py,sha256=W2GGRwzWZ4RlJAjDpyqSoF6bjpYjT7WHIhS3D0GfupE,26574
|
|
48
48
|
data_designer/config/seed.py,sha256=g-iUToYSIFuTv3sbwSG_dF-9RwC8r8AvCD-vS8c_jDg,5487
|
|
49
49
|
data_designer/config/validator_params.py,sha256=sNxFIF2bk_N4jJD-aMH1N5MQynDip08AoMI1ajxtRdc,3909
|
|
50
50
|
data_designer/config/analysis/column_profilers.py,sha256=Qss9gr7oHNcjijW_MMIX9JkFX-V9v5vPwYWCnxLjMDY,2749
|
|
@@ -87,8 +87,8 @@ data_designer/engine/column_generators/generators/validation.py,sha256=MbDFXzief
|
|
|
87
87
|
data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKLbYwFjYUSm0WAzgaVu_u6i5Rc8,365
|
|
88
88
|
data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=JRoaZgRGK24dH0zx7MNGSccK196tQK_l0sbwNkurg7c,2132
|
|
89
89
|
data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
|
|
90
|
-
data_designer/engine/dataset_builders/artifact_storage.py,sha256=
|
|
91
|
-
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=
|
|
90
|
+
data_designer/engine/dataset_builders/artifact_storage.py,sha256=0hpjJ4s3kQ3h-cEpgtIcDpx3UIEMH1FNX5Sp_8yRU9s,7995
|
|
91
|
+
data_designer/engine/dataset_builders/column_wise_builder.py,sha256=bXaFhFD0GsY-9b_GLXY345N0BH5z2YjiWrs_yFDqYgA,13074
|
|
92
92
|
data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
|
|
93
93
|
data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
|
|
94
94
|
data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
@@ -148,7 +148,7 @@ data_designer/engine/sampling_gen/data_sources/base.py,sha256=BRU9pzDvgB5B1Mgtj8
|
|
|
148
148
|
data_designer/engine/sampling_gen/data_sources/errors.py,sha256=5pq42e5yvUqaH-g09jWvJolYCO2I2Rdrqo1O0gwet8Y,326
|
|
149
149
|
data_designer/engine/sampling_gen/data_sources/sources.py,sha256=63YaRau37NIc2TDn8JvTOsd0zfnY4_aaF9UOU5ryKSo,13387
|
|
150
150
|
data_designer/engine/sampling_gen/entities/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
151
|
-
data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256
|
|
151
|
+
data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=W_QSYNO2ynsXGJ71y_M9uRpYjjcbcAFhp1MpDFdl9YM,1844
|
|
152
152
|
data_designer/engine/sampling_gen/entities/email_address_utils.py,sha256=-V4zuuFq1t3nzzO_FqzCWApPcWNKAh-ZQYFMmCiu5RE,5231
|
|
153
153
|
data_designer/engine/sampling_gen/entities/errors.py,sha256=QEq-6Ld9OlModEYbse0pvY21OC5CyO-OalrL03-iLME,311
|
|
154
154
|
data_designer/engine/sampling_gen/entities/national_id_utils.py,sha256=vxxHnrfQP98W8dWGysCjvfIT-h1xEGdfxn5xF_-UeXw,2611
|
|
@@ -163,15 +163,15 @@ data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkB
|
|
|
163
163
|
data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
|
|
164
164
|
data_designer/essentials/__init__.py,sha256=zrDZ7hahOmOhCPdfoj0z9ALN10lXIesfwd2qXRqTcdY,4125
|
|
165
165
|
data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
|
|
166
|
-
data_designer/interface/data_designer.py,sha256=
|
|
166
|
+
data_designer/interface/data_designer.py,sha256=USPTruC5axBJNEWEnYBJ4ol2d3mXGubHELBmWeahFe8,16664
|
|
167
167
|
data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
|
|
168
168
|
data_designer/interface/results.py,sha256=qFxa8SuCXeADiRpaCMBwJcExkJBCfUPeGCdcJSTjoTc,2111
|
|
169
169
|
data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
|
|
170
170
|
data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
|
|
171
171
|
data_designer/plugins/plugin.py,sha256=7ErdUyrTdOb5PCBE3msdhTOrvQpldjOQw90-Bu4Bosc,2522
|
|
172
172
|
data_designer/plugins/registry.py,sha256=iPDTh4duV1cKt7H1fXkj1bKLG6SyUKmzQ9xh-vjEoaM,3018
|
|
173
|
-
data_designer-0.1.
|
|
174
|
-
data_designer-0.1.
|
|
175
|
-
data_designer-0.1.
|
|
176
|
-
data_designer-0.1.
|
|
177
|
-
data_designer-0.1.
|
|
173
|
+
data_designer-0.1.3.dist-info/METADATA,sha256=fCI36BVPIOC7FVxQviBmzWMX8HRnc69afkJ82xPYXbY,6644
|
|
174
|
+
data_designer-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
175
|
+
data_designer-0.1.3.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
|
|
176
|
+
data_designer-0.1.3.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
|
|
177
|
+
data_designer-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|