data-designer 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.2'
32
- __version_tuple__ = version_tuple = (0, 1, 2)
31
+ __version__ = version = '0.1.3'
32
+ __version_tuple__ = version_tuple = (0, 1, 3)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -430,9 +430,6 @@ class PersonSamplerParams(ConfigBase):
430
430
  age_range: Two-element list [min_age, max_age] specifying the age range to sample from
431
431
  (inclusive). Defaults to a standard age range. Both values must be between minimum and
432
432
  maximum allowed ages.
433
- state: Only supported for "en_US" locale. Filters to sample people from specified US state(s).
434
- Must be provided as two-letter state abbreviations (e.g., "CA", "NY", "TX"). Can be a
435
- single state or a list of states.
436
433
  with_synthetic_personas: If True, appends additional synthetic persona columns including
437
434
  personality traits, interests, and background descriptions. Only supported for certain
438
435
  locales with managed datasets.
@@ -1,6 +1,8 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from datetime import datetime
5
+ from functools import cached_property
4
6
  import json
5
7
  import logging
6
8
  from pathlib import Path
@@ -36,9 +38,21 @@ class ArtifactStorage(BaseModel):
36
38
  def artifact_path_exists(self) -> bool:
37
39
  return self.artifact_path.exists()
38
40
 
41
+ @cached_property
42
+ def resolved_dataset_name(self) -> str:
43
+ dataset_path = self.artifact_path / self.dataset_name
44
+ if dataset_path.exists() and len(list(dataset_path.iterdir())) > 0:
45
+ new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}"
46
+ logger.info(
47
+ f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session"
48
+ f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead."
49
+ )
50
+ return new_dataset_name
51
+ return self.dataset_name
52
+
39
53
  @property
40
54
  def base_dataset_path(self) -> Path:
41
- return self.artifact_path / self.dataset_name
55
+ return self.artifact_path / self.resolved_dataset_name
42
56
 
43
57
  @property
44
58
  def dropped_columns_dataset_path(self) -> Path:
@@ -88,8 +88,8 @@ class ColumnWiseDatasetBuilder:
88
88
  start_time = time.perf_counter()
89
89
 
90
90
  self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
91
- for batch_idx in range(1, self.batch_manager.num_batches + 1):
92
- logger.info(f"⏳ Processing batch {batch_idx} of {self.batch_manager.num_batches}")
91
+ for batch_idx in range(self.batch_manager.num_batches):
92
+ logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
93
93
  self._run_batch(generators)
94
94
  df_batch = self._run_processors(
95
95
  stage=BuildStage.POST_BATCH,
@@ -14,6 +14,7 @@ REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"}
14
14
 
15
15
 
16
16
  PII_FIELDS = [
17
+ # Core demographic fields
17
18
  "uuid",
18
19
  "first_name",
19
20
  "middle_name",
@@ -22,25 +23,38 @@ PII_FIELDS = [
22
23
  "age",
23
24
  "birth_date",
24
25
  "marital_status",
25
- "street_name",
26
- "street_number",
27
- "unit",
28
26
  "postcode",
29
- "region",
30
27
  "city",
31
- "district",
28
+ "region",
32
29
  "country",
33
- "area",
34
- "zone",
30
+ "locale",
35
31
  "bachelors_field",
36
- "education_degree",
37
32
  "education_level",
38
33
  "occupation",
39
- "locale",
34
+ "national_id",
35
+ # US-specific fields
36
+ "street_name",
37
+ "street_number",
38
+ "unit",
39
+ "state",
40
+ "email_address",
41
+ "phone_number",
42
+ # Japan-specific fields
43
+ "area",
44
+ "prefecture",
45
+ "zone",
46
+ # India-specific fields
47
+ "district",
48
+ "religion",
49
+ "education_degree",
50
+ "first_language",
51
+ "second_language",
52
+ "third_language",
40
53
  ]
41
54
 
42
55
 
43
56
  PERSONA_FIELDS = [
57
+ # Core persona fields
44
58
  "persona",
45
59
  "career_goals_and_ambitions",
46
60
  "arts_persona",
@@ -61,4 +75,12 @@ PERSONA_FIELDS = [
61
75
  "extraversion",
62
76
  "agreeableness",
63
77
  "neuroticism",
78
+ # Japan-specific persona fields
79
+ "aspects",
80
+ "digital_skills",
81
+ # India-specific persona fields
82
+ "linguistic_persona",
83
+ "religious_persona",
84
+ "linguistic_background",
85
+ "religious_background",
64
86
  ]
@@ -173,7 +173,11 @@ class DataDesigner(DataDesignerInterface[DatasetCreationResults]):
173
173
  configuration (columns, constraints, seed data, etc.).
174
174
  num_records: Number of records to generate.
175
175
  dataset_name: Name of the dataset. This name will be used as the dataset
176
- folder name in the artifact path directory.
176
+ folder name in the artifact path directory. If a non-empty directory with the
177
+ same name already exists, the dataset will be saved to a new directory with
178
+ a datetime stamp. For example, if the dataset name is "awesome_dataset" and a directory
179
+ with the same name already exists, the dataset will be saved to a new directory
180
+ with the name "awesome_dataset_01-01-2025_120000".
177
181
 
178
182
  Returns:
179
183
  DatasetCreationResults object with methods for loading the generated dataset,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -1,5 +1,5 @@
1
1
  data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
2
- data_designer/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
2
+ data_designer/_version.py,sha256=q5nF98G8SoVeJqaknL0xdyxtv0egsqb0fK06_84Izu8,704
3
3
  data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
4
4
  data_designer/logging.py,sha256=O6LlQRj4IdkvEEYiMkKfMb_ZDgN1YpkGQUCqcp7nY6w,5354
5
5
  data_designer/plugin_manager.py,sha256=jWoo80x0oCiOIJMA43t-vK-_hVv9_xt4WhBcurYoDqw,3098
@@ -44,7 +44,7 @@ data_designer/config/models.py,sha256=5Cy55BnKYyr-I1UHLUTqZxe6Ca9uVQWpUiwt9X0Zlr
44
44
  data_designer/config/preview_results.py,sha256=H6ETFI6L1TW8MEC9KYsJ1tXGIC5cloCggBCCZd6jiEE,1087
45
45
  data_designer/config/processors.py,sha256=qOF_plBoh6UEFNwUpyDgkqIuSDUaSM2S7k-kSAEB5p8,1328
46
46
  data_designer/config/sampler_constraints.py,sha256=4JxP-nge5KstqtctJnVg5RLM1w9mA7qFi_BjgTJl9CE,1167
47
- data_designer/config/sampler_params.py,sha256=NCm2uWEzFHjz8ZzSmiKcVp5jI5okp53tq9l-bWBm4FQ,26821
47
+ data_designer/config/sampler_params.py,sha256=W2GGRwzWZ4RlJAjDpyqSoF6bjpYjT7WHIhS3D0GfupE,26574
48
48
  data_designer/config/seed.py,sha256=g-iUToYSIFuTv3sbwSG_dF-9RwC8r8AvCD-vS8c_jDg,5487
49
49
  data_designer/config/validator_params.py,sha256=sNxFIF2bk_N4jJD-aMH1N5MQynDip08AoMI1ajxtRdc,3909
50
50
  data_designer/config/analysis/column_profilers.py,sha256=Qss9gr7oHNcjijW_MMIX9JkFX-V9v5vPwYWCnxLjMDY,2749
@@ -87,8 +87,8 @@ data_designer/engine/column_generators/generators/validation.py,sha256=MbDFXzief
87
87
  data_designer/engine/column_generators/utils/errors.py,sha256=ugNwaqnPdrPZI7YnKLbYwFjYUSm0WAzgaVu_u6i5Rc8,365
88
88
  data_designer/engine/column_generators/utils/judge_score_factory.py,sha256=JRoaZgRGK24dH0zx7MNGSccK196tQK_l0sbwNkurg7c,2132
89
89
  data_designer/engine/column_generators/utils/prompt_renderer.py,sha256=d4tbyPsgmFDikW3nxL5is9RNaajMkoPDCrfkQkxw7rc,4760
90
- data_designer/engine/dataset_builders/artifact_storage.py,sha256=NlO8H4g4ZaI5iDwI-xnhyyKGTdLX5JunqQuiQNXW-yI,7303
91
- data_designer/engine/dataset_builders/column_wise_builder.py,sha256=pu7mJIc5Ld4TLeTDsh9sCzKHgCbe7cC5PDF4RmxXw8o,13077
90
+ data_designer/engine/dataset_builders/artifact_storage.py,sha256=0hpjJ4s3kQ3h-cEpgtIcDpx3UIEMH1FNX5Sp_8yRU9s,7995
91
+ data_designer/engine/dataset_builders/column_wise_builder.py,sha256=bXaFhFD0GsY-9b_GLXY345N0BH5z2YjiWrs_yFDqYgA,13074
92
92
  data_designer/engine/dataset_builders/errors.py,sha256=1kChleChG4rASWIiL4Bel6Ox6aFZjQUrh5ogPt1CDWo,359
93
93
  data_designer/engine/dataset_builders/multi_column_configs.py,sha256=t28fhI-WRIBohFnAJ80l5EAETEDB5rJ5RSWInMiRfyE,1619
94
94
  data_designer/engine/dataset_builders/utils/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
@@ -148,7 +148,7 @@ data_designer/engine/sampling_gen/data_sources/base.py,sha256=BRU9pzDvgB5B1Mgtj8
148
148
  data_designer/engine/sampling_gen/data_sources/errors.py,sha256=5pq42e5yvUqaH-g09jWvJolYCO2I2Rdrqo1O0gwet8Y,326
149
149
  data_designer/engine/sampling_gen/data_sources/sources.py,sha256=63YaRau37NIc2TDn8JvTOsd0zfnY4_aaF9UOU5ryKSo,13387
150
150
  data_designer/engine/sampling_gen/entities/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
151
- data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=-_ebkhKeRYtlGpY8ZKGuc40aJfeWQahW2L-BBRxRnO0,1316
151
+ data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py,sha256=W_QSYNO2ynsXGJ71y_M9uRpYjjcbcAFhp1MpDFdl9YM,1844
152
152
  data_designer/engine/sampling_gen/entities/email_address_utils.py,sha256=-V4zuuFq1t3nzzO_FqzCWApPcWNKAh-ZQYFMmCiu5RE,5231
153
153
  data_designer/engine/sampling_gen/entities/errors.py,sha256=QEq-6Ld9OlModEYbse0pvY21OC5CyO-OalrL03-iLME,311
154
154
  data_designer/engine/sampling_gen/entities/national_id_utils.py,sha256=vxxHnrfQP98W8dWGysCjvfIT-h1xEGdfxn5xF_-UeXw,2611
@@ -163,15 +163,15 @@ data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkB
163
163
  data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
164
164
  data_designer/essentials/__init__.py,sha256=zrDZ7hahOmOhCPdfoj0z9ALN10lXIesfwd2qXRqTcdY,4125
165
165
  data_designer/interface/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
166
- data_designer/interface/data_designer.py,sha256=EzOT_kkWXm9-1Zgbj4RvBfV6_r5ABR7mOuNwbgvKKLQ,16273
166
+ data_designer/interface/data_designer.py,sha256=USPTruC5axBJNEWEnYBJ4ol2d3mXGubHELBmWeahFe8,16664
167
167
  data_designer/interface/errors.py,sha256=jagKT3tPUnYq4e3e6AkTnBkcayHyEfxjPMBzx-GEKe4,565
168
168
  data_designer/interface/results.py,sha256=qFxa8SuCXeADiRpaCMBwJcExkJBCfUPeGCdcJSTjoTc,2111
169
169
  data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0PzzE4,234
170
170
  data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
171
171
  data_designer/plugins/plugin.py,sha256=7ErdUyrTdOb5PCBE3msdhTOrvQpldjOQw90-Bu4Bosc,2522
172
172
  data_designer/plugins/registry.py,sha256=iPDTh4duV1cKt7H1fXkj1bKLG6SyUKmzQ9xh-vjEoaM,3018
173
- data_designer-0.1.2.dist-info/METADATA,sha256=PjPyL9UQ0Ys4XPqRuruAjuUJ6XPMDf1n1bz17wwoct4,6644
174
- data_designer-0.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
175
- data_designer-0.1.2.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
176
- data_designer-0.1.2.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
177
- data_designer-0.1.2.dist-info/RECORD,,
173
+ data_designer-0.1.3.dist-info/METADATA,sha256=fCI36BVPIOC7FVxQviBmzWMX8HRnc69afkJ82xPYXbY,6644
174
+ data_designer-0.1.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
175
+ data_designer-0.1.3.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
176
+ data_designer-0.1.3.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
177
+ data_designer-0.1.3.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any