data-designer 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_designer/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.2.1'
32
- __version_tuple__ = version_tuple = (0, 2, 1)
31
+ __version__ = version = '0.2.2'
32
+ __version_tuple__ = version_tuple = (0, 2, 2)
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -224,6 +224,9 @@ class DataDesignerConfigBuilder:
224
224
 
225
225
  Returns:
226
226
  The current Data Designer config builder instance.
227
+
228
+ Raises:
229
+ BuilderConfigurationError: If the column name collides with an existing seed dataset column.
227
230
  """
228
231
  if column_config is None:
229
232
  if name is None or column_type is None:
@@ -240,6 +243,13 @@ class DataDesignerConfigBuilder:
240
243
  f"{', '.join([t.__name__ for t in allowed_column_configs])}"
241
244
  )
242
245
 
246
+ existing_config = self._column_configs.get(column_config.name)
247
+ if existing_config is not None and isinstance(existing_config, SeedDatasetColumnConfig):
248
+ raise BuilderConfigurationError(
249
+ f"🛑 Column {column_config.name!r} already exists as a seed dataset column. "
250
+ "Please use a different column name or update the seed dataset."
251
+ )
252
+
243
253
  self._column_configs[column_config.name] = column_config
244
254
  return self
245
255
 
@@ -578,7 +588,18 @@ class DataDesignerConfigBuilder:
578
588
 
579
589
  Returns:
580
590
  The current Data Designer config builder instance.
591
+
592
+ Raises:
593
+ BuilderConfigurationError: If any seed dataset column name collides with an existing column.
581
594
  """
595
+ seed_column_names = fetch_seed_dataset_column_names(dataset_reference)
596
+ colliding_columns = [name for name in seed_column_names if name in self._column_configs]
597
+ if colliding_columns:
598
+ raise BuilderConfigurationError(
599
+ f"🛑 Seed dataset column(s) {colliding_columns} collide with existing column(s). "
600
+ "Please remove the conflicting columns or use a seed dataset with different column names."
601
+ )
602
+
582
603
  self._seed_config = SeedConfig(
583
604
  dataset=dataset_reference.dataset,
584
605
  sampling_strategy=sampling_strategy,
@@ -587,7 +608,7 @@ class DataDesignerConfigBuilder:
587
608
  self.set_seed_datastore_settings(
588
609
  dataset_reference.datastore_settings if hasattr(dataset_reference, "datastore_settings") else None
589
610
  )
590
- for column_name in fetch_seed_dataset_column_names(dataset_reference):
611
+ for column_name in seed_column_names:
591
612
  self._column_configs[column_name] = SeedDatasetColumnConfig(name=column_name)
592
613
  return self
593
614
 
@@ -1,23 +1,27 @@
1
1
  # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
+ from __future__ import annotations
5
+
4
6
  import functools
5
7
  import logging
6
8
  from abc import ABC, abstractmethod
7
- from typing import overload
9
+ from enum import Enum
10
+ from typing import TYPE_CHECKING, overload
8
11
 
9
12
  import pandas as pd
10
13
 
11
- from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP
12
- from data_designer.config.models import BaseInferenceParams, ModelConfig
13
- from data_designer.config.utils.type_helpers import StrEnum
14
14
  from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT
15
- from data_designer.engine.models.facade import ModelFacade
15
+
16
+ if TYPE_CHECKING:
17
+ from data_designer.config.models import BaseInferenceParams, ModelConfig
18
+ from data_designer.engine.models.facade import ModelFacade
19
+
16
20
 
17
21
  logger = logging.getLogger(__name__)
18
22
 
19
23
 
20
- class GenerationStrategy(StrEnum):
24
+ class GenerationStrategy(str, Enum):
21
25
  CELL_BY_CELL = "cell_by_cell"
22
26
  FULL_COLUMN = "full_column"
23
27
 
@@ -82,8 +86,7 @@ class WithModelGeneration:
82
86
  return self.model_config.inference_parameters
83
87
 
84
88
  def log_pre_generation(self) -> None:
85
- emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type]
86
- logger.info(f"{emoji} Preparing {self.config.column_type} column generation")
89
+ logger.info(f"Preparing {self.config.column_type} column generation")
87
90
  logger.info(f" |-- column name: {self.config.name!r}")
88
91
  logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}")
89
92
  if self.model_config.provider is None:
@@ -2,8 +2,8 @@
2
2
  # SPDX-License-Identifier: Apache-2.0
3
3
 
4
4
  import ast
5
+ import json
5
6
  import logging
6
- import re
7
7
  import subprocess
8
8
  import tempfile
9
9
  from collections import defaultdict
@@ -179,9 +179,8 @@ class PythonValidator(BaseValidator):
179
179
  for file in Path(codebase_path).glob("*.py"):
180
180
  processed[file.stem] = PythonLinterMessages()
181
181
 
182
- # Run ruff linter
182
+ # Run ruff linter with JSON output
183
183
  ruff_bin = find_ruff_bin()
184
- env = {"NO_COLOR": "1"}
185
184
 
186
185
  ruff_exec = subprocess.run(
187
186
  [
@@ -189,9 +188,9 @@ class PythonValidator(BaseValidator):
189
188
  "check",
190
189
  "--select",
191
190
  "E,F6,F7,F8,SIM,PLC,PLE,PLR,PLW",
191
+ "--output-format=json",
192
192
  codebase_path,
193
193
  ],
194
- env=env,
195
194
  text=True,
196
195
  capture_output=True,
197
196
  check=False,
@@ -199,30 +198,34 @@ class PythonValidator(BaseValidator):
199
198
  )
200
199
  ruff_output = ruff_exec.stdout
201
200
 
202
- # Parse ruff output
203
- if "All checks passed!" in ruff_output:
204
- return processed # no errors or warnings
205
-
206
- pattern = r"(.*):([0-9]*):([0-9]*): ([A-Za-z0-9]*):? (?:\[\*\] )?(.*)\n"
207
- errors = re.findall(pattern, ruff_output)
201
+ # Parse JSON output
202
+ try:
203
+ diagnostics = json.loads(ruff_output)
204
+ except json.JSONDecodeError as e:
205
+ raise RuntimeError(f"Failed to parse ruff JSON output: {e}")
208
206
 
209
- if errors == []: # output could not be parsed
210
- raise RuntimeError("ruff's output could not be parsed")
207
+ if not diagnostics:
208
+ return processed # no errors or warnings
211
209
 
212
- try:
213
- for error in errors:
214
- filename, line, column, symbol, message = error
215
- processed[Path(filename).stem].add(
216
- PythonLinterMessage(
217
- type=TYPE_FROM_SYMBOL[re.sub(r"[^A-Za-z]+", "", symbol)],
218
- symbol=symbol,
219
- line=int(line),
220
- column=int(column),
221
- message=message,
222
- )
210
+ for diagnostic in diagnostics:
211
+ filename = diagnostic["filename"]
212
+ code = diagnostic["code"]
213
+ location = diagnostic["location"]
214
+ message = diagnostic["message"]
215
+
216
+ # Extract alphabetic prefix from code for type mapping
217
+ alpha_prefix = "".join(c for c in code if c.isalpha())
218
+ error_type = TYPE_FROM_SYMBOL.get(alpha_prefix, "warning")
219
+
220
+ processed[Path(filename).stem].add(
221
+ PythonLinterMessage(
222
+ type=error_type,
223
+ symbol=code,
224
+ line=location["row"],
225
+ column=location["column"],
226
+ message=message,
223
227
  )
224
- except Exception: # output not in expected format
225
- raise RuntimeError("ruff's output not in expected format")
228
+ )
226
229
 
227
230
  return processed
228
231
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: General framework for synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -15,36 +15,34 @@ Classifier: Programming Language :: Python :: 3.13
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Classifier: Topic :: Software Development
17
17
  Requires-Python: >=3.10
18
- Requires-Dist: anyascii<1.0,>=0.3.3
19
- Requires-Dist: datasets>=4.0.0
20
- Requires-Dist: duckdb==1.1.3
21
- Requires-Dist: faker==20.1.0
22
- Requires-Dist: httpx-retries>=0.4.2
23
- Requires-Dist: httpx>=0.27.2
24
- Requires-Dist: huggingface-hub>=0.34.4
18
+ Requires-Dist: anyascii<1,>=0.3.3
19
+ Requires-Dist: duckdb<2,>=1.1.3
20
+ Requires-Dist: faker<21,>=20.1.0
21
+ Requires-Dist: httpx-retries<1,>=0.4.2
22
+ Requires-Dist: httpx<1,>=0.27.2
23
+ Requires-Dist: huggingface-hub<2,>=1.0.1
25
24
  Requires-Dist: jinja2<4,>=3.1.6
26
- Requires-Dist: json-repair==0.48.0
27
- Requires-Dist: jsonpath-rust-bindings>=1.0
28
- Requires-Dist: litellm==1.73.6
29
- Requires-Dist: lxml>=6.0.2
30
- Requires-Dist: marko==2.1.2
31
- Requires-Dist: networkx==3.0
32
- Requires-Dist: numpy>=1.23.5
33
- Requires-Dist: pandas>=1.5.3
34
- Requires-Dist: prompt-toolkit>=3.0.0
35
- Requires-Dist: pyarrow>=19.0.1
36
- Requires-Dist: pydantic>=2.9.2
37
- Requires-Dist: pydantic[email]>=2.9.2
38
- Requires-Dist: pygments>=2.19.2
39
- Requires-Dist: python-json-logger==2.0.7
40
- Requires-Dist: pyyaml>=6.0.1
25
+ Requires-Dist: json-repair<1,>=0.48.0
26
+ Requires-Dist: jsonpath-rust-bindings<2,>=1.0
27
+ Requires-Dist: litellm<2,>=1.73.6
28
+ Requires-Dist: lxml<7,>=6.0.2
29
+ Requires-Dist: marko<3,>=2.1.2
30
+ Requires-Dist: networkx<4,>=3.0
31
+ Requires-Dist: numpy<3,>=1.23.5
32
+ Requires-Dist: pandas<3,>=2.3.3
33
+ Requires-Dist: prompt-toolkit<4,>=3.0.0
34
+ Requires-Dist: pyarrow<20,>=19.0.1
35
+ Requires-Dist: pydantic[email]<3,>=2.9.2
36
+ Requires-Dist: pygments<3,>=2.19.2
37
+ Requires-Dist: python-json-logger<4,>=3
38
+ Requires-Dist: pyyaml<7,>=6.0.1
41
39
  Requires-Dist: requests<3,>=2.32.2
42
- Requires-Dist: rich>=13.7.1
43
- Requires-Dist: ruff==0.12.3
44
- Requires-Dist: scipy>=1.11.0
45
- Requires-Dist: sqlfluff==3.2.0
46
- Requires-Dist: tiktoken>=0.8.0
47
- Requires-Dist: typer>=0.12.0
40
+ Requires-Dist: rich<14,>=13.7.1
41
+ Requires-Dist: ruff<1,>=0.14.10
42
+ Requires-Dist: scipy<2,>=1.11.0
43
+ Requires-Dist: sqlfluff<4,>=3.2.0
44
+ Requires-Dist: tiktoken<1,>=0.8.0
45
+ Requires-Dist: typer<1,>=0.12.0
48
46
  Description-Content-Type: text/markdown
49
47
 
50
48
  # 🎨 NeMo Data Designer
@@ -1,5 +1,5 @@
1
1
  data_designer/__init__.py,sha256=iCeqRnb640RrL2QpA630GY5Ng7JiDt83Vq0DwLnNugU,461
2
- data_designer/_version.py,sha256=vYqoJTG51NOUmYyL0xt8asRK8vUT4lGAdal_EZ59mvw,704
2
+ data_designer/_version.py,sha256=o3ZTescp-19Z9cvBGq9dQnbppljgzdUYUf98Nov0spY,704
3
3
  data_designer/errors.py,sha256=Z4eN9XwzZvGRdBluSNoSqQYkPPzNQIDf0ET_OqWRZh8,179
4
4
  data_designer/logging.py,sha256=ZsruJ0tx_4NK0PIMyxCZJJ0wJugoDff9UP3PbsdEDxQ,5341
5
5
  data_designer/plugin_manager.py,sha256=xaMX274gdlYLNNPIrAOmJNLaZlG_0ROJ0H29v8t2aKs,2604
@@ -37,7 +37,7 @@ data_designer/config/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMb
37
37
  data_designer/config/base.py,sha256=ypam9XX6dg8Q_55su84WmVExNXsmt5jb3eeW3JLlHwc,2396
38
38
  data_designer/config/column_configs.py,sha256=pjpy5z3Kk7i4WmIjOxdiW5Awpjy5CxQSy0YMy0QxtvA,18961
39
39
  data_designer/config/column_types.py,sha256=EILVM42d4TMl2xbSj5htMsenJwybCHIc_G8AUXyrjWU,7197
40
- data_designer/config/config_builder.py,sha256=bXRFbRsYxLlED3rQN6u0qtBLgRoHLw767q_QtQoTZmI,28151
40
+ data_designer/config/config_builder.py,sha256=n8in3O-hR2j3wJBnZMCoT5NawlobJDWTyNZCIYSgWIo,29241
41
41
  data_designer/config/data_designer_config.py,sha256=D2b4Dl8pR6kCkvPoZ3APxC9pVBqXi5EJMVK1WBZ6ni8,1886
42
42
  data_designer/config/dataset_builders.py,sha256=1pNFy_pkQ5lJ6AVZ43AeTuSbz6yC_l7Ndcyp5yaT8hQ,327
43
43
  data_designer/config/datastore.py,sha256=gEHR2hYlJwD_vzjuaSOMRiYjtwdQhyO9q1afZDrhBCo,7586
@@ -84,7 +84,7 @@ data_designer/engine/analysis/utils/judge_score_processing.py,sha256=rl11e3PxAOQ
84
84
  data_designer/engine/column_generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
85
85
  data_designer/engine/column_generators/registry.py,sha256=Eg6tqNM7mmEPNom1fWF9S5D3qABpMennOHGEGePwJN0,3060
86
86
  data_designer/engine/column_generators/generators/__init__.py,sha256=9eG4WHKyrJcNoK4GEz6BCw_E0Ewo9elQoDN4TLMbAog,137
87
- data_designer/engine/column_generators/generators/base.py,sha256=48kQHNcT6k6-anMRoSelgoPhsdrb90n6BQqc45ZE7n8,3327
87
+ data_designer/engine/column_generators/generators/base.py,sha256=zurwtamM2l3shLa4SLjUOE0zOTDozQ5wPGAvDkrNYqE,3231
88
88
  data_designer/engine/column_generators/generators/embedding.py,sha256=xYnFWRJ2W7JuwK8CRIUhv4QiT_DCGDuQkuHFKXCxrow,1724
89
89
  data_designer/engine/column_generators/generators/expression.py,sha256=7xniEj8aPscWDYLrnNbG2mF3s08C7aR8ZgNUCzr_x8g,2539
90
90
  data_designer/engine/column_generators/generators/llm_completion.py,sha256=XqpXzYczbZ6efUIVuvcm2O_mXBnXCMAvcjeyaB5dIFA,5301
@@ -167,7 +167,7 @@ data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet,sha2
167
167
  data_designer/engine/validators/__init__.py,sha256=lMouN5BTbDNi31KfTQNjV7qrL46q-ssejXNT24iDTGI,652
168
168
  data_designer/engine/validators/base.py,sha256=lzO4gRoCDvh3BFP0sM4OjcwG-84qodYFW-G9NEOk3Cs,964
169
169
  data_designer/engine/validators/local_callable.py,sha256=oCUXj_NRt0gVqUIh0fLrvw-iURDR6OHFrVi5GOMhXj8,1387
170
- data_designer/engine/validators/python.py,sha256=DK6xxTzUHD8JUEmfP14W4hKdSb45ifbmvrWoU_o_l7Y,7871
170
+ data_designer/engine/validators/python.py,sha256=jAp1u8yLjqfebh60xGapkHVjMz58WHB0QjfMc2zQCaY,7894
171
171
  data_designer/engine/validators/remote.py,sha256=jtDIvWzfHh17m2ac_Fp93p49Th8RlkBzzih2jiqD7gk,2929
172
172
  data_designer/engine/validators/sql.py,sha256=bxbyxPxDT9yuwjhABVEY40iR1pzWRFi65WU4tPgG2bE,2250
173
173
  data_designer/essentials/__init__.py,sha256=eHuZFJTmeRf_b6KQZ2vZeqy1afJ7y7RMTm7q4Jrg58s,1012
@@ -179,8 +179,8 @@ data_designer/plugins/__init__.py,sha256=c_V7q4QhfVoNf_uc9UwmXCsWqwtyWogI7YoN_0P
179
179
  data_designer/plugins/errors.py,sha256=yPIHpSddEr-o9ZcNVibb2hI-73O15Kg_Od8SlmQlnRs,297
180
180
  data_designer/plugins/plugin.py,sha256=a2KfoCNhYa8U0uQrPSBWfuyjXOb5WeITzFRpEdZFo6s,2516
181
181
  data_designer/plugins/registry.py,sha256=c0X03TnA_J60RWpxaVJEmtIXKvA9up-LznrUHXDcYxg,3012
182
- data_designer-0.2.1.dist-info/METADATA,sha256=StxkeuCq3NdJmhiMMeaXpWJzF4zauPid6O7haFw06VU,7626
183
- data_designer-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
184
- data_designer-0.2.1.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
185
- data_designer-0.2.1.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
186
- data_designer-0.2.1.dist-info/RECORD,,
182
+ data_designer-0.2.2.dist-info/METADATA,sha256=kcCjCe9CSOS7xenYsG6NduNpMm5ELNmRBBv3goYAqoY,7636
183
+ data_designer-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
184
+ data_designer-0.2.2.dist-info/entry_points.txt,sha256=NWWWidyDxN6CYX6y664PhBYMhbaYTQTyprqfYAgkyCg,57
185
+ data_designer-0.2.2.dist-info/licenses/LICENSE,sha256=cSWJDwVqHyQgly8Zmt3pqXJ2eQbZVYwN9qd0NMssxXY,11336
186
+ data_designer-0.2.2.dist-info/RECORD,,