data-designer-config 0.4.0rc1__tar.gz → 0.4.0rc2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/PKG-INFO +1 -1
  2. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/_version.py +2 -2
  3. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/models.py +45 -14
  4. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_models.py +147 -9
  5. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/.gitignore +0 -0
  6. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/README.md +0 -0
  7. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/pyproject.toml +0 -0
  8. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/__init__.py +0 -0
  9. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/__init__.py +0 -0
  10. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/column_profilers.py +0 -0
  11. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/column_statistics.py +0 -0
  12. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/dataset_profiler.py +0 -0
  13. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/utils/errors.py +0 -0
  14. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/analysis/utils/reporting.py +0 -0
  15. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/base.py +0 -0
  16. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/column_configs.py +0 -0
  17. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/column_types.py +0 -0
  18. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/config_builder.py +0 -0
  19. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/data_designer_config.py +0 -0
  20. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/dataset_builders.py +0 -0
  21. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/dataset_metadata.py +0 -0
  22. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/default_model_settings.py +0 -0
  23. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/errors.py +0 -0
  24. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/interface.py +0 -0
  25. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/preview_results.py +0 -0
  26. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/processors.py +0 -0
  27. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/run_config.py +0 -0
  28. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/sampler_constraints.py +0 -0
  29. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/sampler_params.py +0 -0
  30. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/seed.py +0 -0
  31. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/seed_source.py +0 -0
  32. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/seed_source_types.py +0 -0
  33. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/testing/__init__.py +0 -0
  34. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/testing/fixtures.py +0 -0
  35. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/code_lang.py +0 -0
  36. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/constants.py +0 -0
  37. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/errors.py +0 -0
  38. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/info.py +0 -0
  39. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/io_helpers.py +0 -0
  40. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/misc.py +0 -0
  41. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/numerical_helpers.py +0 -0
  42. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/type_helpers.py +0 -0
  43. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/utils/visualization.py +0 -0
  44. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/config/validator_params.py +0 -0
  45. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/errors.py +0 -0
  46. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/lazy_heavy_imports.py +0 -0
  47. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/logging.py +0 -0
  48. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/plugin_manager.py +0 -0
  49. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/plugins/__init__.py +0 -0
  50. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/plugins/errors.py +0 -0
  51. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/plugins/plugin.py +0 -0
  52. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/src/data_designer/plugins/registry.py +0 -0
  53. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/analysis/conftest.py +0 -0
  54. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/analysis/test_column_statistics.py +0 -0
  55. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/analysis/test_dataset_profiler_results.py +0 -0
  56. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/analysis/utils/test_reporting.py +0 -0
  57. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_columns.py +0 -0
  58. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_config_builder.py +0 -0
  59. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_data_designer_config.py +0 -0
  60. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_default_model_settings.py +0 -0
  61. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_processors.py +0 -0
  62. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_sampler_constraints.py +0 -0
  63. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_sampler_params.py +0 -0
  64. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_seed.py +0 -0
  65. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_seed_source.py +0 -0
  66. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/test_validator_params.py +0 -0
  67. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/__init__.py +0 -0
  68. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_code_lang.py +0 -0
  69. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_info.py +0 -0
  70. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_io_helpers.py +0 -0
  71. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_misc.py +0 -0
  72. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_type_helpers.py +0 -0
  73. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/config/utils/test_visualization.py +0 -0
  74. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/conftest.py +0 -0
  75. {data_designer_config-0.4.0rc1 → data_designer_config-0.4.0rc2}/tests/test_logging.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-designer-config
3
- Version: 0.4.0rc1
3
+ Version: 0.4.0rc2
4
4
  Summary: Configuration layer for DataDesigner synthetic data generation
5
5
  License-Expression: Apache-2.0
6
6
  Classifier: Development Status :: 4 - Beta
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.4.0rc1'
32
- __version_tuple__ = version_tuple = (0, 4, 0, 'rc1')
31
+ __version__ = version = '0.4.0rc2'
32
+ __version_tuple__ = version_tuple = (0, 4, 0, 'rc2')
33
33
 
34
34
  __commit_id__ = commit_id = None
@@ -3,6 +3,7 @@
3
3
 
4
4
  from __future__ import annotations
5
5
 
6
+ import json
6
7
  import logging
7
8
  from abc import ABC, abstractmethod
8
9
  from enum import Enum
@@ -65,7 +66,7 @@ class ModalityContext(ABC, BaseModel):
65
66
  data_type: ModalityDataType
66
67
 
67
68
  @abstractmethod
68
- def get_context(self, record: dict) -> dict[str, Any]: ...
69
+ def get_contexts(self, record: dict) -> list[dict[str, Any]]: ...
69
70
 
70
71
 
71
72
  class ImageContext(ModalityContext):
@@ -81,25 +82,53 @@ class ImageContext(ModalityContext):
81
82
  modality: Modality = Modality.IMAGE
82
83
  image_format: ImageFormat | None = None
83
84
 
84
- def get_context(self, record: dict) -> dict[str, Any]:
85
- """Get the context for the image modality.
85
+ def get_contexts(self, record: dict) -> list[dict[str, Any]]:
86
+ """Get the contexts for the image modality.
86
87
 
87
88
  Args:
88
- record: The record containing the image data.
89
+ record: The record containing the image data. The data can be:
90
+ - A JSON serialized list of strings
91
+ - A list of strings
92
+ - A single string
89
93
 
90
94
  Returns:
91
- The context for the image modality.
95
+ A list of image contexts.
92
96
  """
93
- context = dict(type="image_url")
94
- context_value = record[self.column_name]
95
- if self.data_type == ModalityDataType.URL:
96
- context["image_url"] = context_value
97
+ raw_value = record[self.column_name]
98
+
99
+ # Normalize to list of strings
100
+ if isinstance(raw_value, str):
101
+ # Try to parse as JSON first
102
+ try:
103
+ parsed_value = json.loads(raw_value)
104
+ if isinstance(parsed_value, list):
105
+ context_values = parsed_value
106
+ else:
107
+ context_values = [raw_value]
108
+ except (json.JSONDecodeError, TypeError):
109
+ context_values = [raw_value]
110
+ elif isinstance(raw_value, list):
111
+ context_values = raw_value
112
+ elif hasattr(raw_value, "__iter__") and not isinstance(raw_value, (str, bytes, dict)):
113
+ # Handle array-like objects (numpy arrays, pandas Series, etc.)
114
+ context_values = list(raw_value)
97
115
  else:
98
- context["image_url"] = {
99
- "url": f"data:image/{self.image_format.value};base64,{context_value}",
100
- "format": self.image_format.value,
101
- }
102
- return context
116
+ context_values = [raw_value]
117
+
118
+ # Build context list
119
+ contexts = []
120
+ for context_value in context_values:
121
+ context = dict(type="image_url")
122
+ if self.data_type == ModalityDataType.URL:
123
+ context["image_url"] = context_value
124
+ else:
125
+ context["image_url"] = {
126
+ "url": f"data:image/{self.image_format.value};base64,{context_value}",
127
+ "format": self.image_format.value,
128
+ }
129
+ contexts.append(context)
130
+
131
+ return contexts
103
132
 
104
133
  @model_validator(mode="after")
105
134
  def _validate_image_format(self) -> Self:
@@ -399,12 +428,14 @@ class ModelConfig(ConfigBase):
399
428
  inference_parameters: Inference parameters for the model (temperature, top_p, max_tokens, etc.).
400
429
  The generation_type is determined by the type of inference_parameters.
401
430
  provider: Optional model provider name if using custom providers.
431
+ skip_health_check: Whether to skip the health check for this model. Defaults to False.
402
432
  """
403
433
 
404
434
  alias: str
405
435
  model: str
406
436
  inference_parameters: InferenceParamsT = Field(default_factory=ChatCompletionInferenceParams)
407
437
  provider: str | None = None
438
+ skip_health_check: bool = False
408
439
 
409
440
  @property
410
441
  def generation_type(self) -> GenerationType:
@@ -4,6 +4,7 @@
4
4
  import json
5
5
  import tempfile
6
6
  from collections import Counter
7
+ from typing import TYPE_CHECKING
7
8
 
8
9
  import pytest
9
10
  import yaml
@@ -24,22 +25,159 @@ from data_designer.config.models import (
24
25
  UniformDistributionParams,
25
26
  load_model_configs,
26
27
  )
28
+ from data_designer.lazy_heavy_imports import np
27
29
 
30
+ if TYPE_CHECKING:
31
+ import numpy as np
28
32
 
29
- def test_image_context_get_context():
33
+
34
+ def test_image_context_get_contexts_single_string():
35
+ """Test get_contexts with a single string value."""
30
36
  image_context = ImageContext(
31
37
  column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
32
38
  )
33
- assert image_context.get_context({"image_base64": "somebase64encodedimagestring"}) == {
34
- "type": "image_url",
35
- "image_url": {"url": "data:image/png;base64,somebase64encodedimagestring", "format": "png"},
36
- }
39
+ assert image_context.get_contexts({"image_base64": "somebase64encodedimagestring"}) == [
40
+ {
41
+ "type": "image_url",
42
+ "image_url": {"url": "data:image/png;base64,somebase64encodedimagestring", "format": "png"},
43
+ }
44
+ ]
37
45
 
38
46
  image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
39
- assert image_context.get_context({"image_url": "https://example.com/examle_image.png"}) == {
40
- "type": "image_url",
41
- "image_url": "https://example.com/examle_image.png",
42
- }
47
+ assert image_context.get_contexts({"image_url": "https://example.com/examle_image.png"}) == [
48
+ {
49
+ "type": "image_url",
50
+ "image_url": "https://example.com/examle_image.png",
51
+ }
52
+ ]
53
+
54
+
55
+ def test_image_context_get_contexts_list_of_strings():
56
+ """Test get_contexts with a list of strings."""
57
+ image_context = ImageContext(
58
+ column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
59
+ )
60
+ assert image_context.get_contexts({"image_base64": ["image1base64", "image2base64", "image3base64"]}) == [
61
+ {
62
+ "type": "image_url",
63
+ "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
64
+ },
65
+ {
66
+ "type": "image_url",
67
+ "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
68
+ },
69
+ {
70
+ "type": "image_url",
71
+ "image_url": {"url": "data:image/png;base64,image3base64", "format": "png"},
72
+ },
73
+ ]
74
+
75
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
76
+ assert image_context.get_contexts(
77
+ {"image_url": ["https://example.com/image1.png", "https://example.com/image2.png"]}
78
+ ) == [
79
+ {
80
+ "type": "image_url",
81
+ "image_url": "https://example.com/image1.png",
82
+ },
83
+ {
84
+ "type": "image_url",
85
+ "image_url": "https://example.com/image2.png",
86
+ },
87
+ ]
88
+
89
+
90
+ def test_image_context_get_contexts_numpy_array():
91
+ """Test get_contexts with numpy arrays (happens after parquet serialization)."""
92
+ image_context = ImageContext(
93
+ column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
94
+ )
95
+ numpy_array = np.array(["image1base64", "image2base64"])
96
+ assert image_context.get_contexts({"image_base64": numpy_array}) == [
97
+ {
98
+ "type": "image_url",
99
+ "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
100
+ },
101
+ {
102
+ "type": "image_url",
103
+ "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
104
+ },
105
+ ]
106
+
107
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
108
+ numpy_array = np.array(["https://example.com/image1.png", "https://example.com/image2.png"])
109
+ assert image_context.get_contexts({"image_url": numpy_array}) == [
110
+ {
111
+ "type": "image_url",
112
+ "image_url": "https://example.com/image1.png",
113
+ },
114
+ {
115
+ "type": "image_url",
116
+ "image_url": "https://example.com/image2.png",
117
+ },
118
+ ]
119
+
120
+
121
+ def test_image_context_get_contexts_json_serialized_list():
122
+ """Test get_contexts with a JSON serialized list of strings."""
123
+ image_context = ImageContext(
124
+ column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
125
+ )
126
+ json_str = json.dumps(["image1base64", "image2base64"])
127
+ assert image_context.get_contexts({"image_base64": json_str}) == [
128
+ {
129
+ "type": "image_url",
130
+ "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
131
+ },
132
+ {
133
+ "type": "image_url",
134
+ "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
135
+ },
136
+ ]
137
+
138
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
139
+ json_str = json.dumps(["https://example.com/image1.png", "https://example.com/image2.png"])
140
+ assert image_context.get_contexts({"image_url": json_str}) == [
141
+ {
142
+ "type": "image_url",
143
+ "image_url": "https://example.com/image1.png",
144
+ },
145
+ {
146
+ "type": "image_url",
147
+ "image_url": "https://example.com/image2.png",
148
+ },
149
+ ]
150
+
151
+
152
+ def test_image_context_get_contexts_json_string_not_list():
153
+ """Test get_contexts with a JSON string that isn't a list (should treat as single string)."""
154
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
155
+ json_str = json.dumps({"nested": "object"})
156
+ # Should treat the entire JSON string as a single image URL
157
+ assert image_context.get_contexts({"image_url": json_str}) == [
158
+ {
159
+ "type": "image_url",
160
+ "image_url": json_str,
161
+ }
162
+ ]
163
+
164
+
165
+ def test_image_context_get_contexts_invalid_json():
166
+ """Test get_contexts with invalid JSON string (should treat as single string)."""
167
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
168
+ invalid_json = "not a valid json string"
169
+ assert image_context.get_contexts({"image_url": invalid_json}) == [
170
+ {
171
+ "type": "image_url",
172
+ "image_url": invalid_json,
173
+ }
174
+ ]
175
+
176
+
177
+ def test_image_context_get_contexts_empty_list():
178
+ """Test get_contexts with an empty list."""
179
+ image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
180
+ assert image_context.get_contexts({"image_url": []}) == []
43
181
 
44
182
 
45
183
  def test_image_context_validate_image_format():