tfds-nightly 4.9.9.dev202508260044__py3-none-any.whl → 4.9.9.dev202508280044__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -107,17 +107,21 @@ def array_datatype_converter(
107
107
  elif enp.lazy.is_np_dtype(field.data_type):
108
108
  field_dtype = field.data_type
109
109
 
110
+ description = croissant_utils.extract_localized_string(
111
+ field.description, field_name='description'
112
+ )
113
+
110
114
  if len(field.array_shape_tuple) == 1:
111
- return sequence_feature.Sequence(feature, doc=field.description)
115
+ return sequence_feature.Sequence(feature, doc=description)
112
116
  elif (-1 in field.array_shape_tuple) or (field_dtype is None):
113
117
  for _ in range(len(field.array_shape_tuple)):
114
- feature = sequence_feature.Sequence(feature, doc=field.description)
118
+ feature = sequence_feature.Sequence(feature, doc=description)
115
119
  return feature
116
120
  else:
117
121
  return tensor_feature.Tensor(
118
122
  shape=field.array_shape_tuple,
119
123
  dtype=field_dtype,
120
- doc=field.description,
124
+ doc=description,
121
125
  )
122
126
 
123
127
 
@@ -151,6 +155,9 @@ def datatype_converter(
151
155
  }
152
156
 
153
157
  field_data_type = field.data_type
158
+ description = croissant_utils.extract_localized_string(
159
+ field.description, field_name='description'
160
+ )
154
161
 
155
162
  if not field_data_type:
156
163
  # Fields with sub fields are of type None.
@@ -162,12 +169,12 @@ def datatype_converter(
162
169
  )
163
170
  for subfield in field.sub_fields
164
171
  },
165
- doc=field.description,
172
+ doc=description,
166
173
  )
167
174
  else:
168
175
  feature = None
169
176
  elif field_data_type == bytes:
170
- feature = text_feature.Text(doc=field.description)
177
+ feature = text_feature.Text(doc=description)
171
178
  elif field_data_type in dtype_mapping:
172
179
  feature = dtype_mapping[field_data_type]
173
180
  elif enp.lazy.is_np_dtype(field_data_type):
@@ -175,9 +182,9 @@ def datatype_converter(
175
182
  # We return a text feature for date-time features (mlc.DataType.DATE,
176
183
  # mlc.DataType.DATETIME, and mlc.DataType.TIME).
177
184
  elif field_data_type == pd.Timestamp or field_data_type == datetime.time:
178
- feature = text_feature.Text(doc=field.description)
185
+ feature = text_feature.Text(doc=description)
179
186
  elif field_data_type == mlc.DataType.IMAGE_OBJECT:
180
- feature = image_feature.Image(doc=field.description)
187
+ feature = image_feature.Image(doc=description)
181
188
  elif field_data_type == mlc.DataType.BOUNDING_BOX:
182
189
  # TFDS uses REL_YXYX by default, but Hugging Face doesn't enforce a format.
183
190
  if bbox_format := field.source.format:
@@ -190,14 +197,14 @@ def datatype_converter(
190
197
  f'{[format.value for format in bb_utils.BBoxFormat]}'
191
198
  ) from e
192
199
  feature = bounding_boxes.BBoxFeature(
193
- doc=field.description, bbox_format=bbox_format
200
+ doc=description, bbox_format=bbox_format
194
201
  )
195
202
  elif field_data_type == mlc.DataType.AUDIO_OBJECT:
196
203
  feature = audio_feature.Audio(
197
- doc=field.description, sample_rate=field.source.sampling_rate
204
+ doc=description, sample_rate=field.source.sampling_rate
198
205
  )
199
206
  elif field_data_type == mlc.DataType.VIDEO_OBJECT:
200
- feature = video_feature.Video(doc=field.description)
207
+ feature = video_feature.Video(doc=description)
201
208
  else:
202
209
  raise ValueError(
203
210
  f'Unknown data type: {field_data_type} for field {field.id}.'
@@ -262,7 +262,12 @@ def test_datatype_converter_complex(
262
262
  subfield_types: Dict[str, Type[Any]] | None,
263
263
  ):
264
264
  actual_feature = croissant_builder.datatype_converter(mlc_field)
265
- assert actual_feature.doc.desc == mlc_field.description
265
+ expected_description = mlc_field.description
266
+ if isinstance(expected_description, dict):
267
+ expected_description = expected_description.get(
268
+ "en", next(iter(expected_description.values()))
269
+ )
270
+ assert actual_feature.doc.desc == expected_description
266
271
  assert isinstance(actual_feature, feature_type)
267
272
  if subfield_types is not None:
268
273
  for feature_name in actual_feature.keys():
@@ -271,6 +276,25 @@ def test_datatype_converter_complex(
271
276
  )
272
277
 
273
278
 
279
def test_datatype_converter_multilingual_description():
  """Checks that a localized field description resolves to a single string."""
  # Each case pairs a language -> text mapping with the doc text expected on
  # the converted feature: the 'en' entry when there is one, otherwise the
  # entry of the first key in the mapping.
  cases = (
      (
          {"en": "English desc", "fr": "Description française"},
          "English desc",
      ),
      (
          {"de": "Deutsche Beschreibung", "fr": "Description française"},
          "Deutsche Beschreibung",
      ),
  )
  for localized_description, expected_doc in cases:
    field = mlc.Field(
        data_types=mlc.DataType.TEXT,
        description=localized_description,
    )
    feature = croissant_builder.datatype_converter(field)
    assert feature.doc.desc == expected_doc
296
+
297
+
274
298
  def test_datatype_converter_none():
275
299
  field = mlc.Field(
276
300
  name="my_field", id="my_field", description="Field with empty data type."
@@ -63,6 +63,65 @@ def get_croissant_version(version: str | None) -> str | None:
63
63
  return version
64
64
 
65
65
 
66
def extract_localized_string(
    attribute: str | dict[str, str] | None,
    language: str | None = None,
    field_name: str = "text field",
) -> str | None:
  """Returns the text of a potentially localized attribute in one language.

  Some attributes in Croissant (e.g., `name` and `description`) can be
  localized, meaning that they can be either simple strings, or dictionaries
  mapping language codes to strings (e.g., `{"en": "English Name", "fr": "Nom
  français"}`). This function extracts the text in the specified language from
  such an attribute.

  Args:
    attribute: The object containing the text, which can be a simple string, a
      dictionary mapping language codes to strings, or None.
    language: The desired language code. It is only consulted when `attribute`
      is a dictionary. If None, a heuristic is used: 'en' is preferred,
      otherwise the first language in the dictionary (in iteration order).
    field_name: The name of the field being processed (e.g., "name",
      "description"), used for error messages.

  Returns:
    The text string in the desired language, `attribute` itself if it is
    already a plain string, or None if `attribute` is None.

  Raises:
    ValueError: If `attribute` is an empty dictionary, or if the specified
      language is not one of its keys.
    TypeError: If `attribute` is not a str, dict, or None.
  """
  # Non-localized values pass through untouched, whatever `language` is.
  if attribute is None or isinstance(attribute, str):
    return attribute

  if not isinstance(attribute, dict):
    raise TypeError(
        f"{field_name} must be a string, dictionary, or None. Got"
        f" {type(attribute)}"
    )

  # Reject an empty dictionary up front so that the error names the real
  # problem even when a language was requested, instead of reporting that the
  # language is missing from an empty key list.
  if not attribute:
    raise ValueError(f"Dataset `{field_name}` dictionary is empty.")

  if language is None:
    # Try a heuristic language, e.g., 'en'.
    if "en" in attribute:
      return attribute["en"]
    # Otherwise, take the first language in the dict. The dictionary is known
    # to be non-empty here, so `next` cannot raise StopIteration.
    return next(iter(attribute.values()))

  if language not in attribute:
    raise ValueError(
        f"Language '{language}' not found in {field_name} keys:"
        f" {list(attribute.keys())}."
    )
  return attribute[language]
123
+
124
+
66
125
  def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
67
126
  """Returns dataset name of the given MLcroissant dataset.
68
127
 
@@ -73,26 +132,14 @@ def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
73
132
  """
74
133
  if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
75
134
  return url.removeprefix(_HUGGINGFACE_URL_PREFIX)
76
- name = dataset.metadata.name
77
- if isinstance(name, dict):
78
- if language is None:
79
- # Try a heuristic language, e.g., 'en'.
80
- if "en" in name:
81
- return name["en"]
82
- # Otherwise, take the first language in the dict.
83
- try:
84
- first_lang = next(iter(name))
85
- return name[first_lang]
86
- except StopIteration as exc:
87
- raise ValueError("Dataset name dictionary is empty.") from exc
88
- elif language not in dataset.metadata.name:
89
- raise ValueError(
90
- f"Language {language} not found in dataset names {name}."
91
- )
92
- else:
93
- return name[language]
94
- # At this point, name is not a dict anymore.
95
- return typing.cast(str, name)
135
+ name = extract_localized_string(
136
+ dataset.metadata.name, language=language, field_name="name"
137
+ )
138
+ if name is None:
139
+ # This case should ideally be prevented by mlcroissant's validation
140
+ # ensuring metadata.name is not None.
141
+ raise ValueError("Dataset name is missing.")
142
+ return name
96
143
 
97
144
 
98
145
  def get_tfds_dataset_name(
@@ -33,7 +33,131 @@ from tensorflow_datasets.core.utils import croissant_utils
33
33
  def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
34
34
  metadata = mlc.Metadata(name=croissant_name, url=croissant_url)
35
35
  dataset = mlc.Dataset.from_metadata(metadata)
36
- assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
36
+ assert (
37
+ croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
38
+ ), f'Expected TFDS name: {tfds_name}'
39
+
40
+
41
@pytest.mark.parametrize(
    ('attribute', 'language', 'expected_text'),
    [
        # Dictionary without an explicit language: 'en' is picked.
        ({'en': 'English Text', 'fr': 'Texte Français'}, None, 'English Text'),
        # Dictionary without 'en': the first entry is picked.
        (
            {'de': 'Deutscher Text', 'fr': 'Texte Français'},
            None,
            'Deutscher Text',
        ),
        # Dictionary with an explicit language.
        (
            {'en': 'English Text', 'fr': 'Texte Français'},
            'fr',
            'Texte Français',
        ),
        # Plain strings are returned as is, with or without a language.
        ('Simple Text', None, 'Simple Text'),
        ('Simple Text', 'en', 'Simple Text'),
        # None stays None.
        (None, None, None),
    ],
)
def test_extract_localized_string(attribute, language, expected_text):
  """Checks the value returned for str, dict and None attributes."""
  actual_text = croissant_utils.extract_localized_string(
      attribute, language=language
  )
  assert actual_text == expected_text
65
+
66
+
67
def test_extract_localized_string_raises():
  """Covers the error cases of `extract_localized_string`."""
  localized_text = {'en': 'English Text', 'fr': 'Texte Français'}

  # The requested language is not a key of the dictionary.
  expected_message = r"Language 'de' not found in text field keys:"
  with pytest.raises(ValueError, match=expected_message):
    croissant_utils.extract_localized_string(localized_text, language='de')

  # The dictionary has no entry at all.
  expected_message = 'Dataset `text field` dictionary is empty'
  with pytest.raises(ValueError, match=expected_message):
    croissant_utils.extract_localized_string({}, language=None)

  # The attribute is neither a str, a dict, nor None.
  expected_message = 'must be a string, dictionary, or None'
  with pytest.raises(TypeError, match=expected_message):
    croissant_utils.extract_localized_string(123)
86
+
87
+
88
@pytest.mark.parametrize(
    ('croissant_name', 'language', 'expected_name'),
    [
        # Localized name, no language requested: 'en' is picked.
        ({'en': 'English Name', 'fr': 'Nom Français'}, None, 'English Name'),
        # Localized name without 'en': the first entry is picked.
        (
            {'de': 'Deutscher Name', 'fr': 'Nom Français'},
            None,
            'Deutscher Name',
        ),
        # Localized name with an explicit language.
        ({'en': 'English Name', 'fr': 'Nom Français'}, 'fr', 'Nom Français'),
        # Plain string name.
        ('Simple Name', None, 'Simple Name'),
    ],
)
def test_get_dataset_name(croissant_name, language, expected_name):
  """Checks the dataset name resolved from plain and localized names."""
  metadata = mlc.Metadata(
      name=croissant_name,
      ctx=mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1'),
      url=None,
  )
  dataset = mlc.Dataset.from_metadata(metadata)
  actual_name = croissant_utils.get_dataset_name(dataset, language=language)
  assert actual_name == expected_name
109
+
110
+
111
def test_get_dataset_name_raises():
  """Covers the error cases of `get_dataset_name` for localized names."""
  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')

  # The requested language is not among the localized names.
  localized_name = {'en': 'English Name', 'fr': 'Nom Français'}
  dataset_lang_not_found = mlc.Dataset.from_metadata(
      mlc.Metadata(name=localized_name, ctx=ctx, url=None)
  )
  expected_message = r"Language 'de' not found in name keys:"
  with pytest.raises(ValueError, match=expected_message):
    croissant_utils.get_dataset_name(dataset_lang_not_found, language='de')

  # The name is a dictionary without any entry.
  dataset_empty_dict = mlc.Dataset.from_metadata(
      mlc.Metadata(name={}, ctx=ctx, url=None)
  )
  expected_message = 'Dataset `name` dictionary is empty.'
  with pytest.raises(ValueError, match=expected_message):
    croissant_utils.get_dataset_name(dataset_empty_dict, language=None)
128
+
129
+
130
def test_get_dataset_name_url_precedence():
  """Checks when the dataset URL is used instead of the dataset name."""
  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  # Each case is (name, url, expected dataset name).
  cases = (
      # A Hugging Face dataset URL takes precedence over a plain name.
      (
          'Should Be Ignored',
          'https://huggingface.co/datasets/user/dataset_name',
          'user/dataset_name',
      ),
      # The same holds when the name is a dictionary.
      (
          {'en': 'Should Be Ignored'},
          'https://huggingface.co/datasets/another/other_dataset',
          'another/other_dataset',
      ),
      # Any other URL leaves the name in charge.
      (
          'Not Ignored',
          'https://example.com/dataset',
          'Not Ignored',
      ),
  )
  for name, url, expected_name in cases:
    metadata = mlc.Metadata(name=name, ctx=ctx, url=url)
    dataset = mlc.Dataset.from_metadata(metadata)
    assert croissant_utils.get_dataset_name(dataset) == expected_name
37
161
 
38
162
 
39
163
  @pytest.mark.parametrize(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tfds-nightly
3
- Version: 4.9.9.dev202508260044
3
+ Version: 4.9.9.dev202508280044
4
4
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
5
5
  Home-page: https://github.com/tensorflow/datasets
6
6
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -141,8 +141,8 @@ tensorflow_datasets/core/data_sources/python_test.py,sha256=O3yqMPx40JlHN0uFfZPN
141
141
  tensorflow_datasets/core/dataset_builders/__init__.py,sha256=StTA3euephqDZdpTzJQgfWNqB5inZosrAhaWg2BOeio,1945
142
142
  tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=1a-5hVjf9t24SD9fWzDDuKoOrA-Vmydf5QxvU7ap-sI,9263
143
143
  tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=yhRwrznK78MvHeWGRggnMTiyx_SlR1z30iD5VU3Gweo,13096
144
- tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=XmnbIKiEN9OnY_RC8P7-83hbUfvtuJhbm24HfNFpiQs,17088
145
- tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=_8JVvhkv_QRUhN4GEw6V1PEryJXp8-DLzuVKzjkozgo,15370
144
+ tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=nmRIRZZGJjXtJgcvlTOsNKbqsAjosjn_M_zOu86uc04,17253
145
+ tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=ordMGrhNh-S1MjfY0QO8HUnsangqJCQCo3wCVBvMToA,16220
146
146
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=Loq3qeGk1Ias-d2oT_dK47BRNgTA4LKJchNGh7aA4a0,18313
147
147
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=6N3DLsry9LhDqhpleaoXrrhaGiLJMBgUlwDnAji-1fI,4389
148
148
  tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=eaCtjN5Vg4rK8JD3auA4PhF9mjH5HvQ9dslDX8LbwyM,11907
@@ -245,8 +245,8 @@ tensorflow_datasets/core/utils/bool_utils_test.py,sha256=rwFRcYV0wBknvYODjeTgRDq
245
245
  tensorflow_datasets/core/utils/colormap.csv,sha256=DDayUU9R19cxhcG3fj4cFwhI46W20U7ofBG0kToUHOw,2732
246
246
  tensorflow_datasets/core/utils/conversion_utils.py,sha256=V8kFmJu38op7-8ufZvEn0fLOH8FMkjQebQ1NstIMRYo,6747
247
247
  tensorflow_datasets/core/utils/conversion_utils_test.py,sha256=rP_nbzQWzmZc_GXp3Y6TirwIGJqiQbF-JtY3B1tOuN0,5346
248
- tensorflow_datasets/core/utils/croissant_utils.py,sha256=9-_j86KKKkfxgg0aAM1zxlqCdkaC-0p9XzdWjSLmOwk,6265
249
- tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=UdkAVYDTPm1L0zmMESScurV_IMA5K3qAKmL_umeMJZI,4497
248
+ tensorflow_datasets/core/utils/croissant_utils.py,sha256=Fxx5Zeti24mMQ4BZst4W28dhxonSr1NhHGVn3W1N8j8,7986
249
+ tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=ftyUNMIkzZZB10VlNA2gS7oclLhC4eGTrGJURgzQjwM,8710
250
250
  tensorflow_datasets/core/utils/docs.py,sha256=nRE4d8wxYZav8AcT3dkiY0yplAJBx1hygWxkeKj_V7I,1412
251
251
  tensorflow_datasets/core/utils/dtype_utils.py,sha256=LvDe1hbgQem57RiqXjG9U5Roj8-1KkBMmSYTtgctx2U,3246
252
252
  tensorflow_datasets/core/utils/dtype_utils_test.py,sha256=-Qe2fQzDO5sjS36ZL-dY9w0tNrJXokIoSRFEQCv5dQA,3259
@@ -2471,10 +2471,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
2471
2471
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
2472
2472
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
2473
2473
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
2474
- tfds_nightly-4.9.9.dev202508260044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2475
- tfds_nightly-4.9.9.dev202508260044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2476
- tfds_nightly-4.9.9.dev202508260044.dist-info/METADATA,sha256=OlIMhl94mKrf1q3B2umMYXvVuSZqZtNXZfwqnqGm3-0,11291
2477
- tfds_nightly-4.9.9.dev202508260044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2478
- tfds_nightly-4.9.9.dev202508260044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2479
- tfds_nightly-4.9.9.dev202508260044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2480
- tfds_nightly-4.9.9.dev202508260044.dist-info/RECORD,,
2474
+ tfds_nightly-4.9.9.dev202508280044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2475
+ tfds_nightly-4.9.9.dev202508280044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2476
+ tfds_nightly-4.9.9.dev202508280044.dist-info/METADATA,sha256=NYDGgvfbautnlGWNWg2_8pOq9eblXgaPjGU-2__DPco,11291
2477
+ tfds_nightly-4.9.9.dev202508280044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2478
+ tfds_nightly-4.9.9.dev202508280044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2479
+ tfds_nightly-4.9.9.dev202508280044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2480
+ tfds_nightly-4.9.9.dev202508280044.dist-info/RECORD,,