PyPI - tfds-nightly - Versions diffs - 4.9.9.dev202508260044__py3-none-any.whl → 4.9.9.dev202508280044__py3-none-any.whl - Mend

tfds-nightly 4.9.9.dev202508260044__py3-none-any.whl → 4.9.9.dev202508280044__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

tensorflow_datasets/core/dataset_builders/croissant_builder.py CHANGED Viewed

@@ -107,17 +107,21 @@ def array_datatype_converter(
   elif enp.lazy.is_np_dtype(field.data_type):
     field_dtype = field.data_type
+  description = croissant_utils.extract_localized_string(
+      field.description, field_name='description'
+  )
   if len(field.array_shape_tuple) == 1:
-    return sequence_feature.Sequence(feature, doc=field.description)
+    return sequence_feature.Sequence(feature, doc=description)
   elif (-1 in field.array_shape_tuple) or (field_dtype is None):
     for _ in range(len(field.array_shape_tuple)):
-      feature = sequence_feature.Sequence(feature, doc=field.description)
+      feature = sequence_feature.Sequence(feature, doc=description)
     return feature
   else:
     return tensor_feature.Tensor(
         shape=field.array_shape_tuple,
         dtype=field_dtype,
-        doc=field.description,
+        doc=description,
     )
@@ -151,6 +155,9 @@ def datatype_converter(
   }
   field_data_type = field.data_type
+  description = croissant_utils.extract_localized_string(
+      field.description, field_name='description'
+  )
   if not field_data_type:
     # Fields with sub fields are of type None.
@@ -162,12 +169,12 @@ def datatype_converter(
               )
               for subfield in field.sub_fields
           },
-          doc=field.description,
+          doc=description,
       )
     else:
       feature = None
   elif field_data_type == bytes:
-    feature = text_feature.Text(doc=field.description)
+    feature = text_feature.Text(doc=description)
   elif field_data_type in dtype_mapping:
     feature = dtype_mapping[field_data_type]
   elif enp.lazy.is_np_dtype(field_data_type):
@@ -175,9 +182,9 @@ def datatype_converter(
   # We return a text feature for date-time features (mlc.DataType.DATE,
   # mlc.DataType.DATETIME, and mlc.DataType.TIME).
   elif field_data_type == pd.Timestamp or field_data_type == datetime.time:
-    feature = text_feature.Text(doc=field.description)
+    feature = text_feature.Text(doc=description)
   elif field_data_type == mlc.DataType.IMAGE_OBJECT:
-    feature = image_feature.Image(doc=field.description)
+    feature = image_feature.Image(doc=description)
   elif field_data_type == mlc.DataType.BOUNDING_BOX:
     # TFDS uses REL_YXYX by default, but Hugging Face doesn't enforce a format.
     if bbox_format := field.source.format:
@@ -190,14 +197,14 @@ def datatype_converter(
             f'{[format.value for format in bb_utils.BBoxFormat]}'
         ) from e
     feature = bounding_boxes.BBoxFeature(
-        doc=field.description, bbox_format=bbox_format
+        doc=description, bbox_format=bbox_format
     )
   elif field_data_type == mlc.DataType.AUDIO_OBJECT:
     feature = audio_feature.Audio(
-        doc=field.description, sample_rate=field.source.sampling_rate
+        doc=description, sample_rate=field.source.sampling_rate
     )
   elif field_data_type == mlc.DataType.VIDEO_OBJECT:
-    feature = video_feature.Video(doc=field.description)
+    feature = video_feature.Video(doc=description)
   else:
     raise ValueError(
         f'Unknown data type: {field_data_type} for field {field.id}.'

tensorflow_datasets/core/dataset_builders/croissant_builder_test.py CHANGED Viewed

@@ -262,7 +262,12 @@ def test_datatype_converter_complex(
     subfield_types: Dict[str, Type[Any]] | None,
 ):
   actual_feature = croissant_builder.datatype_converter(mlc_field)
-  assert actual_feature.doc.desc == mlc_field.description
+  expected_description = mlc_field.description
+  if isinstance(expected_description, dict):
+    expected_description = expected_description.get(
+        "en", next(iter(expected_description.values()))
+    )
+  assert actual_feature.doc.desc == expected_description
   assert isinstance(actual_feature, feature_type)
   if subfield_types is not None:
     for feature_name in actual_feature.keys():
@@ -271,6 +276,25 @@ def test_datatype_converter_complex(
       )
+def test_datatype_converter_multilingual_description():
+  """Checks that a dict-valued (localized) description becomes the feature doc."""
+  # With an 'en' entry present, the English text is expected.
+  mlc_field = mlc.Field(
+      data_types=mlc.DataType.TEXT,
+      description={"en": "English desc", "fr": "Description française"},
+  )
+  actual_feature = croissant_builder.datatype_converter(mlc_field)
+  assert actual_feature.doc.desc == "English desc"
+  # Without an 'en' entry, the first entry of the dict ('de') is expected.
+  mlc_field_no_en = mlc.Field(
+      data_types=mlc.DataType.TEXT,
+      description={
+          "de": "Deutsche Beschreibung",
+          "fr": "Description française",
+      },
+  )
+  actual_feature_no_en = croissant_builder.datatype_converter(mlc_field_no_en)
+  assert actual_feature_no_en.doc.desc == "Deutsche Beschreibung"
 def test_datatype_converter_none():
   field = mlc.Field(
       name="my_field", id="my_field", description="Field with empty data type."

tensorflow_datasets/core/utils/croissant_utils.py CHANGED Viewed

@@ -63,6 +63,65 @@ def get_croissant_version(version: str | None) -> str | None:
   return version
+def extract_localized_string(
+    attribute: str | dict[str, str] | None,
+    language: str | None = None,
+    field_name: str = "text field",
+) -> str | None:
+  """Returns the text in the specified language from a potentially localized object.
+  Some attributes in Croissant (e.g., `name` and `description`) can be
+  localized, meaning that they can be either simple strings, or dictionaries
+  mapping language codes to strings (e.g., `{"en": "English Name", "fr": "Nom
+  français"}`). This function extracts the text in the specified language from a
+  potentially localized object.
+  Args:
+    attribute: The object containing the text, which can be a simple string, a
+      dictionary mapping language codes to strings, or None.
+    language: The desired language code. If None, a heuristic is used: 'en' is
+      preferred, otherwise the first available language in the dictionary.
+      Not consulted when `attribute` is a plain string or None.
+    field_name: The name of the field being processed (e.g., "name",
+      "description"), used for error messages.
+  Returns:
+    The text string in the desired language, or None if the input is None. A
+    plain string input is returned unchanged.
+  Raises:
+    ValueError: If `attribute` is an empty dictionary, or if the specified
+      language is not found among its keys.
+    TypeError: If attribute is not a str, dict, or None.
+  """
+  if attribute is None:
+    return None
+  if isinstance(attribute, str):
+    return attribute
+  if not isinstance(attribute, dict):
+    raise TypeError(
+        f"{field_name} must be a string, dictionary, or None. Got"
+        f" {type(attribute)}"
+    )
+  if language is None:
+    # No language requested: prefer the 'en' entry when the dict has one.
+    if "en" in attribute:
+      return attribute["en"]
+    # Otherwise, take the first key in the dict's iteration order; an empty
+    # dict makes next() raise StopIteration, reported below as a ValueError.
+    try:
+      first_lang = next(iter(attribute))
+      return attribute[first_lang]
+    except StopIteration as exc:
+      raise ValueError(f"Dataset `{field_name}` dictionary is empty.") from exc
+  elif language in attribute:
+    return attribute[language]
+  else:
+    raise ValueError(
+        f"Language '{language}' not found in {field_name} keys:"
+        f" {list(attribute.keys())}."
+    )
 def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
   """Returns dataset name of the given MLcroissant dataset.
@@ -73,26 +132,14 @@ def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
   """
   if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
     return url.removeprefix(_HUGGINGFACE_URL_PREFIX)
-  name = dataset.metadata.name
-  if isinstance(name, dict):
-    if language is None:
-      # Try a heuristic language, e.g., 'en'.
-      if "en" in name:
-        return name["en"]
-      # Otherwise, take the first language in the dict.
-      try:
-        first_lang = next(iter(name))
-        return name[first_lang]
-      except StopIteration as exc:
-        raise ValueError("Dataset name dictionary is empty.") from exc
-    elif language not in dataset.metadata.name:
-      raise ValueError(
-          f"Language {language} not found in dataset names {name}."
-      )
-    else:
-      return name[language]
-  # At this point, name is not a dict anymore.
-  return typing.cast(str, name)
+  name = extract_localized_string(
+      dataset.metadata.name, language=language, field_name="name"
+  )
+  if name is None:
+    # This case should ideally be prevented by mlcroissant's validation
+    # ensuring metadata.name is not None.
+    raise ValueError("Dataset name is missing.")
+  return name
 def get_tfds_dataset_name(

tensorflow_datasets/core/utils/croissant_utils_test.py CHANGED Viewed

@@ -33,7 +33,131 @@ from tensorflow_datasets.core.utils import croissant_utils
 def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
   metadata = mlc.Metadata(name=croissant_name, url=croissant_url)
   dataset = mlc.Dataset.from_metadata(metadata)
-  assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
+  assert (
+      croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
+  ), f'Expected TFDS name: {tfds_name}'
+@pytest.mark.parametrize(
+    'attribute,language,expected_text',
+    [
+        # Dict input, no language requested: the 'en' entry is expected.
+        ({'en': 'English Text', 'fr': 'Texte Français'}, None, 'English Text'),
+        # Dict input without 'en': the first entry is expected.
+        (
+            {'de': 'Deutscher Text', 'fr': 'Texte Français'},
+            None,
+            'Deutscher Text',
+        ),
+        # Dict input with an explicitly requested language.
+        (
+            {'en': 'English Text', 'fr': 'Texte Français'},
+            'fr',
+            'Texte Français',
+        ),
+        # Plain strings are expected back unchanged, with or without a language.
+        ('Simple Text', None, 'Simple Text'),
+        ('Simple Text', 'en', 'Simple Text'),
+        # None is expected to map to None.
+        (None, None, None),
+    ],
+)
+def test_extract_localized_string(attribute, language, expected_text):
+  """Checks the value returned for each supported kind of `attribute`."""
+  assert (
+      croissant_utils.extract_localized_string(attribute, language=language)
+      == expected_text
+  )
+def test_extract_localized_string_raises():
+  """Checks the error raised for each invalid input, using the default field name."""
+  # Requested language is not a key of the dict.
+  with pytest.raises(
+      ValueError,
+      match=r"Language 'de' not found in text field keys:",
+  ):
+    croissant_utils.extract_localized_string(
+        {'en': 'English Text', 'fr': 'Texte Français'}, language='de'
+    )
+  # Empty dictionary with no language requested.
+  with pytest.raises(
+      ValueError, match='Dataset `text field` dictionary is empty'
+  ):
+    croissant_utils.extract_localized_string({}, language=None)
+  # Input that is neither a str, a dict, nor None.
+  with pytest.raises(TypeError, match='must be a string, dictionary, or None'):
+    croissant_utils.extract_localized_string(123)
+@pytest.mark.parametrize(
+    'croissant_name,language,expected_name',
+    [
+        # Localized name, no language requested: the 'en' entry is expected.
+        ({'en': 'English Name', 'fr': 'Nom Français'}, None, 'English Name'),
+        # Localized name without 'en': the first entry is expected.
+        (
+            {'de': 'Deutscher Name', 'fr': 'Nom Français'},
+            None,
+            'Deutscher Name',
+        ),
+        # Localized name with an explicitly requested language.
+        ({'en': 'English Name', 'fr': 'Nom Français'}, 'fr', 'Nom Français'),
+        # Plain string name.
+        ('Simple Name', None, 'Simple Name'),
+    ],
+)
+def test_get_dataset_name(croissant_name, language, expected_name):
+  """Checks the dataset name resolved from plain and localized metadata names."""
+  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
+  # url=None, so the name is expected to come from metadata.name.
+  metadata = mlc.Metadata(name=croissant_name, ctx=ctx, url=None)
+  dataset = mlc.Dataset.from_metadata(metadata)
+  assert (
+      croissant_utils.get_dataset_name(dataset, language=language)
+      == expected_name
+  )
+def test_get_dataset_name_raises():
+  """Checks the ValueError messages for unresolvable localized dataset names."""
+  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
+  # Requested language ('de') is not a key of the localized name.
+  metadata_lang_not_found = mlc.Metadata(
+      name={'en': 'English Name', 'fr': 'Nom Français'}, ctx=ctx, url=None
+  )
+  dataset_lang_not_found = mlc.Dataset.from_metadata(metadata_lang_not_found)
+  with pytest.raises(
+      ValueError, match=r"Language 'de' not found in name keys:"
+  ):
+    croissant_utils.get_dataset_name(dataset_lang_not_found, language='de')
+  # Localized name is an empty dictionary and no language is requested.
+  metadata_empty_dict = mlc.Metadata(name={}, ctx=ctx, url=None)
+  dataset_empty_dict = mlc.Dataset.from_metadata(metadata_empty_dict)
+  with pytest.raises(ValueError, match='Dataset `name` dictionary is empty.'):
+    croissant_utils.get_dataset_name(dataset_empty_dict, language=None)
+def test_get_dataset_name_url_precedence():
+  """Checks when the metadata URL, rather than the name, yields the dataset name."""
+  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
+  # A Hugging Face dataset URL takes precedence over a plain string name: the
+  # expected result is the URL with its prefix removed.
+  metadata = mlc.Metadata(
+      name='Should Be Ignored',
+      ctx=ctx,
+      url='https://huggingface.co/datasets/user/dataset_name',
+  )
+  dataset = mlc.Dataset.from_metadata(metadata)
+  assert croissant_utils.get_dataset_name(dataset) == 'user/dataset_name'
+  # The same precedence is expected when the name is a localized dict.
+  metadata_dict_name = mlc.Metadata(
+      name={'en': 'Should Be Ignored'},
+      ctx=ctx,
+      url='https://huggingface.co/datasets/another/other_dataset',
+  )
+  dataset_dict_name = mlc.Dataset.from_metadata(metadata_dict_name)
+  assert (
+      croissant_utils.get_dataset_name(dataset_dict_name)
+      == 'another/other_dataset'
+  )
+  # A non-Hugging Face URL must not override the name.
+  metadata_other_url = mlc.Metadata(
+      name='Not Ignored',
+      ctx=ctx,
+      url='https://example.com/dataset',
+  )
+  dataset_other_url = mlc.Dataset.from_metadata(metadata_other_url)
+  assert croissant_utils.get_dataset_name(dataset_other_url) == 'Not Ignored'
 @pytest.mark.parametrize(

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tfds-nightly
-Version: 4.9.9.dev202508260044
+Version: 4.9.9.dev202508280044
 Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
 Home-page: https://github.com/tensorflow/datasets
 Download-URL: https://github.com/tensorflow/datasets/tags

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/RECORD RENAMED Viewed

@@ -141,8 +141,8 @@ tensorflow_datasets/core/data_sources/python_test.py,sha256=O3yqMPx40JlHN0uFfZPN
 tensorflow_datasets/core/dataset_builders/__init__.py,sha256=StTA3euephqDZdpTzJQgfWNqB5inZosrAhaWg2BOeio,1945
 tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=1a-5hVjf9t24SD9fWzDDuKoOrA-Vmydf5QxvU7ap-sI,9263
 tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=yhRwrznK78MvHeWGRggnMTiyx_SlR1z30iD5VU3Gweo,13096
-tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=XmnbIKiEN9OnY_RC8P7-83hbUfvtuJhbm24HfNFpiQs,17088
-tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=_8JVvhkv_QRUhN4GEw6V1PEryJXp8-DLzuVKzjkozgo,15370
+tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=nmRIRZZGJjXtJgcvlTOsNKbqsAjosjn_M_zOu86uc04,17253
+tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=ordMGrhNh-S1MjfY0QO8HUnsangqJCQCo3wCVBvMToA,16220
 tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=Loq3qeGk1Ias-d2oT_dK47BRNgTA4LKJchNGh7aA4a0,18313
 tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=6N3DLsry9LhDqhpleaoXrrhaGiLJMBgUlwDnAji-1fI,4389
 tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=eaCtjN5Vg4rK8JD3auA4PhF9mjH5HvQ9dslDX8LbwyM,11907
@@ -245,8 +245,8 @@ tensorflow_datasets/core/utils/bool_utils_test.py,sha256=rwFRcYV0wBknvYODjeTgRDq
 tensorflow_datasets/core/utils/colormap.csv,sha256=DDayUU9R19cxhcG3fj4cFwhI46W20U7ofBG0kToUHOw,2732
 tensorflow_datasets/core/utils/conversion_utils.py,sha256=V8kFmJu38op7-8ufZvEn0fLOH8FMkjQebQ1NstIMRYo,6747
 tensorflow_datasets/core/utils/conversion_utils_test.py,sha256=rP_nbzQWzmZc_GXp3Y6TirwIGJqiQbF-JtY3B1tOuN0,5346
-tensorflow_datasets/core/utils/croissant_utils.py,sha256=9-_j86KKKkfxgg0aAM1zxlqCdkaC-0p9XzdWjSLmOwk,6265
-tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=UdkAVYDTPm1L0zmMESScurV_IMA5K3qAKmL_umeMJZI,4497
+tensorflow_datasets/core/utils/croissant_utils.py,sha256=Fxx5Zeti24mMQ4BZst4W28dhxonSr1NhHGVn3W1N8j8,7986
+tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=ftyUNMIkzZZB10VlNA2gS7oclLhC4eGTrGJURgzQjwM,8710
 tensorflow_datasets/core/utils/docs.py,sha256=nRE4d8wxYZav8AcT3dkiY0yplAJBx1hygWxkeKj_V7I,1412
 tensorflow_datasets/core/utils/dtype_utils.py,sha256=LvDe1hbgQem57RiqXjG9U5Roj8-1KkBMmSYTtgctx2U,3246
 tensorflow_datasets/core/utils/dtype_utils_test.py,sha256=-Qe2fQzDO5sjS36ZL-dY9w0tNrJXokIoSRFEQCv5dQA,3259
@@ -2471,10 +2471,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
 tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
-tfds_nightly-4.9.9.dev202508260044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
-tfds_nightly-4.9.9.dev202508260044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-tfds_nightly-4.9.9.dev202508260044.dist-info/METADATA,sha256=OlIMhl94mKrf1q3B2umMYXvVuSZqZtNXZfwqnqGm3-0,11291
-tfds_nightly-4.9.9.dev202508260044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-tfds_nightly-4.9.9.dev202508260044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
-tfds_nightly-4.9.9.dev202508260044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
-tfds_nightly-4.9.9.dev202508260044.dist-info/RECORD,,
+tfds_nightly-4.9.9.dev202508280044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
+tfds_nightly-4.9.9.dev202508280044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+tfds_nightly-4.9.9.dev202508280044.dist-info/METADATA,sha256=NYDGgvfbautnlGWNWg2_8pOq9eblXgaPjGU-2__DPco,11291
+tfds_nightly-4.9.9.dev202508280044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tfds_nightly-4.9.9.dev202508280044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
+tfds_nightly-4.9.9.dev202508280044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
+tfds_nightly-4.9.9.dev202508280044.dist-info/RECORD,,

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/WHEEL RENAMED Viewed

File without changes

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/licenses/AUTHORS RENAMED Viewed

File without changes

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{tfds_nightly-4.9.9.dev202508260044.dist-info → tfds_nightly-4.9.9.dev202508280044.dist-info}/top_level.txt RENAMED Viewed

File without changes

tfds-nightly 4.9.9.dev202508260044__py3-none-any.whl → 4.9.9.dev202508280044__py3-none-any.whl

tfds-nightly 4.9.9.dev202508260044__py3-none-any.whl → 4.9.9.dev202508280044__py3-none-any.whl