tfds-nightly 4.9.8.dev202505210044__py3-none-any.whl → 4.9.8.dev202505220044__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -282,11 +282,17 @@ class CroissantBuilder(
282
282
  self.name = croissant_utils.get_tfds_dataset_name(dataset)
283
283
  self.metadata = dataset.metadata
284
284
 
285
- # In TFDS, version is a mandatory attribute, while in Croissant it is only a
286
- # recommended attribute. If the version is unspecified in Croissant, we set
287
- # it to `1.0.0` in TFDS.
285
+ # The dataset version is determined using the following precedence:
286
+ # * overwrite_version (if provided).
287
+ # * The version from Croissant metadata (self.metadata.version),
288
+ # automatically converting major.minor formats to major.minor.0 (e.g., "1.2"
289
+ # becomes "1.2.0"). See croissant_utils.get_croissant_version for details.
290
+ # * Defaults to '1.0.0' if no version is specified (version is optional in
291
+ # Croissant, but mandatory in TFDS).
288
292
  self.VERSION = version_lib.Version( # pylint: disable=invalid-name
289
- overwrite_version or self.metadata.version or '1.0.0'
293
+ overwrite_version
294
+ or croissant_utils.get_croissant_version(self.metadata.version)
295
+ or '1.0.0'
290
296
  )
291
297
  self.RELEASE_NOTES = {} # pylint: disable=invalid-name
292
298
 
@@ -249,6 +249,17 @@ def test_sequence_feature_datatype_converter():
249
249
  assert isinstance(actual_feature.feature, text_feature.Text)
250
250
 
251
251
 
252
+ def test_version_converter(tmp_path):
253
+ with testing.dummy_croissant_file(version="1.0") as croissant_file:
254
+ builder = croissant_builder.CroissantBuilder(
255
+ jsonld=croissant_file,
256
+ file_format=FileFormat.ARRAY_RECORD,
257
+ disable_shuffling=True,
258
+ data_dir=tmp_path,
259
+ )
260
+ assert builder.version == "1.0.0"
261
+
262
+
252
263
  @pytest.fixture(name="crs_builder")
253
264
  def mock_croissant_dataset_builder(tmp_path, request):
254
265
  dataset_name = request.param["dataset_name"]
@@ -18,6 +18,7 @@
18
18
  from __future__ import annotations
19
19
 
20
20
  import dataclasses
21
+ import re
21
22
  import typing
22
23
 
23
24
  from tensorflow_datasets.core.utils import conversion_utils
@@ -28,6 +29,7 @@ if typing.TYPE_CHECKING:
28
29
  import mlcroissant as mlc
29
30
 
30
31
  _HUGGINGFACE_URL_PREFIX = "https://huggingface.co/datasets/"
32
+ _VERSION_REGEX_WITHOUT_PATCH = re.compile(r"^(?P<major>\d+)\.(?P<minor>\d+)$")
31
33
 
32
34
 
33
35
  @dataclasses.dataclass(frozen=True)
@@ -40,6 +42,27 @@ class SplitReference:
40
42
  reference_field: mlc.Field
41
43
 
42
44
 
45
+ def get_croissant_version(version: str | None) -> str | None:
46
+ """Returns the possibly corrected Croissant version in TFDS format.
47
+
48
+ TFDS expects versions to follow the Semantic Versioning 2.0.0 syntax, but
49
+ Croissant is more lax and also accepts {major.minor}. To avoid raising errors
50
+ in these cases, we add a `0` as a patch version to the Croissant-provided
51
+ version.
52
+
53
+ Args:
54
+ version: The Croissant version.
55
+
56
+ Returns:
57
+ The Croissant version in TFDS format, or `None` if it is missing or empty.
58
+ """
59
+ if not version:
60
+ return None
61
+ if _VERSION_REGEX_WITHOUT_PATCH.match(version):
62
+ return f"{version}.0"
63
+ return version
64
+
65
+
43
66
  def get_dataset_name(dataset: mlc.Dataset) -> str:
44
67
  """Returns dataset name of the given MLcroissant dataset."""
45
68
  if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
@@ -36,6 +36,22 @@ def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
36
36
  assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
37
37
 
38
38
 
39
+ @pytest.mark.parametrize(
40
+ 'croissant_version,tfds_version',
41
+ [
42
+ ('1.0', '1.0.0'),
43
+ ('1.2', '1.2.0'),
44
+ ('1.2.3', '1.2.3'),
45
+ ('1.2.3.4', '1.2.3.4'),
46
+ (None, None),
47
+ ],
48
+ )
49
+ def test_get_croissant_version(croissant_version, tfds_version):
50
+ assert (
51
+ croissant_utils.get_croissant_version(croissant_version) == tfds_version
52
+ )
53
+
54
+
39
55
  def test_get_record_set_ids():
40
56
  metadata = mlc.Metadata(
41
57
  name='dummy_dataset',
@@ -724,6 +724,7 @@ def dummy_croissant_file(
724
724
  raw_data_filename: epath.PathLike = 'raw_data.jsonl',
725
725
  croissant_filename: epath.PathLike = 'croissant.json',
726
726
  split_names: Sequence[str] | None = None,
727
+ version: str = '1.2.0',
727
728
  ) -> Iterator[epath.Path]:
728
729
  """Yields temporary path to a dummy Croissant file.
729
730
 
@@ -746,6 +747,7 @@ def dummy_croissant_file(
746
747
  If None, the function will create a split record set with the default
747
748
  split names `train` and `test`. If `split_names` is defined, the `split`
748
749
  key in the entries must match one of the split names.
750
+ version: The version of the dataset. Defaults to `1.2.0`.
749
751
  """
750
752
  if entries is None:
751
753
  entries = [
@@ -874,7 +876,7 @@ def dummy_croissant_file(
874
876
  url='https://dummy_url',
875
877
  distribution=distribution,
876
878
  record_sets=record_sets,
877
- version='1.2.0',
879
+ version=version,
878
880
  license='Public',
879
881
  )
880
882
  # Write Croissant JSON-LD to tempdir.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tfds-nightly
3
- Version: 4.9.8.dev202505210044
3
+ Version: 4.9.8.dev202505220044
4
4
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
5
5
  Home-page: https://github.com/tensorflow/datasets
6
6
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -141,8 +141,8 @@ tensorflow_datasets/core/data_sources/python_test.py,sha256=Rg_ui2fA75532ma134VU
141
141
  tensorflow_datasets/core/dataset_builders/__init__.py,sha256=KULxN1WzBvUwTw3K3SHFM-sE4wbm9UC6cX0tOm0jEZg,1945
142
142
  tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=fS6OxRtI3DKdcsniJagTHYfopuoFs55cpW0_gPbhnkY,9216
143
143
  tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=PAQXfp_qGs4KaAzYbTMm8y87HvnCUca035Jrafjq3zw,13096
144
- tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=-ulqR9Z89WzQESqjJ2i5AjZZ0wOrwktZR_82gFyq55g,15586
145
- tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=S_dVDe_fUt-fIRTh8e7beJtLbUMcc5_OBIQ-Ji1OpUg,10199
144
+ tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=1_-GgOu_5ONlInvjZZFGGR44mcxlp_ZdVX9ttC83Z-s,15913
145
+ tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=Jc1EZI357oS1Vfszgy8E9uLMNoFh-kzQjcAz0g1OTN8,10538
146
146
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=si5OoTPeftUyPj9w5OuYKjwfx1ByZghBo6vfEKQQGhA,18313
147
147
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=UCvcr6DRwcWLu7x3o_PDyLJBI3r84ylfYMCO3qhD-8w,4389
148
148
  tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=HhLOqlkWwOSvr_8wzwwTRapgXY8yUg_bbBGasTb3cQw,11907
@@ -245,8 +245,8 @@ tensorflow_datasets/core/utils/bool_utils_test.py,sha256=4x3vqQvc8PtC-ptrUpu5TWx
245
245
  tensorflow_datasets/core/utils/colormap.csv,sha256=DDayUU9R19cxhcG3fj4cFwhI46W20U7ofBG0kToUHOw,2732
246
246
  tensorflow_datasets/core/utils/conversion_utils.py,sha256=f_ZW71wx_dyg5LJlmuaU_TtiApfC2IqsG6fsNR8SOQc,6340
247
247
  tensorflow_datasets/core/utils/conversion_utils_test.py,sha256=Ipjqju_81kweBgqOYjoHErRxBWSbskQhSs6HTxRtlso,5043
248
- tensorflow_datasets/core/utils/croissant_utils.py,sha256=-hPQQjNQ0f4zG7qKBG38ak4OfYZCZsJUiPkuMYb5ISw,4482
249
- tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=R5lQnwTXml9hSOrHwTav7CvAZQvAP3EaRxQJJ3MXOlk,4123
248
+ tensorflow_datasets/core/utils/croissant_utils.py,sha256=WVmrKqmjYf1u6BOQWMs0yGic85aVxDOZtIPbcDsAL1o,5181
249
+ tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=QbfAv-Ktrib4ARsR7fIoMXTsezfK1UAsgOQWTocjtaA,4497
250
250
  tensorflow_datasets/core/utils/docs.py,sha256=e8sua3eiqkDsPJUbYrgp_FUwnnHwMyMzTMNWKMHn464,1549
251
251
  tensorflow_datasets/core/utils/dtype_utils.py,sha256=E3Zwe_Hv6BiN_qv8VI4I1oHFd3MVgs2NCcimXMoahxI,3246
252
252
  tensorflow_datasets/core/utils/dtype_utils_test.py,sha256=LAE2idzXz0BiNHELm3rmCdhwKIstkWj8ZRmztslcLi0,3259
@@ -2124,7 +2124,7 @@ tensorflow_datasets/testing/mocking.py,sha256=52uOXhIEZM-Aet6fRAW13REVSmq7mWT_kA
2124
2124
  tensorflow_datasets/testing/mocking_test.py,sha256=OZQAAzOQLOEwGJaNZhEYg0Dn0Vi9a8xKswjbSWBeYao,13796
2125
2125
  tensorflow_datasets/testing/test_case.py,sha256=5BHRT99KJWx7Zzn8fkgsq09h2lkrPONyWEZdt_S9aGY,2554
2126
2126
  tensorflow_datasets/testing/test_case_in_context.py,sha256=Yf4ehrvtCnxR4HKN1c8Qi6prURWfLuxvwQieKfnMD8g,1872
2127
- tensorflow_datasets/testing/test_utils.py,sha256=UlbaYTVRITXWyztCwbGcw5vezEo1McXnGFDw53W0Zwo,26646
2127
+ tensorflow_datasets/testing/test_utils.py,sha256=FouQSpUAuE-ZmiA0QHy_VuQjdabrT_gi7F2odjB4m04,26736
2128
2128
  tensorflow_datasets/testing/test_utils_test.py,sha256=KKKlkXRtp-FidZ_B1taONT-sS46nyzTNcUDdtGS0FMo,9576
2129
2129
  tensorflow_datasets/testing/version_test.py,sha256=PsD-paPb4DkrMvOdskmRvqyXXDp9hOQFDPYoQ9k2osA,2791
2130
2130
  tensorflow_datasets/text/__init__.py,sha256=IvEJrcFmicw-ptk5p3aZkJnXOnpCW61u-k2NBgNgQX8,5319
@@ -2460,10 +2460,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=DJ687VN9hAp6SLXnr_P12
2460
2460
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=38tJQ73VHz8vOJn-AyZh2we2YJucbSRIgmgcrsC6bQM,719
2461
2461
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=k-7YD1SGr5bASfdR2_09rrqz-8cpWdIcBWWEXhCvzuk,16903
2462
2462
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=vLlluBW77ASNVC4ix7t8idkSUBI6q1-B7zmRV_ICCQM,1778
2463
- tfds_nightly-4.9.8.dev202505210044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2464
- tfds_nightly-4.9.8.dev202505210044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2465
- tfds_nightly-4.9.8.dev202505210044.dist-info/METADATA,sha256=dpem5js_Zav1R-lDwja__6ikFBPzJl3VbvKKl1tGNAk,11963
2466
- tfds_nightly-4.9.8.dev202505210044.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
2467
- tfds_nightly-4.9.8.dev202505210044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2468
- tfds_nightly-4.9.8.dev202505210044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2469
- tfds_nightly-4.9.8.dev202505210044.dist-info/RECORD,,
2463
+ tfds_nightly-4.9.8.dev202505220044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2464
+ tfds_nightly-4.9.8.dev202505220044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2465
+ tfds_nightly-4.9.8.dev202505220044.dist-info/METADATA,sha256=hG0sC5hrzN2O7iYLzt9_ZLcwzz2Dv3xT6IKvGjnhReM,11963
2466
+ tfds_nightly-4.9.8.dev202505220044.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
2467
+ tfds_nightly-4.9.8.dev202505220044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2468
+ tfds_nightly-4.9.8.dev202505220044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2469
+ tfds_nightly-4.9.8.dev202505220044.dist-info/RECORD,,