tfds-nightly 4.9.8.dev202505200044__py3-none-any.whl → 4.9.8.dev202505220044__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -270,6 +270,9 @@ class CroissantBuilder(
270
270
  the values should be the values to filter by. If a record matches all
271
271
  the filters, it will be included in the dataset.
272
272
  **kwargs: kwargs to pass to GeneratorBasedBuilder directly.
273
+
274
+ Raises:
275
+ ValueError: If no record sets are found in the Croissant JSON-LD.
273
276
  """
274
277
  if mapping is None:
275
278
  mapping = {}
@@ -279,11 +282,17 @@ class CroissantBuilder(
279
282
  self.name = croissant_utils.get_tfds_dataset_name(dataset)
280
283
  self.metadata = dataset.metadata
281
284
 
282
- # In TFDS, version is a mandatory attribute, while in Croissant it is only a
283
- # recommended attribute. If the version is unspecified in Croissant, we set
284
- # it to `1.0.0` in TFDS.
285
+ # The dataset version is determined using the following precedence:
286
+ # * overwrite_version (if provided).
287
+ # * The version from Croissant metadata (self.metadata.version),
288
+ # automatically converting major.minor formats to major.minor.0 (e.g., "1.2"
289
+ # becomes "1.2.0"). See croissant_utils.get_croissant_version for details.
290
+ # * Defaults to '1.0.0' if no version is specified (version is optional in
291
+ # Croissant, but mandatory in TFDS).
285
292
  self.VERSION = version_lib.Version( # pylint: disable=invalid-name
286
- overwrite_version or self.metadata.version or '1.0.0'
293
+ overwrite_version
294
+ or croissant_utils.get_croissant_version(self.metadata.version)
295
+ or '1.0.0'
287
296
  )
288
297
  self.RELEASE_NOTES = {} # pylint: disable=invalid-name
289
298
 
@@ -293,6 +302,12 @@ class CroissantBuilder(
293
302
  conversion_utils.to_tfds_name(record_set_id)
294
303
  for record_set_id in record_set_ids
295
304
  ]
305
+ if not config_names:
306
+ raise ValueError(
307
+ 'No record sets found in the Croissant JSON-LD. At least one record'
308
+ ' set is required to be able to download and prepare the dataset.'
309
+ )
310
+
296
311
  self.BUILDER_CONFIGS: list[dataset_builder.BuilderConfig] = [ # pylint: disable=invalid-name
297
312
  dataset_builder.BuilderConfig(name=config_name)
298
313
  for config_name in config_names
@@ -249,6 +249,17 @@ def test_sequence_feature_datatype_converter():
249
249
  assert isinstance(actual_feature.feature, text_feature.Text)
250
250
 
251
251
 
252
+ def test_version_converter(tmp_path):
253
+ with testing.dummy_croissant_file(version="1.0") as croissant_file:
254
+ builder = croissant_builder.CroissantBuilder(
255
+ jsonld=croissant_file,
256
+ file_format=FileFormat.ARRAY_RECORD,
257
+ disable_shuffling=True,
258
+ data_dir=tmp_path,
259
+ )
260
+ assert builder.version == "1.0.0"
261
+
262
+
252
263
  @pytest.fixture(name="crs_builder")
253
264
  def mock_croissant_dataset_builder(tmp_path, request):
254
265
  dataset_name = request.param["dataset_name"]
@@ -18,6 +18,7 @@
18
18
  from __future__ import annotations
19
19
 
20
20
  import dataclasses
21
+ import re
21
22
  import typing
22
23
 
23
24
  from tensorflow_datasets.core.utils import conversion_utils
@@ -28,6 +29,7 @@ if typing.TYPE_CHECKING:
28
29
  import mlcroissant as mlc
29
30
 
30
31
  _HUGGINGFACE_URL_PREFIX = "https://huggingface.co/datasets/"
32
+ _VERSION_REGEX_WITHOUT_PATCH = re.compile(r"^(?P<major>\d+)\.(?P<minor>\d+)$")
31
33
 
32
34
 
33
35
  @dataclasses.dataclass(frozen=True)
@@ -40,6 +42,27 @@ class SplitReference:
40
42
  reference_field: mlc.Field
41
43
 
42
44
 
45
+ def get_croissant_version(version: str | None) -> str | None:
46
+ """Returns the possibly corrected Croissant version in TFDS format.
47
+
48
+ TFDS expects versions to follow the Semantic versioning 2.0.0 syntax, but
49
+ Croissant is more lax and also accepts {major.minor}. To avoid raising errors
50
+ in these cases, we add a `0` as a patch version to the Croissant-provided
51
+ version.
52
+
53
+ Args:
54
+ version: The Croissant version.
55
+
56
+ Returns:
57
+ The Croissant version in TFDS format.
58
+ """
59
+ if not version:
60
+ return None
61
+ if _VERSION_REGEX_WITHOUT_PATCH.match(version):
62
+ return f"{version}.0"
63
+ return version
64
+
65
+
43
66
  def get_dataset_name(dataset: mlc.Dataset) -> str:
44
67
  """Returns dataset name of the given MLcroissant dataset."""
45
68
  if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
@@ -36,6 +36,22 @@ def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
36
36
  assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
37
37
 
38
38
 
39
+ @pytest.mark.parametrize(
40
+ 'croissant_version,tfds_version',
41
+ [
42
+ ('1.0', '1.0.0'),
43
+ ('1.2', '1.2.0'),
44
+ ('1.2.3', '1.2.3'),
45
+ ('1.2.3.4', '1.2.3.4'),
46
+ (None, None),
47
+ ],
48
+ )
49
+ def test_get_croissant_version(croissant_version, tfds_version):
50
+ assert (
51
+ croissant_utils.get_croissant_version(croissant_version) == tfds_version
52
+ )
53
+
54
+
39
55
  def test_get_record_set_ids():
40
56
  metadata = mlc.Metadata(
41
57
  name='dummy_dataset',
@@ -724,6 +724,7 @@ def dummy_croissant_file(
724
724
  raw_data_filename: epath.PathLike = 'raw_data.jsonl',
725
725
  croissant_filename: epath.PathLike = 'croissant.json',
726
726
  split_names: Sequence[str] | None = None,
727
+ version: str = '1.2.0',
727
728
  ) -> Iterator[epath.Path]:
728
729
  """Yields temporary path to a dummy Croissant file.
729
730
 
@@ -746,6 +747,7 @@ def dummy_croissant_file(
746
747
  If None, the function will create a split record set with the default
747
748
  split names `train` and `test`. If `split_names` is defined, the `split`
748
749
  key in the entries must match one of the split names.
750
+ version: The version of the dataset. Defaults to `1.2.0`.
749
751
  """
750
752
  if entries is None:
751
753
  entries = [
@@ -874,7 +876,7 @@ def dummy_croissant_file(
874
876
  url='https://dummy_url',
875
877
  distribution=distribution,
876
878
  record_sets=record_sets,
877
- version='1.2.0',
879
+ version=version,
878
880
  license='Public',
879
881
  )
880
882
  # Write Croissant JSON-LD to tempdir.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tfds-nightly
3
- Version: 4.9.8.dev202505200044
3
+ Version: 4.9.8.dev202505220044
4
4
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
5
5
  Home-page: https://github.com/tensorflow/datasets
6
6
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -52,7 +52,7 @@ Requires-Dist: jupyter; extra == "tests-all"
52
52
  Requires-Dist: pytest; extra == "tests-all"
53
53
  Requires-Dist: pytest-shard; extra == "tests-all"
54
54
  Requires-Dist: pytest-xdist; extra == "tests-all"
55
- Requires-Dist: apache-beam; extra == "tests-all"
55
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
56
56
  Requires-Dist: conllu; extra == "tests-all"
57
57
  Requires-Dist: mlcroissant>=1.0.9; extra == "tests-all"
58
58
  Requires-Dist: pandas; extra == "tests-all"
@@ -60,15 +60,15 @@ Requires-Dist: pydub; extra == "tests-all"
60
60
  Requires-Dist: pyyaml; extra == "tests-all"
61
61
  Requires-Dist: tensorflow-io[tensorflow]; python_version < "3.12" and extra == "tests-all"
62
62
  Requires-Dist: scipy; extra == "tests-all"
63
- Requires-Dist: apache-beam; extra == "tests-all"
63
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
64
64
  Requires-Dist: gcsfs; extra == "tests-all"
65
65
  Requires-Dist: zarr<3.0.0; extra == "tests-all"
66
- Requires-Dist: apache-beam; extra == "tests-all"
66
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
67
67
  Requires-Dist: gcld3; extra == "tests-all"
68
68
  Requires-Dist: langdetect; extra == "tests-all"
69
69
  Requires-Dist: nltk==3.8.1; extra == "tests-all"
70
70
  Requires-Dist: tldextract; extra == "tests-all"
71
- Requires-Dist: apache-beam; extra == "tests-all"
71
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
72
72
  Requires-Dist: matplotlib; extra == "tests-all"
73
73
  Requires-Dist: Pillow; extra == "tests-all"
74
74
  Requires-Dist: pydub; extra == "tests-all"
@@ -96,8 +96,8 @@ Requires-Dist: scipy; extra == "tests-all"
96
96
  Requires-Dist: scipy; extra == "tests-all"
97
97
  Requires-Dist: pandas; extra == "tests-all"
98
98
  Requires-Dist: Pillow; extra == "tests-all"
99
- Requires-Dist: apache-beam; extra == "tests-all"
100
- Requires-Dist: apache-beam; extra == "tests-all"
99
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
100
+ Requires-Dist: apache-beam<2.65.0; extra == "tests-all"
101
101
  Requires-Dist: mwparserfromhell; extra == "tests-all"
102
102
  Requires-Dist: mwxml; extra == "tests-all"
103
103
  Requires-Dist: bs4; extra == "tests-all"
@@ -110,7 +110,7 @@ Requires-Dist: jupyter; extra == "dev"
110
110
  Requires-Dist: pytest; extra == "dev"
111
111
  Requires-Dist: pytest-shard; extra == "dev"
112
112
  Requires-Dist: pytest-xdist; extra == "dev"
113
- Requires-Dist: apache-beam; extra == "dev"
113
+ Requires-Dist: apache-beam<2.65.0; extra == "dev"
114
114
  Requires-Dist: conllu; extra == "dev"
115
115
  Requires-Dist: mlcroissant>=1.0.9; extra == "dev"
116
116
  Requires-Dist: pandas; extra == "dev"
@@ -175,18 +175,18 @@ Requires-Dist: datasets; extra == "huggingface"
175
175
  Provides-Extra: aflw2k3d
176
176
  Requires-Dist: scipy; extra == "aflw2k3d"
177
177
  Provides-Extra: beir
178
- Requires-Dist: apache-beam; extra == "beir"
178
+ Requires-Dist: apache-beam<2.65.0; extra == "beir"
179
179
  Provides-Extra: ble-wind-field
180
180
  Requires-Dist: gcsfs; extra == "ble-wind-field"
181
181
  Requires-Dist: zarr<3.0.0; extra == "ble-wind-field"
182
182
  Provides-Extra: c4
183
- Requires-Dist: apache-beam; extra == "c4"
183
+ Requires-Dist: apache-beam<2.65.0; extra == "c4"
184
184
  Requires-Dist: gcld3; extra == "c4"
185
185
  Requires-Dist: langdetect; extra == "c4"
186
186
  Requires-Dist: nltk==3.8.1; extra == "c4"
187
187
  Requires-Dist: tldextract; extra == "c4"
188
188
  Provides-Extra: c4-wsrs
189
- Requires-Dist: apache-beam; extra == "c4-wsrs"
189
+ Requires-Dist: apache-beam<2.65.0; extra == "c4-wsrs"
190
190
  Provides-Extra: cats-vs-dogs
191
191
  Requires-Dist: matplotlib; extra == "cats-vs-dogs"
192
192
  Provides-Extra: colorectal-histology
@@ -242,9 +242,9 @@ Requires-Dist: pandas; extra == "wake-vision"
242
242
  Provides-Extra: wider-face
243
243
  Requires-Dist: Pillow; extra == "wider-face"
244
244
  Provides-Extra: wiki-dialog
245
- Requires-Dist: apache-beam; extra == "wiki-dialog"
245
+ Requires-Dist: apache-beam<2.65.0; extra == "wiki-dialog"
246
246
  Provides-Extra: wikipedia
247
- Requires-Dist: apache-beam; extra == "wikipedia"
247
+ Requires-Dist: apache-beam<2.65.0; extra == "wikipedia"
248
248
  Requires-Dist: mwparserfromhell; extra == "wikipedia"
249
249
  Requires-Dist: mwxml; extra == "wikipedia"
250
250
  Provides-Extra: wsc273
@@ -141,8 +141,8 @@ tensorflow_datasets/core/data_sources/python_test.py,sha256=Rg_ui2fA75532ma134VU
141
141
  tensorflow_datasets/core/dataset_builders/__init__.py,sha256=KULxN1WzBvUwTw3K3SHFM-sE4wbm9UC6cX0tOm0jEZg,1945
142
142
  tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=fS6OxRtI3DKdcsniJagTHYfopuoFs55cpW0_gPbhnkY,9216
143
143
  tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=PAQXfp_qGs4KaAzYbTMm8y87HvnCUca035Jrafjq3zw,13096
144
- tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=g3LZEeyozAdjgKd6mlAOdBa6C6VBEeu1vnikWUYXxzc,15287
145
- tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=S_dVDe_fUt-fIRTh8e7beJtLbUMcc5_OBIQ-Ji1OpUg,10199
144
+ tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=1_-GgOu_5ONlInvjZZFGGR44mcxlp_ZdVX9ttC83Z-s,15913
145
+ tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=Jc1EZI357oS1Vfszgy8E9uLMNoFh-kzQjcAz0g1OTN8,10538
146
146
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=si5OoTPeftUyPj9w5OuYKjwfx1ByZghBo6vfEKQQGhA,18313
147
147
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=UCvcr6DRwcWLu7x3o_PDyLJBI3r84ylfYMCO3qhD-8w,4389
148
148
  tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=HhLOqlkWwOSvr_8wzwwTRapgXY8yUg_bbBGasTb3cQw,11907
@@ -245,8 +245,8 @@ tensorflow_datasets/core/utils/bool_utils_test.py,sha256=4x3vqQvc8PtC-ptrUpu5TWx
245
245
  tensorflow_datasets/core/utils/colormap.csv,sha256=DDayUU9R19cxhcG3fj4cFwhI46W20U7ofBG0kToUHOw,2732
246
246
  tensorflow_datasets/core/utils/conversion_utils.py,sha256=f_ZW71wx_dyg5LJlmuaU_TtiApfC2IqsG6fsNR8SOQc,6340
247
247
  tensorflow_datasets/core/utils/conversion_utils_test.py,sha256=Ipjqju_81kweBgqOYjoHErRxBWSbskQhSs6HTxRtlso,5043
248
- tensorflow_datasets/core/utils/croissant_utils.py,sha256=-hPQQjNQ0f4zG7qKBG38ak4OfYZCZsJUiPkuMYb5ISw,4482
249
- tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=R5lQnwTXml9hSOrHwTav7CvAZQvAP3EaRxQJJ3MXOlk,4123
248
+ tensorflow_datasets/core/utils/croissant_utils.py,sha256=WVmrKqmjYf1u6BOQWMs0yGic85aVxDOZtIPbcDsAL1o,5181
249
+ tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=QbfAv-Ktrib4ARsR7fIoMXTsezfK1UAsgOQWTocjtaA,4497
250
250
  tensorflow_datasets/core/utils/docs.py,sha256=e8sua3eiqkDsPJUbYrgp_FUwnnHwMyMzTMNWKMHn464,1549
251
251
  tensorflow_datasets/core/utils/dtype_utils.py,sha256=E3Zwe_Hv6BiN_qv8VI4I1oHFd3MVgs2NCcimXMoahxI,3246
252
252
  tensorflow_datasets/core/utils/dtype_utils_test.py,sha256=LAE2idzXz0BiNHELm3rmCdhwKIstkWj8ZRmztslcLi0,3259
@@ -2124,7 +2124,7 @@ tensorflow_datasets/testing/mocking.py,sha256=52uOXhIEZM-Aet6fRAW13REVSmq7mWT_kA
2124
2124
  tensorflow_datasets/testing/mocking_test.py,sha256=OZQAAzOQLOEwGJaNZhEYg0Dn0Vi9a8xKswjbSWBeYao,13796
2125
2125
  tensorflow_datasets/testing/test_case.py,sha256=5BHRT99KJWx7Zzn8fkgsq09h2lkrPONyWEZdt_S9aGY,2554
2126
2126
  tensorflow_datasets/testing/test_case_in_context.py,sha256=Yf4ehrvtCnxR4HKN1c8Qi6prURWfLuxvwQieKfnMD8g,1872
2127
- tensorflow_datasets/testing/test_utils.py,sha256=UlbaYTVRITXWyztCwbGcw5vezEo1McXnGFDw53W0Zwo,26646
2127
+ tensorflow_datasets/testing/test_utils.py,sha256=FouQSpUAuE-ZmiA0QHy_VuQjdabrT_gi7F2odjB4m04,26736
2128
2128
  tensorflow_datasets/testing/test_utils_test.py,sha256=KKKlkXRtp-FidZ_B1taONT-sS46nyzTNcUDdtGS0FMo,9576
2129
2129
  tensorflow_datasets/testing/version_test.py,sha256=PsD-paPb4DkrMvOdskmRvqyXXDp9hOQFDPYoQ9k2osA,2791
2130
2130
  tensorflow_datasets/text/__init__.py,sha256=IvEJrcFmicw-ptk5p3aZkJnXOnpCW61u-k2NBgNgQX8,5319
@@ -2460,10 +2460,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=DJ687VN9hAp6SLXnr_P12
2460
2460
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=38tJQ73VHz8vOJn-AyZh2we2YJucbSRIgmgcrsC6bQM,719
2461
2461
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=k-7YD1SGr5bASfdR2_09rrqz-8cpWdIcBWWEXhCvzuk,16903
2462
2462
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=vLlluBW77ASNVC4ix7t8idkSUBI6q1-B7zmRV_ICCQM,1778
2463
- tfds_nightly-4.9.8.dev202505200044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2464
- tfds_nightly-4.9.8.dev202505200044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2465
- tfds_nightly-4.9.8.dev202505200044.dist-info/METADATA,sha256=_W0gTuzS5sXNQIVzrwiSY16_J96yk3DHQuxCgKGhauQ,11879
2466
- tfds_nightly-4.9.8.dev202505200044.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
2467
- tfds_nightly-4.9.8.dev202505200044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2468
- tfds_nightly-4.9.8.dev202505200044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2469
- tfds_nightly-4.9.8.dev202505200044.dist-info/RECORD,,
2463
+ tfds_nightly-4.9.8.dev202505220044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2464
+ tfds_nightly-4.9.8.dev202505220044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2465
+ tfds_nightly-4.9.8.dev202505220044.dist-info/METADATA,sha256=hG0sC5hrzN2O7iYLzt9_ZLcwzz2Dv3xT6IKvGjnhReM,11963
2466
+ tfds_nightly-4.9.8.dev202505220044.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
2467
+ tfds_nightly-4.9.8.dev202505220044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2468
+ tfds_nightly-4.9.8.dev202505220044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2469
+ tfds_nightly-4.9.8.dev202505220044.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.7.1)
2
+ Generator: setuptools (80.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5