tfds-nightly 4.9.8.dev202505210044__py3-none-any.whl → 4.9.8.dev202505230044__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- tensorflow_datasets/core/dataset_builders/croissant_builder.py +10 -4
- tensorflow_datasets/core/dataset_builders/croissant_builder_test.py +11 -0
- tensorflow_datasets/core/utils/croissant_utils.py +23 -0
- tensorflow_datasets/core/utils/croissant_utils_test.py +16 -0
- tensorflow_datasets/testing/test_utils.py +3 -1
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/METADATA +1 -1
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/RECORD +12 -12
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/WHEEL +0 -0
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/entry_points.txt +0 -0
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/licenses/AUTHORS +0 -0
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/licenses/LICENSE +0 -0
- {tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/top_level.txt +0 -0
@@ -282,11 +282,17 @@ class CroissantBuilder(
|
|
282
282
|
self.name = croissant_utils.get_tfds_dataset_name(dataset)
|
283
283
|
self.metadata = dataset.metadata
|
284
284
|
|
285
|
-
#
|
286
|
-
#
|
287
|
-
#
|
285
|
+
# The dataset version is determined using the following precedence:
|
286
|
+
# * overwrite_version (if provided).
|
287
|
+
# * The version from Croissant metadata (self.metadata.version),
|
288
|
+
# automatically converting major.minor formats to major.minor.0 (e.g., "1.2"
|
289
|
+
# becomes "1.2.0"). See croissant_utils.get_croissant_version for details.
|
290
|
+
# * Defaults to '1.0.0' if no version is specified (version is optional in
|
291
|
+
# Croissant, but mandatory in TFDS).
|
288
292
|
self.VERSION = version_lib.Version( # pylint: disable=invalid-name
|
289
|
-
overwrite_version
|
293
|
+
overwrite_version
|
294
|
+
or croissant_utils.get_croissant_version(self.metadata.version)
|
295
|
+
or '1.0.0'
|
290
296
|
)
|
291
297
|
self.RELEASE_NOTES = {} # pylint: disable=invalid-name
|
292
298
|
|
@@ -249,6 +249,17 @@ def test_sequence_feature_datatype_converter():
|
|
249
249
|
assert isinstance(actual_feature.feature, text_feature.Text)
|
250
250
|
|
251
251
|
|
252
|
+
def test_version_converter(tmp_path):
|
253
|
+
with testing.dummy_croissant_file(version="1.0") as croissant_file:
|
254
|
+
builder = croissant_builder.CroissantBuilder(
|
255
|
+
jsonld=croissant_file,
|
256
|
+
file_format=FileFormat.ARRAY_RECORD,
|
257
|
+
disable_shuffling=True,
|
258
|
+
data_dir=tmp_path,
|
259
|
+
)
|
260
|
+
assert builder.version == "1.0.0"
|
261
|
+
|
262
|
+
|
252
263
|
@pytest.fixture(name="crs_builder")
|
253
264
|
def mock_croissant_dataset_builder(tmp_path, request):
|
254
265
|
dataset_name = request.param["dataset_name"]
|
@@ -18,6 +18,7 @@
|
|
18
18
|
from __future__ import annotations
|
19
19
|
|
20
20
|
import dataclasses
|
21
|
+
import re
|
21
22
|
import typing
|
22
23
|
|
23
24
|
from tensorflow_datasets.core.utils import conversion_utils
|
@@ -28,6 +29,7 @@ if typing.TYPE_CHECKING:
|
|
28
29
|
import mlcroissant as mlc
|
29
30
|
|
30
31
|
_HUGGINGFACE_URL_PREFIX = "https://huggingface.co/datasets/"
|
32
|
+
_VERSION_REGEX_WITHOUT_PATCH = re.compile(r"^(?P<major>\d+)\.(?P<minor>\d+)$")
|
31
33
|
|
32
34
|
|
33
35
|
@dataclasses.dataclass(frozen=True)
|
@@ -40,6 +42,27 @@ class SplitReference:
|
|
40
42
|
reference_field: mlc.Field
|
41
43
|
|
42
44
|
|
45
|
+
def get_croissant_version(version: str | None) -> str | None:
|
46
|
+
"""Returns the possibly corrected Croissant version in TFDS format.
|
47
|
+
|
48
|
+
TFDS expects versions to follow the Semantic versioning 2.0.0 syntax, but
|
49
|
+
Croissant is more lax and accepts also {major.minor}. To avoid raising errors
|
50
|
+
in these cases, we add a `0` as a patch version to the Croissant-provided
|
51
|
+
version.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
version: The Croissant version.
|
55
|
+
|
56
|
+
Returns:
|
57
|
+
The Croissant version in TFDS format.
|
58
|
+
"""
|
59
|
+
if not version:
|
60
|
+
return None
|
61
|
+
if _VERSION_REGEX_WITHOUT_PATCH.match(version):
|
62
|
+
return f"{version}.0"
|
63
|
+
return version
|
64
|
+
|
65
|
+
|
43
66
|
def get_dataset_name(dataset: mlc.Dataset) -> str:
|
44
67
|
"""Returns dataset name of the given MLcroissant dataset."""
|
45
68
|
if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
|
@@ -36,6 +36,22 @@ def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
|
|
36
36
|
assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
|
37
37
|
|
38
38
|
|
39
|
+
@pytest.mark.parametrize(
|
40
|
+
'croissant_version,tfds_version',
|
41
|
+
[
|
42
|
+
('1.0', '1.0.0'),
|
43
|
+
('1.2', '1.2.0'),
|
44
|
+
('1.2.3', '1.2.3'),
|
45
|
+
('1.2.3.4', '1.2.3.4'),
|
46
|
+
(None, None),
|
47
|
+
],
|
48
|
+
)
|
49
|
+
def test_get_croissant_version(croissant_version, tfds_version):
|
50
|
+
assert (
|
51
|
+
croissant_utils.get_croissant_version(croissant_version) == tfds_version
|
52
|
+
)
|
53
|
+
|
54
|
+
|
39
55
|
def test_get_record_set_ids():
|
40
56
|
metadata = mlc.Metadata(
|
41
57
|
name='dummy_dataset',
|
@@ -724,6 +724,7 @@ def dummy_croissant_file(
|
|
724
724
|
raw_data_filename: epath.PathLike = 'raw_data.jsonl',
|
725
725
|
croissant_filename: epath.PathLike = 'croissant.json',
|
726
726
|
split_names: Sequence[str] | None = None,
|
727
|
+
version: str = '1.2.0',
|
727
728
|
) -> Iterator[epath.Path]:
|
728
729
|
"""Yields temporary path to a dummy Croissant file.
|
729
730
|
|
@@ -746,6 +747,7 @@ def dummy_croissant_file(
|
|
746
747
|
If None, the function will create a split record set with the default
|
747
748
|
split names `train` and `test`. If `split_names` is defined, the `split`
|
748
749
|
key in the entries must match one of the split names.
|
750
|
+
version: The version of the dataset. Defaults to `1.2.0`.
|
749
751
|
"""
|
750
752
|
if entries is None:
|
751
753
|
entries = [
|
@@ -874,7 +876,7 @@ def dummy_croissant_file(
|
|
874
876
|
url='https://dummy_url',
|
875
877
|
distribution=distribution,
|
876
878
|
record_sets=record_sets,
|
877
|
-
version='1.2.0',
|
879
|
+
version=version,
|
878
880
|
license='Public',
|
879
881
|
)
|
880
882
|
# Write Croissant JSON-LD to tempdir.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: tfds-nightly
|
3
|
-
Version: 4.9.8.dev202505210044
|
3
|
+
Version: 4.9.8.dev202505230044
|
4
4
|
Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
|
5
5
|
Home-page: https://github.com/tensorflow/datasets
|
6
6
|
Download-URL: https://github.com/tensorflow/datasets/tags
|
{tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/RECORD
RENAMED
@@ -141,8 +141,8 @@ tensorflow_datasets/core/data_sources/python_test.py,sha256=Rg_ui2fA75532ma134VU
|
|
141
141
|
tensorflow_datasets/core/dataset_builders/__init__.py,sha256=KULxN1WzBvUwTw3K3SHFM-sE4wbm9UC6cX0tOm0jEZg,1945
|
142
142
|
tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=fS6OxRtI3DKdcsniJagTHYfopuoFs55cpW0_gPbhnkY,9216
|
143
143
|
tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=PAQXfp_qGs4KaAzYbTMm8y87HvnCUca035Jrafjq3zw,13096
|
144
|
-
tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256
|
145
|
-
tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=
|
144
|
+
tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=1_-GgOu_5ONlInvjZZFGGR44mcxlp_ZdVX9ttC83Z-s,15913
|
145
|
+
tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=Jc1EZI357oS1Vfszgy8E9uLMNoFh-kzQjcAz0g1OTN8,10538
|
146
146
|
tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=si5OoTPeftUyPj9w5OuYKjwfx1ByZghBo6vfEKQQGhA,18313
|
147
147
|
tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=UCvcr6DRwcWLu7x3o_PDyLJBI3r84ylfYMCO3qhD-8w,4389
|
148
148
|
tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=HhLOqlkWwOSvr_8wzwwTRapgXY8yUg_bbBGasTb3cQw,11907
|
@@ -245,8 +245,8 @@ tensorflow_datasets/core/utils/bool_utils_test.py,sha256=4x3vqQvc8PtC-ptrUpu5TWx
|
|
245
245
|
tensorflow_datasets/core/utils/colormap.csv,sha256=DDayUU9R19cxhcG3fj4cFwhI46W20U7ofBG0kToUHOw,2732
|
246
246
|
tensorflow_datasets/core/utils/conversion_utils.py,sha256=f_ZW71wx_dyg5LJlmuaU_TtiApfC2IqsG6fsNR8SOQc,6340
|
247
247
|
tensorflow_datasets/core/utils/conversion_utils_test.py,sha256=Ipjqju_81kweBgqOYjoHErRxBWSbskQhSs6HTxRtlso,5043
|
248
|
-
tensorflow_datasets/core/utils/croissant_utils.py,sha256
|
249
|
-
tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=
|
248
|
+
tensorflow_datasets/core/utils/croissant_utils.py,sha256=WVmrKqmjYf1u6BOQWMs0yGic85aVxDOZtIPbcDsAL1o,5181
|
249
|
+
tensorflow_datasets/core/utils/croissant_utils_test.py,sha256=QbfAv-Ktrib4ARsR7fIoMXTsezfK1UAsgOQWTocjtaA,4497
|
250
250
|
tensorflow_datasets/core/utils/docs.py,sha256=e8sua3eiqkDsPJUbYrgp_FUwnnHwMyMzTMNWKMHn464,1549
|
251
251
|
tensorflow_datasets/core/utils/dtype_utils.py,sha256=E3Zwe_Hv6BiN_qv8VI4I1oHFd3MVgs2NCcimXMoahxI,3246
|
252
252
|
tensorflow_datasets/core/utils/dtype_utils_test.py,sha256=LAE2idzXz0BiNHELm3rmCdhwKIstkWj8ZRmztslcLi0,3259
|
@@ -2124,7 +2124,7 @@ tensorflow_datasets/testing/mocking.py,sha256=52uOXhIEZM-Aet6fRAW13REVSmq7mWT_kA
|
|
2124
2124
|
tensorflow_datasets/testing/mocking_test.py,sha256=OZQAAzOQLOEwGJaNZhEYg0Dn0Vi9a8xKswjbSWBeYao,13796
|
2125
2125
|
tensorflow_datasets/testing/test_case.py,sha256=5BHRT99KJWx7Zzn8fkgsq09h2lkrPONyWEZdt_S9aGY,2554
|
2126
2126
|
tensorflow_datasets/testing/test_case_in_context.py,sha256=Yf4ehrvtCnxR4HKN1c8Qi6prURWfLuxvwQieKfnMD8g,1872
|
2127
|
-
tensorflow_datasets/testing/test_utils.py,sha256=
|
2127
|
+
tensorflow_datasets/testing/test_utils.py,sha256=FouQSpUAuE-ZmiA0QHy_VuQjdabrT_gi7F2odjB4m04,26736
|
2128
2128
|
tensorflow_datasets/testing/test_utils_test.py,sha256=KKKlkXRtp-FidZ_B1taONT-sS46nyzTNcUDdtGS0FMo,9576
|
2129
2129
|
tensorflow_datasets/testing/version_test.py,sha256=PsD-paPb4DkrMvOdskmRvqyXXDp9hOQFDPYoQ9k2osA,2791
|
2130
2130
|
tensorflow_datasets/text/__init__.py,sha256=IvEJrcFmicw-ptk5p3aZkJnXOnpCW61u-k2NBgNgQX8,5319
|
@@ -2460,10 +2460,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=DJ687VN9hAp6SLXnr_P12
|
|
2460
2460
|
tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=38tJQ73VHz8vOJn-AyZh2we2YJucbSRIgmgcrsC6bQM,719
|
2461
2461
|
tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=k-7YD1SGr5bASfdR2_09rrqz-8cpWdIcBWWEXhCvzuk,16903
|
2462
2462
|
tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=vLlluBW77ASNVC4ix7t8idkSUBI6q1-B7zmRV_ICCQM,1778
|
2463
|
-
tfds_nightly-4.9.8.
|
2464
|
-
tfds_nightly-4.9.8.
|
2465
|
-
tfds_nightly-4.9.8.
|
2466
|
-
tfds_nightly-4.9.8.
|
2467
|
-
tfds_nightly-4.9.8.
|
2468
|
-
tfds_nightly-4.9.8.
|
2469
|
-
tfds_nightly-4.9.8.
|
2463
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
|
2464
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
2465
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/METADATA,sha256=H-v_aTgZoBRFgtfAeTzHNjDwFHbou29eoECUDNSbyJY,11963
|
2466
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
2467
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
|
2468
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
|
2469
|
+
tfds_nightly-4.9.8.dev202505230044.dist-info/RECORD,,
|
{tfds_nightly-4.9.8.dev202505210044.dist-info → tfds_nightly-4.9.8.dev202505230044.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|