tfds-nightly 4.9.9.dev202508210044__py3-none-any.whl → 4.9.9.dev202508230044__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,7 @@
15
15
 
16
16
  """Tests for croissant_builder."""
17
17
 
18
+ from typing import Any, Dict, List, Type
18
19
  import numpy as np
19
20
  import pytest
20
21
  from tensorflow_datasets import testing
@@ -146,7 +147,10 @@ def _create_mlc_field(
146
147
  ],
147
148
  )
148
149
  def test_simple_datatype_converter(
149
- mlc_field, expected_feature, int_dtype, float_dtype
150
+ mlc_field: mlc.Field,
151
+ expected_feature: type[Any],
152
+ int_dtype: np.dtype | None,
153
+ float_dtype: np.dtype | None,
150
154
  ):
151
155
  actual_feature = croissant_builder.datatype_converter(
152
156
  mlc_field,
@@ -252,7 +256,11 @@ def test_datatype_converter_bbox_with_invalid_format():
252
256
  ),
253
257
  ],
254
258
  )
255
- def test_datatype_converter_complex(mlc_field, feature_type, subfield_types):
259
+ def test_datatype_converter_complex(
260
+ mlc_field: mlc.Field,
261
+ feature_type: Type[Any],
262
+ subfield_types: Dict[str, Type[Any]] | None,
263
+ ):
256
264
  actual_feature = croissant_builder.datatype_converter(mlc_field)
257
265
  assert actual_feature.doc.desc == mlc_field.description
258
266
  assert isinstance(actual_feature, feature_type)
@@ -411,7 +419,9 @@ def test_version_converter(tmp_path):
411
419
 
412
420
 
413
421
  @pytest.fixture(name="crs_builder")
414
- def mock_croissant_dataset_builder(tmp_path, request):
422
+ def mock_croissant_dataset_builder(
423
+ tmp_path, request
424
+ ) -> croissant_builder.CroissantBuilder:
415
425
  dataset_name = request.param["dataset_name"]
416
426
  with testing.dummy_croissant_file(
417
427
  dataset_name=dataset_name,
@@ -477,7 +487,11 @@ def test_croissant_builder(crs_builder):
477
487
  indirect=["crs_builder"],
478
488
  )
479
489
  @pytest.mark.parametrize("split_name", ["train", "test"])
480
- def test_download_and_prepare(crs_builder, expected_entries, split_name):
490
+ def test_download_and_prepare(
491
+ crs_builder: croissant_builder.CroissantBuilder,
492
+ expected_entries: List[Dict[str, Any]],
493
+ split_name: str,
494
+ ):
481
495
  crs_builder.download_and_prepare()
482
496
  data_source = crs_builder.as_data_source(split=split_name)
483
497
  expected_entries = [
@@ -314,10 +314,11 @@ def builder_from_files(
314
314
  DatasetNotFoundError: If the dataset cannot be loaded.
315
315
  """
316
316
  # Find and load dataset builder.
317
- builder_dir = _find_builder_dir(name, **builder_kwargs)
317
+ copy_builder_kwargs = dict(builder_kwargs)
318
+ builder_dir = _find_builder_dir(name, **copy_builder_kwargs)
318
319
  if builder_dir is None:
319
320
  data_dirs = file_utils.list_data_dirs(
320
- given_data_dir=builder_kwargs.get('data_dir')
321
+ given_data_dir=copy_builder_kwargs.get('data_dir')
321
322
  )
322
323
  raise registered.DatasetNotFoundError(
323
324
  f'Could not find dataset files for: {name}. Make sure you have the'
@@ -325,7 +326,7 @@ def builder_from_files(
325
326
  f'and that it has been generated in: {data_dirs}. If the dataset has'
326
327
  ' configs, you might have to specify the config name.'
327
328
  )
328
- file_format = builder_kwargs.pop('file_format', None)
329
+ file_format = copy_builder_kwargs.pop('file_format', None)
329
330
  return builder_from_directory(builder_dir, file_format=file_format)
330
331
 
331
332
 
@@ -34,9 +34,16 @@ class Builder(tfds.core.GeneratorBasedBuilder):
34
34
  VERSION = tfds.core.Version("2.1.0")
35
35
  RELEASE_NOTES = {
36
36
  "1.0.0": "Initial release.",
37
- "2.0.0": "Update the dataset with valid URLs.",
38
- "2.1.0": "Update the dataset with cleaned URLs.",
37
+ "2.0.0": "[Do not use] Update the dataset with valid URLs.",
38
+ "2.1.0": (
39
+ "Update the dataset with the correct URLs. The URLs in this version"
40
+ " come from HuggingFace's dataset repo, which is curated by the same"
41
+ " author: https://huggingface.co/datasets/alexfabbri/multi_news."
42
+ ),
39
43
  }
44
+ BLOCKED_VERSIONS = tfds.core.utils.BlockedVersions(
45
+ versions={"2.0.0": "The URLs of this version are invalid."}
46
+ )
40
47
 
41
48
  def _info(self) -> tfds.core.DatasetInfo:
42
49
  """Returns the dataset metadata."""
@@ -77,9 +84,10 @@ class Builder(tfds.core.GeneratorBasedBuilder):
77
84
  ).open() as tgt_f:
78
85
  for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
79
86
  yield i, {
80
- # In original file, each line has one example and natural newline
81
- # tokens "\n" are being replaced with "NEWLINE_CHAR". Here restore
82
- # the natural newline token to avoid special vocab "NEWLINE_CHAR".
87
+ # In the original file, each line has one example and natural
88
+ # newline tokens "\n" are being replaced with "NEWLINE_CHAR"
89
+ # Here, we restore the natural newline token to avoid the special
90
+ # vocab token "NEWLINE_CHAR".
83
91
  _DOCUMENT: src_line.strip().replace("NEWLINE_CHAR", "\n"),
84
92
  _SUMMARY: tgt_line.strip().lstrip(),
85
93
  }
@@ -168,7 +168,7 @@ class DatasetDocumentation:
168
168
  )
169
169
 
170
170
  def to_details_markdown(self) -> str:
171
- """ "Markdown to be shown on the details page for the namespace."""
171
+ """Markdown to be shown on the details page for the namespace."""
172
172
  extra_links = self.format_extra_links(prefix='* ', infix='\n')
173
173
  details = self.templates.dataset_details_template.format(
174
174
  name=self.name,
@@ -194,9 +194,6 @@ class DatasetDocumentation:
194
194
 
195
195
  def documentation(self, keep_short: bool = False) -> str:
196
196
  """Returns detailed documentation for all configs of this dataset."""
197
- # TODO(weide): if e.g. the description contains markdown chars, then it
198
- # messes up the page. Try escaping backticks or using code blocks.
199
- # TODO(weide): how to format citation?
200
197
  header_template = '## {config_name}'
201
198
  template = textwrap.dedent("""
202
199
  Use the following command to load this dataset in TFDS:
@@ -207,9 +204,7 @@ class DatasetDocumentation:
207
204
 
208
205
  * **Description**:
209
206
 
210
- ```
211
207
  {description}
212
- ```
213
208
 
214
209
  * **License**: {license}
215
210
  * **Version**: {version}
@@ -364,6 +359,7 @@ class HuggingfaceDatasetDocumentation(GithubDatasetDocumentation):
364
359
  )
365
360
  )
366
361
 
362
+ version = None
367
363
  if isinstance(config['version'], dict):
368
364
  version = config['version']['version_str']
369
365
  elif isinstance(config['version'], str):
@@ -222,9 +222,9 @@ class VersionSection(Section):
222
222
  all_versions = set(tfds.core.Version(v) for v in all_versions)
223
223
  for v in sorted(all_versions): # List all available versions
224
224
  if v == builder.version: # Highlight the default version
225
- version_name = '**`{}`** (default)'.format(str(v))
225
+ version_name = f'**`{v}`** (default)'
226
226
  else:
227
- version_name = '`{}`'.format(str(v))
227
+ version_name = f'`{v}`'
228
228
  if (
229
229
  v in curr_versions # Filter versions only present in RELEASE_NOTES
230
230
  and self._nightly_doc_util
@@ -322,14 +322,14 @@ class AutocacheSection(Section):
322
322
  autocached_info_parts = []
323
323
  if always_cached:
324
324
  split_names_str = ', '.join(always_cached)
325
- autocached_info_parts.append('Yes ({})'.format(split_names_str))
325
+ autocached_info_parts.append(f'Yes ({split_names_str})')
326
326
  if never_cached:
327
327
  split_names_str = ', '.join(never_cached)
328
- autocached_info_parts.append('No ({})'.format(split_names_str))
328
+ autocached_info_parts.append(f'No ({split_names_str})')
329
329
  if unshuffle_cached:
330
330
  split_names_str = ', '.join(unshuffle_cached)
331
331
  autocached_info_parts.append(
332
- 'Only when `shuffle_files=False` ({})'.format(split_names_str)
332
+ f'Only when `shuffle_files=False` ({split_names_str})'
333
333
  )
334
334
  autocached_info = ', '.join(autocached_info_parts)
335
335
  return autocached_info
@@ -346,7 +346,7 @@ class SplitInfoSection(Section):
346
346
 
347
347
  def _get_num_examples(self, split_info):
348
348
  if split_info.num_examples:
349
- return '{:,}'.format(split_info.num_examples)
349
+ return f'{split_info.num_examples:,}'
350
350
  return 'Not computed'
351
351
 
352
352
  def get_key(self, builder: tfds.core.DatasetBuilder):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tfds-nightly
3
- Version: 4.9.9.dev202508210044
3
+ Version: 4.9.9.dev202508230044
4
4
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
5
5
  Home-page: https://github.com/tensorflow/datasets
6
6
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -90,7 +90,7 @@ tensorflow_datasets/core/load.py,sha256=1FQVnKwn8OVS_IgDbs9XN7aIVxQnyfrS0pI2X9dh
90
90
  tensorflow_datasets/core/load_test.py,sha256=EEa8GuSIrEbn0RcGrWS3hmmatKBqBA3QOQWpQ1WjVgA,6490
91
91
  tensorflow_datasets/core/naming.py,sha256=B_P77QDA4lkG2FUl4PrzZR0U6qqae_fLxruGBw3ZSVc,25614
92
92
  tensorflow_datasets/core/naming_test.py,sha256=SwydgLjf2Mouow1yVZlc73sb8rp4522NhkTSEmg31vo,30112
93
- tensorflow_datasets/core/read_only_builder.py,sha256=R0QIqckUjl74G7oBj1uCRm_g9e0omstDMTbbwC25B88,22146
93
+ tensorflow_datasets/core/read_only_builder.py,sha256=08BmsgEBXhX0ydGo9-9qHLTjBE6pIvAC6VMmc3b9S8U,22206
94
94
  tensorflow_datasets/core/read_only_builder_test.py,sha256=Nw2KQCHBdTW7210Um2K3SzfqAOJB1v1r2yJkzdFehWA,24174
95
95
  tensorflow_datasets/core/reader.py,sha256=s65FNOUDyAhd4OgHOSvE5lr4rnlUnOILjlVcRS6Qbhw,17345
96
96
  tensorflow_datasets/core/reader_test.py,sha256=VcbUIDtvwjTRZs-0beQIiz26TALqLM5FgBsB-Gtw4kw,17882
@@ -142,7 +142,7 @@ tensorflow_datasets/core/dataset_builders/__init__.py,sha256=StTA3euephqDZdpTzJQ
142
142
  tensorflow_datasets/core/dataset_builders/adhoc_builder.py,sha256=1a-5hVjf9t24SD9fWzDDuKoOrA-Vmydf5QxvU7ap-sI,9263
143
143
  tensorflow_datasets/core/dataset_builders/adhoc_builder_test.py,sha256=yhRwrznK78MvHeWGRggnMTiyx_SlR1z30iD5VU3Gweo,13096
144
144
  tensorflow_datasets/core/dataset_builders/croissant_builder.py,sha256=XmnbIKiEN9OnY_RC8P7-83hbUfvtuJhbm24HfNFpiQs,17088
145
- tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=42HpBr3pANVKiok4lcx6xqwf0fY7kma6WIGA8WehNSs,15072
145
+ tensorflow_datasets/core/dataset_builders/croissant_builder_test.py,sha256=_8JVvhkv_QRUhN4GEw6V1PEryJXp8-DLzuVKzjkozgo,15370
146
146
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py,sha256=Loq3qeGk1Ias-d2oT_dK47BRNgTA4LKJchNGh7aA4a0,18313
147
147
  tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder_test.py,sha256=6N3DLsry9LhDqhpleaoXrrhaGiLJMBgUlwDnAji-1fI,4389
148
148
  tensorflow_datasets/core/dataset_builders/view_builder.py,sha256=eaCtjN5Vg4rK8JD3auA4PhF9mjH5HvQ9dslDX8LbwyM,11907
@@ -935,7 +935,7 @@ tensorflow_datasets/datasets/multi_news/README.md,sha256=s0XL9ddJL7oNJ9r7mSG8_Hd
935
935
  tensorflow_datasets/datasets/multi_news/TAGS.txt,sha256=OPDe1XqRiLYpvmXuPX2_aMaOKIXYsl562usmTEEqkwg,449
936
936
  tensorflow_datasets/datasets/multi_news/__init__.py,sha256=eFqnTjU7s5iubj6XcKoU8lZUSHecOdnebZFm1vTkjbA,612
937
937
  tensorflow_datasets/datasets/multi_news/checksums.tsv,sha256=S-8k82snl0zj1rjjO5LW7svXRNnDuWRc72qpIcBu6WA,1031
938
- tensorflow_datasets/datasets/multi_news/multi_news_dataset_builder.py,sha256=-ZOuQ7BfPN6_DkrUddcE2qm76eUpmlo8nHizd8DsstQ,3046
938
+ tensorflow_datasets/datasets/multi_news/multi_news_dataset_builder.py,sha256=6ZeVdbrtXKIu1sjxURsoGdPLRW0SXmK0BhOnmyrwpk4,3419
939
939
  tensorflow_datasets/datasets/multi_news/multi_news_dataset_builder_test.py,sha256=5amBMQ7PKbPLeZ2kiT18tEb_Z-CMS0DasTRT6goTjXQ,1259
940
940
  tensorflow_datasets/datasets/natural_instructions/CITATIONS.bib,sha256=tcQG5eEGL_wr_5MEnZ6Q_ce2oZm6InbbRKiFqee9g7I,412
941
941
  tensorflow_datasets/datasets/natural_instructions/README.md,sha256=mceGvviI62PO5mh59sYPP_9vuuwKo0g-m7LQilP1mBI,370
@@ -2007,11 +2007,11 @@ tensorflow_datasets/scripts/deployment/export_community_datasets_test.py,sha256=
2007
2007
  tensorflow_datasets/scripts/documentation/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
2008
2008
  tensorflow_datasets/scripts/documentation/build_catalog.py,sha256=SYJoNW-VxvL8xx85uYlFBwbr1k64HcmRBfxsj9-sdYA,8680
2009
2009
  tensorflow_datasets/scripts/documentation/build_catalog_test.py,sha256=qjnqK6lhBh-uNrjLQkEs3AbKFBo5uz_sxhhdT4ibOyA,2532
2010
- tensorflow_datasets/scripts/documentation/build_community_catalog.py,sha256=gh84xnKbL_ndR4GGbgBNLJ0nxjFwiAPLuhUvzeKPZAo,19902
2010
+ tensorflow_datasets/scripts/documentation/build_community_catalog.py,sha256=58CT0UaHxw0-mZX1a1aoW96NkszgsntDXuxS_OOZtc8,19709
2011
2011
  tensorflow_datasets/scripts/documentation/build_community_catalog_test.py,sha256=KvCmBzIePyztWPSrCqTJ_j_3puNWXxgSWSfvcMgQPgk,6352
2012
2012
  tensorflow_datasets/scripts/documentation/collection_markdown_builder.py,sha256=4Oofl2dQjlvHTir46x2K6Vpa3amwPaB-3dm43f_GcS8,7287
2013
2013
  tensorflow_datasets/scripts/documentation/collection_markdown_builder_test.py,sha256=t8KEbotAk6zH09HFvJhsrHW23uE8H3_UGLOHOFcvFeQ,3909
2014
- tensorflow_datasets/scripts/documentation/dataset_markdown_builder.py,sha256=DWILh5kphLecCt77GSHXHDXrvFZOwi5AuiT3qnUQ7Ng,25556
2014
+ tensorflow_datasets/scripts/documentation/dataset_markdown_builder.py,sha256=4W-L77aM2B4xGNQC7i1p5yHvOlIJfC0dONTM-86yeoY,25498
2015
2015
  tensorflow_datasets/scripts/documentation/dataset_markdown_builder_test.py,sha256=WsDbmAO6TYGFpn4VxF49FYvRy3ujNiysk38cWKlFC10,4219
2016
2016
  tensorflow_datasets/scripts/documentation/doc_utils.py,sha256=DKHGhF7I4ZkKcDAJWYC8mxWBBtbWD211Yv6CTqOBSTw,10325
2017
2017
  tensorflow_datasets/scripts/documentation/doc_utils_test.py,sha256=FSncjt0UCgvdN9WcvqzswwEuf7ZGmdUIRL480PzHxNw,5805
@@ -2399,7 +2399,6 @@ tensorflow_datasets/url_checksums/movie_lens.txt,sha256=DmPIlh1aM7PxNzI5sVmOGwC4
2399
2399
  tensorflow_datasets/url_checksums/movie_rationales.txt,sha256=1GweBeFRzD61ISAkTR5MNiWuujW6PQymgp7ISGBgsAU,139
2400
2400
  tensorflow_datasets/url_checksums/movielens.txt,sha256=i6St5kA_ZV6y8_mk_b47eE9RIf9Pc1VH6asv58kNPlo,731
2401
2401
  tensorflow_datasets/url_checksums/moving_mnist.txt,sha256=OtC5WoEUStRKL2I7jAwIEFF6WvZ-z_1vDGPzxpnGxXA,166
2402
- tensorflow_datasets/url_checksums/multi_news.txt,sha256=noajcrnQ_UK7sh-uRR9CJYaeBFenCmj_ZXr_5ih3Gu0,201
2403
2402
  tensorflow_datasets/url_checksums/multi_nli.txt,sha256=LXDz04hlq0b9au9DDHaX_P-KGVi4ZHWV5wEGJcvD8bA,148
2404
2403
  tensorflow_datasets/url_checksums/multi_nli_mismatch.txt,sha256=LXDz04hlq0b9au9DDHaX_P-KGVi4ZHWV5wEGJcvD8bA,148
2405
2404
  tensorflow_datasets/url_checksums/omniglot.txt,sha256=4KFU4nJ5H772d1JmRBmQ2bzoL0rJqCzMdf6XCx1Xa_c,728
@@ -2472,10 +2471,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
2472
2471
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
2473
2472
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
2474
2473
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
2475
- tfds_nightly-4.9.9.dev202508210044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2476
- tfds_nightly-4.9.9.dev202508210044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2477
- tfds_nightly-4.9.9.dev202508210044.dist-info/METADATA,sha256=wYNNz571WHEAExSi_IJGNjB8LPNS6tiseK1QVGIcc3s,11291
2478
- tfds_nightly-4.9.9.dev202508210044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2479
- tfds_nightly-4.9.9.dev202508210044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2480
- tfds_nightly-4.9.9.dev202508210044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2481
- tfds_nightly-4.9.9.dev202508210044.dist-info/RECORD,,
2474
+ tfds_nightly-4.9.9.dev202508230044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
2475
+ tfds_nightly-4.9.9.dev202508230044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
2476
+ tfds_nightly-4.9.9.dev202508230044.dist-info/METADATA,sha256=5HX3mZXPihfkdO-s-nCCoBocQswqBhL_-dCkKIMRnlU,11291
2477
+ tfds_nightly-4.9.9.dev202508230044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
2478
+ tfds_nightly-4.9.9.dev202508230044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
2479
+ tfds_nightly-4.9.9.dev202508230044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
2480
+ tfds_nightly-4.9.9.dev202508230044.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- https://drive.google.com/uc?export=download&id=1vRY2wM6rlOZrf9exGTm5pXj5ExlVwJ0C 256966232 64ae4d2483b248c9664b50bacfab6821f8a3e93f382c7587686fa4a127f77626 multi-news-original-20190725T164630Z-001.zip