tfds-nightly 4.9.9.dev202508110045-py3-none-any.whl → 4.9.9.dev202508120044-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
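
The changes below migrate the `tfds build` command from hand-rolled `argparse` argument groups to `simple_parsing` dataclasses: each option group becomes a frozen dataclass, and the parser is populated via `simple_parsing.ArgumentParser.add_arguments`. A minimal sketch of that pattern, using a hypothetical `Greet` dataclass (only the `simple_parsing` calls mirror the diff):

import dataclasses

import simple_parsing


@dataclasses.dataclass(frozen=True, kw_only=True)
class Greet:
  """Hypothetical options dataclass; fields become command-line arguments."""

  # Positional argument, like `positional_datasets` in the diff below.
  names: list[str] = simple_parsing.field(
      positional=True, nargs='*', default_factory=list
  )
  # Boolean switch, like `--overwrite` or `--download_only` in the diff below.
  loud: bool = simple_parsing.flag(default=False)


parser = simple_parsing.ArgumentParser()
parser.add_arguments(Greet, dest='args')
opts = parser.parse_args(['alice', 'bob', '--loud'])
for name in opts.args.names:
  print(f'HELLO {name}!' if opts.args.loud else f'hello {name}')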
@@ -17,121 +17,121 @@
 
  import argparse
  from collections.abc import Iterator
+ import dataclasses
  import functools
  import importlib
  import itertools
  import json
  import multiprocessing
  import os
+ import typing
  from typing import Any, Type
 
  from absl import logging
+ import simple_parsing
  import tensorflow_datasets as tfds
  from tensorflow_datasets.scripts.cli import cli_utils
 
- # pylint: disable=logging-fstring-interpolation
 
-
- def register_subparser(parsers: argparse._SubParsersAction) -> None:  # pylint: disable=protected-access
-   """Add subparser for `build` command.
-
-   New flags should be added to `cli_utils` module.
-
-   Args:
-     parsers: The subparsers object to add the parser to.
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class Args:
+   """CLI arguments for building datasets.
+
+   Attributes:
+     positional_datasets: Name(s) of the dataset(s) to build. Default to current
+       dir. See https://www.tensorflow.org/datasets/cli for accepted values.
+     datasets: Datasets can also be provided as keyword argument.
+     debug: Debug & tests options. Use --pdb to enter post-mortem debugging mode
+       if an exception is raised.
+     paths: Path options.
+     generation: Generation options.
+     publishing: Publishing options.
+     automation: Automation options.
    """
-   build_parser = parsers.add_parser(
-       'build', help='Commands for downloading and preparing datasets.'
-   )
-   build_parser.add_argument(
-       'datasets',  # Positional arguments
-       type=str,
+
+   positional_datasets: list[str] = simple_parsing.field(
+       positional=True,
        nargs='*',
-       help=(
-           'Name(s) of the dataset(s) to build. Default to current dir. '
-           'See https://www.tensorflow.org/datasets/cli for accepted values.'
-       ),
-   )
-   build_parser.add_argument(  # Also accept keyword arguments
-       '--datasets',
-       type=str,
-       nargs='+',
-       dest='datasets_keyword',
-       help='Datasets can also be provided as keyword argument.',
+       default_factory=list,
+       # Need to explicitly set metavar for command-line help.
+       metavar='datasets',
    )
+   datasets: list[str] = simple_parsing.field(nargs='*', default_factory=list)
 
-   cli_utils.add_debug_argument_group(build_parser)
-   cli_utils.add_path_argument_group(build_parser)
-   cli_utils.add_generation_argument_group(build_parser)
-   cli_utils.add_publish_argument_group(build_parser)
-
-   # **** Automation options ****
-   automation_group = build_parser.add_argument_group(
-       'Automation', description='Used by automated scripts.'
+   debug: cli_utils.DebugOptions = cli_utils.DebugOptions()
+   paths: cli_utils.PathOptions = simple_parsing.field(
+       default_factory=cli_utils.PathOptions
+   )
+   generation: cli_utils.GenerationOptions = simple_parsing.field(
+       default_factory=cli_utils.GenerationOptions
    )
-   automation_group.add_argument(
-       '--exclude_datasets',
-       type=str,
-       help=(
-           'If set, generate all datasets except the one defined here. '
-           'Comma separated list of datasets to exclude. '
-       ),
+   publishing: cli_utils.PublishingOptions = simple_parsing.field(
+       default_factory=cli_utils.PublishingOptions
    )
-   automation_group.add_argument(
-       '--experimental_latest_version',
-       action='store_true',
-       help=(
-           'Build the latest Version(experiments=...) available rather than '
-           'default version.'
-       ),
+   automation: cli_utils.AutomationOptions = simple_parsing.field(
+       default_factory=cli_utils.AutomationOptions
    )
 
-   build_parser.set_defaults(subparser_fn=_build_datasets)
+   def execute(self) -> None:
+     """Build the given datasets."""
+     # Eventually register additional datasets imports
+     if self.generation.imports:
+       list(
+           importlib.import_module(m) for m in self.generation.imports.split(',')
+       )
 
+     # Select datasets to generate
+     datasets = self.positional_datasets + self.datasets
+     if (
+         self.automation.exclude_datasets
+     ):  # Generate all datasets if `--exclude_datasets` set
+       if datasets:
+         raise ValueError("--exclude_datasets can't be used with `datasets`")
+       datasets = set(tfds.list_builders(with_community_datasets=False)) - set(
+           self.automation.exclude_datasets.split(',')
+       )
+       datasets = sorted(datasets)  # `set` is not deterministic
+     else:
+       datasets = datasets or ['']  # Empty string for default
+
+     # Import builder classes
+     builders_cls_and_kwargs = [
+         _get_builder_cls_and_kwargs(
+             dataset, has_imports=bool(self.generation.imports)
+         )
+         for dataset in datasets
+     ]
+
+     # Parallelize datasets generation.
+     builders = itertools.chain(*(
+         _make_builders(self, builder_cls, builder_kwargs)
+         for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
+     ))
+     process_builder_fn = functools.partial(
+         _download if self.generation.download_only else _download_and_prepare,
+         self,
+     )
 
- def _build_datasets(args: argparse.Namespace) -> None:
-   """Build the given datasets."""
-   # Eventually register additional datasets imports
-   if args.imports:
-     list(importlib.import_module(m) for m in args.imports.split(','))
+     if self.generation.num_processes == 1:
+       for builder in builders:
+         process_builder_fn(builder)
+     else:
+       with multiprocessing.Pool(self.generation.num_processes) as pool:
+         pool.map(process_builder_fn, builders)
 
-   # Select datasets to generate
-   datasets = (args.datasets or []) + (args.datasets_keyword or [])
-   if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
-     if datasets:
-       raise ValueError("--exclude_datasets can't be used with `datasets`")
-     datasets = set(tfds.list_builders(with_community_datasets=False)) - set(
-         args.exclude_datasets.split(',')
-     )
-     datasets = sorted(datasets)  # `set` is not deterministic
-   else:
-     datasets = datasets or ['']  # Empty string for default
-
-   # Import builder classes
-   builders_cls_and_kwargs = [
-       _get_builder_cls_and_kwargs(dataset, has_imports=bool(args.imports))
-       for dataset in datasets
-   ]
-
-   # Parallelize datasets generation.
-   builders = itertools.chain(*(
-       _make_builders(args, builder_cls, builder_kwargs)
-       for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
-   ))
-   process_builder_fn = functools.partial(
-       _download if args.download_only else _download_and_prepare, args
-   )
 
-   if args.num_processes == 1:
-     for builder in builders:
-       process_builder_fn(builder)
-   else:
-     with multiprocessing.Pool(args.num_processes) as pool:
-       pool.map(process_builder_fn, builders)
+ def register_subparser(parsers: argparse._SubParsersAction) -> None:  # pylint: disable=protected-access
+   """Add subparser for `build` command."""
+   parser = parsers.add_parser(
+       'build', help='Commands for downloading and preparing datasets.'
+   )
+   parser = typing.cast(simple_parsing.ArgumentParser, parser)
+   parser.add_arguments(Args, dest='args')
+   parser.set_defaults(subparser_fn=lambda args: args.args.execute())
 
 
  def _make_builders(
-     args: argparse.Namespace,
+     args: Args,
      builder_cls: Type[tfds.core.DatasetBuilder],
      builder_kwargs: dict[str, Any],
  ) -> Iterator[tfds.core.DatasetBuilder]:
@@ -146,7 +146,7 @@ def _make_builders(
      Initialized dataset builders.
    """
    # Eventually overwrite version
-   if args.experimental_latest_version:
+   if args.automation.experimental_latest_version:
      if 'version' in builder_kwargs:
        raise ValueError(
            "Can't have both `--experimental_latest` and version set (`:1.0.0`)"
@@ -157,19 +157,19 @@ def _make_builders(
    builder_kwargs['config'] = _get_config_name(
        builder_cls=builder_cls,
        config_kwarg=builder_kwargs.get('config'),
-       config_name=args.config,
-       config_idx=args.config_idx,
+       config_name=args.generation.config,
+       config_idx=args.generation.config_idx,
    )
 
-   if args.file_format:
-     builder_kwargs['file_format'] = args.file_format
+   if args.generation.file_format:
+     builder_kwargs['file_format'] = args.generation.file_format
 
    make_builder = functools.partial(
        _make_builder,
        builder_cls,
-       overwrite=args.overwrite,
-       fail_if_exists=args.fail_if_exists,
-       data_dir=args.data_dir,
+       overwrite=args.debug.overwrite,
+       fail_if_exists=args.debug.fail_if_exists,
+       data_dir=args.paths.data_dir,
        **builder_kwargs,
    )
 
@@ -203,7 +203,7 @@ def _get_builder_cls_and_kwargs(
    if not has_imports:
      path = _search_script_path(ds_to_build)
      if path is not None:
-       logging.info(f'Loading dataset {ds_to_build} from path: {path}')
+       logging.info('Loading dataset %s from path: %s', ds_to_build, path)
        # Dynamically load user dataset script
        # When possible, load from the parent's parent, so module is named
        # "foo.foo_dataset_builder".
@@ -228,7 +228,9 @@ def _get_builder_cls_and_kwargs(
    name, builder_kwargs = tfds.core.naming.parse_builder_name_kwargs(ds_to_build)
    builder_cls = tfds.builder_cls(str(name))
    logging.info(
-       f'Loading dataset {ds_to_build} from imports: {builder_cls.__module__}'
+       'Loading dataset %s from imports: %s',
+       ds_to_build,
+       builder_cls.__module__,
    )
    return builder_cls, builder_kwargs
 
@@ -308,7 +310,7 @@ def _make_builder(
 
 
  def _download(
-     args: argparse.Namespace,
+     args: Args,
      builder: tfds.core.DatasetBuilder,
  ) -> None:
    """Downloads all files of the given builder."""
@@ -330,7 +332,7 @@ def _download(
    if builder.MAX_SIMULTANEOUS_DOWNLOADS is not None:
      max_simultaneous_downloads = builder.MAX_SIMULTANEOUS_DOWNLOADS
 
-   download_dir = args.download_dir or os.path.join(
+   download_dir = args.paths.download_dir or os.path.join(
        builder._data_dir_root, 'downloads'  # pylint: disable=protected-access
    )
    dl_manager = tfds.download.DownloadManager(
@@ -352,39 +354,39 @@ def _download(
 
 
  def _download_and_prepare(
-     args: argparse.Namespace,
+     args: Args,
      builder: tfds.core.DatasetBuilder,
  ) -> None:
    """Generate a single builder."""
    cli_utils.download_and_prepare(
        builder=builder,
        download_config=_make_download_config(args, dataset_name=builder.name),
-       download_dir=args.download_dir,
-       publish_dir=args.publish_dir,
-       skip_if_published=args.skip_if_published,
-       overwrite=args.overwrite,
-       beam_pipeline_options=args.beam_pipeline_options,
-       nondeterministic_order=args.nondeterministic_order,
+       download_dir=args.paths.download_dir,
+       publish_dir=args.publishing.publish_dir,
+       skip_if_published=args.publishing.skip_if_published,
+       overwrite=args.debug.overwrite,
+       beam_pipeline_options=args.generation.beam_pipeline_options,
+       nondeterministic_order=args.generation.nondeterministic_order,
    )
 
 
  def _make_download_config(
-     args: argparse.Namespace,
+     args: Args,
      dataset_name: str,
  ) -> tfds.download.DownloadConfig:
    """Generate the download and prepare configuration."""
    # Load the download config
-   manual_dir = args.manual_dir
-   if args.add_name_to_manual_dir:
+   manual_dir = args.paths.manual_dir
+   if args.paths.add_name_to_manual_dir:
      manual_dir = manual_dir / dataset_name
 
    kwargs = {}
-   if args.max_shard_size_mb:
-     kwargs['max_shard_size'] = args.max_shard_size_mb << 20
-   if args.num_shards:
-     kwargs['num_shards'] = args.num_shards
-   if args.download_config:
-     kwargs.update(json.loads(args.download_config))
+   if args.generation.max_shard_size_mb:
+     kwargs['max_shard_size'] = args.generation.max_shard_size_mb << 20
+   if args.generation.num_shards:
+     kwargs['num_shards'] = args.generation.num_shards
+   if args.generation.download_config:
+     kwargs.update(json.loads(args.generation.download_config))
 
    if 'download_mode' in kwargs:
      kwargs['download_mode'] = tfds.download.GenerateMode(
@@ -392,15 +394,15 @@ def _make_download_config(
      )
    else:
      kwargs['download_mode'] = tfds.download.GenerateMode.REUSE_DATASET_IF_EXISTS
-   if args.update_metadata_only:
+   if args.generation.update_metadata_only:
      kwargs['download_mode'] = tfds.download.GenerateMode.UPDATE_DATASET_INFO
 
    return tfds.download.DownloadConfig(
-       extract_dir=args.extract_dir,
+       extract_dir=args.paths.extract_dir,
        manual_dir=manual_dir,
-       max_examples_per_split=args.max_examples_per_split,
-       register_checksums=args.register_checksums,
-       force_checksums_validation=args.force_checksums_validation,
+       max_examples_per_split=args.debug.max_examples_per_split,
+       register_checksums=args.generation.register_checksums,
+       force_checksums_validation=args.generation.force_checksums_validation,
        **kwargs,
    )
 
@@ -445,11 +447,10 @@ def _get_config_name(
      else:
        return config_name
    elif config_idx is not None:  # `--config_idx 123`
-     if config_idx > len(builder_cls.BUILDER_CONFIGS):
+     if config_idx >= len(builder_cls.BUILDER_CONFIGS):
        raise ValueError(
-           f'--config_idx {config_idx} greater than number '
-           f'of configs {len(builder_cls.BUILDER_CONFIGS)} for '
-           f'{builder_cls.name}.'
+           f'--config_idx {config_idx} greater than number of configs '
+           f'{len(builder_cls.BUILDER_CONFIGS)} for {builder_cls.name}.'
        )
      else:
        # Use `config.name` to avoid
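
The hunk above also fixes an off-by-one error in the bounds check: valid positions in `BUILDER_CONFIGS` run from 0 to `len(...) - 1`, so an index equal to the length must be rejected as well; the old `>` comparison let it through, only to fail later with an `IndexError`. A small standalone illustration (the config list is hypothetical):

configs = ['plain_text', 'bytes']  # stand-in for builder_cls.BUILDER_CONFIGS
for config_idx in (1, 2):
  # The old `config_idx > len(configs)` check accepted config_idx == 2, which
  # then crashed on configs[config_idx]; the new `>=` check rejects it up front.
  print(config_idx, 'valid' if config_idx < len(configs) else 'out of range')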
@@ -311,7 +311,8 @@ def test_download_only(build):
  )
  def test_make_download_config(args: str, download_config_kwargs):
    args = main._parse_flags(f'tfds build x {args}'.split())
-   actual = build_lib._make_download_config(args, dataset_name='x')
+   cmd_args: build_lib.Args = args.args
+   actual = build_lib._make_download_config(cmd_args, dataset_name='x')
    # Ignore the beam runner
    actual = actual.replace(beam_runner=None)
    expected = tfds.download.DownloadConfig(**download_config_kwargs)
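
The updated test shows the new access pattern: because `register_subparser` calls `parser.add_arguments(Args, dest='args')`, the parsed namespace carries the `Args` dataclass under `.args`, and individual options hang off its nested groups. A sketch of that pattern (the dataset name and flag values are illustrative, not from the package):

args = main._parse_flags('tfds build mnist --num_processes 2'.split())
cmd_args: build_lib.Args = args.args  # the dataclass, not the raw namespace
assert cmd_args.positional_datasets == ['mnist']
assert cmd_args.generation.num_processes == 2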
@@ -127,232 +127,137 @@ class DatasetInfo:
      self.ds_import = ds_import
 
 
- def add_debug_argument_group(parser: argparse.ArgumentParser):
-   """Adds debug argument group to the parser."""
-   debug_group = parser.add_argument_group(
-       'Debug & tests',
-       description=(
-           '--pdb Enter post-mortem debugging mode if an exception is raised.'
-       ),
-   )
-   debug_group.add_argument(
-       '--overwrite',
-       action='store_true',
-       help='Delete pre-existing dataset if it exists.',
-   )
-   debug_group.add_argument(
-       '--fail_if_exists',
-       action='store_true',
-       default=False,
-       help='Fails the program if there is a pre-existing dataset.',
-   )
-   debug_group.add_argument(
-       '--max_examples_per_split',
-       type=int,
-       nargs='?',
-       const=1,
-       help=(
-           'When set, only generate the first X examples (default to 1), rather'
-           ' than the full dataset.If set to 0, only execute the'
-           ' `_split_generators` (which download the original data), but skip'
-           ' `_generator_examples`'
-       ),
-   )
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class DebugOptions:
+   """Debug & tests options.
 
+   Attributes:
+     overwrite: If True, delete pre-existing dataset if it exists.
+     fail_if_exists: If True, fails the program if there is a pre-existing
+       dataset.
+     max_examples_per_split: When set, only generate the first X examples
+       (default to 1), rather than the full dataset. If set to 0, only execute
+       the `_split_generators` (which download the original data), but skip
+       `_generator_examples`.
+   """
 
- def add_path_argument_group(parser: argparse.ArgumentParser):
-   """Adds path argument group to the parser."""
-   path_group = parser.add_argument_group('Paths')
-   path_group.add_argument(
-       '--data_dir',
-       type=epath.Path,
-       default=epath.Path(constants.DATA_DIR),
-       help=(
-           'Where to place datasets. Default to '
-           '`~/tensorflow_datasets/` or `TFDS_DATA_DIR` environement variable.'
-       ),
-   )
-   path_group.add_argument(
-       '--download_dir',
-       type=epath.Path,
-       help='Where to place downloads. Default to `<data_dir>/downloads/`.',
-   )
-   path_group.add_argument(
-       '--extract_dir',
-       type=epath.Path,
-       help='Where to extract files. Default to `<download_dir>/extracted/`.',
-   )
-   path_group.add_argument(
-       '--manual_dir',
-       type=epath.Path,
-       help=(
-           'Where to manually download data (required for some datasets). '
-           'Default to `<download_dir>/manual/`.'
-       ),
-   )
-   path_group.add_argument(
-       '--add_name_to_manual_dir',
-       action='store_true',
-       help=(
-           'If true, append the dataset name to the `manual_dir` (e.g. '
-           '`<download_dir>/manual/<dataset_name>/`. Useful to avoid collisions '
-           'if many datasets are generated.'
-       ),
+   overwrite: bool = simple_parsing.flag(default=False)
+   fail_if_exists: bool = simple_parsing.flag(default=False)
+   max_examples_per_split: int | None = simple_parsing.field(
+       default=None, nargs='?', const=1
    )
 
 
- def add_generation_argument_group(parser: argparse.ArgumentParser):
-   """Adds generation argument group to the parser."""
-   generation_group = parser.add_argument_group('Generation')
-   generation_group.add_argument(
-       '--download_only',
-       action='store_true',
-       help=(
-           'If True, download all files but do not prepare the dataset. Uses the'
-           ' checksum.tsv to find out what to download. Therefore, this does not'
-           ' work in combination with --register_checksums.'
-       ),
-   )
-   generation_group.add_argument(
-       '--config',
-       '-c',
-       type=str,
-       help=(
-           'Config name to build. Build all configs if not set. Can also be a'
-           ' json of the kwargs forwarded to the config `__init__` (for custom'
-           ' configs).'
-       ),
-   )
-   # We are forced to have 2 flags to avoid ambiguity when config name is
-   # a number (e.g. `voc/2017`)
-   generation_group.add_argument(
-       '--config_idx',
-       type=int,
-       help=(
-           'Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`). '
-           'Mutually exclusive with `--config`.'
-       ),
-   )
-   generation_group.add_argument(
-       '--update_metadata_only',
-       action='store_true',
-       default=False,
-       help=(
-           'If True, existing dataset_info.json is updated with metadata defined'
-           ' in Builder class(es). Datasets must already have been prepared.'
-       ),
-   )
-   generation_group.add_argument(
-       '--download_config',
-       type=str,
-       help=(
-           'A json of the kwargs forwarded to the config `__init__` (for custom'
-           ' DownloadConfigs).'
-       ),
-   )
-   generation_group.add_argument(
-       '--imports',
-       '-i',
-       type=str,
-       help='Comma separated list of module to import to register datasets.',
-   )
-   generation_group.add_argument(
-       '--register_checksums',
-       action='store_true',
-       help='If True, store size and checksum of downloaded files.',
-   )
-   generation_group.add_argument(
-       '--force_checksums_validation',
-       action='store_true',
-       help='If True, raise an error if the checksums are not found.',
-   )
-   # For compatibility with absl.flags (which generates --foo and --nofoo).
-   generation_group.add_argument(
-       '--noforce_checksums_validation',
-       dest='force_checksums_validation',
-       action='store_false',
-       help='If specified, bypass the checks on the checksums.',
-   )
-   generation_group.add_argument(
-       '--beam_pipeline_options',
-       type=str,
-       # nargs='+',
-       help=(
-           'A (comma-separated) list of flags to pass to `PipelineOptions` when'
-           ' preparing with Apache Beam. (see:'
-           ' https://www.tensorflow.org/datasets/beam_datasets). Example:'
-           ' `--beam_pipeline_options=job_name=my-job,project=my-project`'
-       ),
-   )
-   format_values = [f.value for f in file_adapters.FileFormat]
-   generation_group.add_argument(
-       '--file_format',
-       type=str,
-       help=(
-           'File format to which generate the tf-examples. '
-           f'Available values: {format_values} (see `tfds.core.FileFormat`).'
-       ),
-   )
-   generation_group.add_argument(
-       '--max_shard_size_mb', type=int, help='The max shard size in megabytes.'
-   )
-   generation_group.add_argument(
-       '--num_shards', type=int, help='The number of shards to write to.'
-   )
-   generation_group.add_argument(
-       '--num-processes',
-       type=int,
-       default=1,
-       help='Number of parallel build processes.',
-   )
-   generation_group.add_argument(
-       '--nondeterministic_order',
-       action='store_true',
-       default=False,
-       help=(
-           'If True, it will not assure deterministic ordering when writing'
-           ' examples to disk. This might result in quicker dataset preparation.'
-       ),
-   )
-   # For compatibility with absl.flags (which generates --foo and --nofoo).
-   generation_group.add_argument(
-       '--nonondeterministic_order',
-       dest='nondeterministic_order',
-       action='store_false',
-       help=(
-           'If specified, it will assure deterministic ordering when writing'
-           ' examples to disk.'
-       ),
-   )
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class PathOptions:
+   """Path options.
 
+   Attributes:
+     data_dir: Where to place datasets. Default to `~/tensorflow_datasets/` or
+       `TFDS_DATA_DIR` environement variable.
+     download_dir: Where to place downloads. Default to `<data_dir>/downloads/`.
+     extract_dir: Where to extract files. Default to `<download_dir>/extracted/`.
+     manual_dir: Where to manually download data (required for some datasets).
+       Default to `<download_dir>/manual/`.
+     add_name_to_manual_dir: If true, append the dataset name to the `manual_dir`
+       (e.g. `<download_dir>/manual/<dataset_name>/`). Useful to avoid collisions
+       if many datasets are generated.
+   """
 
- def add_publish_argument_group(parser: argparse.ArgumentParser):
-   """Adds publish argument group to the parser."""
-   publish_group = parser.add_argument_group(
-       'Publishing',
-       description='Options for publishing successfully created datasets.',
+   data_dir: epath.Path = simple_parsing.field(
+       default=epath.Path(constants.DATA_DIR)
    )
-   publish_group.add_argument(
-       '--publish_dir',
-       type=epath.Path,
+   download_dir: epath.Path | None = None
+   extract_dir: epath.Path | None = None
+   manual_dir: epath.Path | None = None
+   add_name_to_manual_dir: bool = simple_parsing.flag(default=False)
+
+
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class GenerationOptions:
+   """Generation options.
+
+   Attributes:
+     download_only: If True, download all files but do not prepare the dataset.
+       Uses the checksum.tsv to find out what to download. Therefore, this does
+       not work in combination with --register_checksums.
+     config: Config name to build. Build all configs if not set. Can also be a
+       json of the kwargs forwarded to the config `__init__` (for custom
+       configs).
+     config_idx: Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`).
+       Mutually exclusive with `--config`. We are forced to have 2 flags to avoid
+       ambiguity when `config` is a number (e.g. `voc/2017`).
+     update_metadata_only: If True, existing dataset_info.json is updated with
+       metadata defined in Builder class(es). Datasets must already have been
+       prepared.
+     download_config: A json of the kwargs forwarded to the config `__init__`
+       (for custom DownloadConfigs).
+     imports: Comma separated list of module to import to register datasets.
+     register_checksums: If True, store size and checksum of downloaded files.
+     force_checksums_validation: If True, raise an error if the checksums are not
+       found. Otherwise, bypass the checks on the checksums
+     beam_pipeline_options: A (comma-separated) list of flags to pass to
+       `PipelineOptions` when preparing with Apache Beam. (see:
+       https://www.tensorflow.org/datasets/beam_datasets). Example:
+       `--beam_pipeline_options=job_name=my-job,project=my-project`
+     file_format: File format to which generate the tf-examples.
+     max_shard_size_mb: The max shard size in megabytes.
+     num_shards: The number of shards to write to.
+     num_processes: Number of parallel build processes.
+     nondeterministic_order: If True, it will not assure deterministic ordering
+       when writing examples to disk. This might result in quicker dataset
+       preparation. Otherwise, it will assure deterministic ordering when writing
+       examples to disk
+   """
+
+   download_only: bool = simple_parsing.flag(default=False)
+   config: str | None = simple_parsing.field(default=None, alias='-c')
+   config_idx: int | None = None
+   update_metadata_only: bool = simple_parsing.flag(default=False)
+   download_config: str | None = None
+   imports: str | None = simple_parsing.field(default=None, alias='-i')
+   register_checksums: bool = simple_parsing.flag(default=False)
+   force_checksums_validation: bool = simple_parsing.flag(default=False)
+   beam_pipeline_options: str | None = None
+   file_format: str | None = simple_parsing.choice(
+       *(file_format.value for file_format in file_adapters.FileFormat),
        default=None,
-       required=False,
-       help=(
-           'Where to optionally publish the dataset after it has been '
-           'generated successfully. Should be the root data dir under which'
-           'datasets are stored. '
-           'If unspecified, dataset will not be published'
-       ),
-   )
-   publish_group.add_argument(
-       '--skip_if_published',
-       action='store_true',
-       default=False,
-       help=(
-           'If the dataset with the same version and config is already '
-           'published, then it will not be regenerated.'
-       ),
    )
+   max_shard_size_mb: int | None = None
+   num_shards: int | None = None
+   num_processes: int = simple_parsing.field(default=1, alias='num-processes')
+   nondeterministic_order: bool = simple_parsing.flag(default=False)
+
+
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class PublishingOptions:
+   """Publishing options.
+
+   Attributes:
+     publish_dir: Where to optionally publish the dataset after it has been
+       generated successfully. Should be the root data dir under which datasets
+       are stored. If unspecified, dataset will not be published.
+     skip_if_published: If the dataset with the same version and config is
+       already published, then it will not be regenerated.
+   """
+
+   publish_dir: epath.Path | None = None
+   skip_if_published: bool = simple_parsing.flag(default=False)
+
+
+ @dataclasses.dataclass(frozen=True, kw_only=True)
+ class AutomationOptions:
+   """Automation options.
+
+   Attributes:
+     exclude_datasets: If set, generate all datasets except the one defined here.
+       Comma separated list of datasets to exclude.
+     experimental_latest_version: Build the latest Version(experiments=...)
+       available rather than default version.
+   """
+
+   exclude_datasets: str | None = None
+   experimental_latest_version: bool = simple_parsing.flag(default=False)
 
 
  def download_and_prepare(
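
Because the option groups above are now plain (frozen) dataclasses, they can also be constructed programmatically rather than parsed from flags. A hedged sketch, assuming the defaults shown in the diff (the dataset name and values are illustrative):

import dataclasses

from tensorflow_datasets.scripts.cli import build
from tensorflow_datasets.scripts.cli import cli_utils

args = build.Args(
    datasets=['mnist'],
    debug=cli_utils.DebugOptions(overwrite=True),
    generation=dataclasses.replace(
        cli_utils.GenerationOptions(), num_processes=2
    ),
)
# args.execute()  # would run the same path as `tfds build mnist --overwrite`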
@@ -16,12 +16,11 @@
  r"""Wrapper around `tfds build`."""
 
  import argparse
- from typing import List
 
  from absl import app
  from absl import flags
  from absl import logging
-
+ from tensorflow_datasets.scripts.cli import build
  from tensorflow_datasets.scripts.cli import main as main_cli
 
  module_import = flags.DEFINE_string('module_import', None, '`--imports` flag.')
@@ -33,7 +32,7 @@ builder_config_id = flags.DEFINE_integer(
  )
 
 
- def _parse_flags(argv: List[str]) -> argparse.Namespace:
+ def _parse_flags(argv: list[str]) -> argparse.Namespace:
    """Command lines flag parsing."""
    return main_cli._parse_flags([argv[0], 'build'] + argv[1:])  # pylint: disable=protected-access
 
@@ -46,12 +45,13 @@ def main(args: argparse.Namespace) -> None:
    logging.warning(
        '***`tfds build` should be used instead of `download_and_prepare`.***'
    )
+   cmd_args: build.Args = args.args
    if module_import.value:
-     args.imports = module_import.value
+     cmd_args.generation.imports = module_import.value
    if dataset.value:
-     args.datasets = [dataset.value]
+     cmd_args.datasets = [dataset.value]
    if builder_config_id.value is not None:
-     args.config_idx = builder_config_id.value
+     cmd_args.generation.config_idx = builder_config_id.value
    main_cli.main(args)
 
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: tfds-nightly
- Version: 4.9.9.dev202508110045
+ Version: 4.9.9.dev202508120044
  Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
  Home-page: https://github.com/tensorflow/datasets
  Download-URL: https://github.com/tensorflow/datasets/tags
@@ -1965,7 +1965,7 @@ tensorflow_datasets/robotics/rtx/__init__.py,sha256=T5AMbjr-iztrX4Q7k4QhiMNXLOAK
  tensorflow_datasets/robotics/rtx/rtx.py,sha256=8OEnc0_LNsgEJjaySoMwWDjzgiv4hzeobuploMM1cdo,50084
  tensorflow_datasets/scripts/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
  tensorflow_datasets/scripts/convert_format.py,sha256=Kopn3YbNqH-euJaWFsd1nyo56-HDHgq8fDzRViXdx9A,3604
- tensorflow_datasets/scripts/download_and_prepare.py,sha256=Yd9Kg9u_WG6GgfXY6BFnyXz1utXd6clHbLZN9lnWrJc,1777
+ tensorflow_datasets/scripts/download_and_prepare.py,sha256=BZMDXUlZIwB74ukiJ59hIx_PSJhLht2h94rAEhpEX-M,1871
  tensorflow_datasets/scripts/freeze_dataset_versions.py,sha256=SKC7raxmREqaD5pUnSuy_NHdu9gxTlRxJIOoPoT3cuw,1244
  tensorflow_datasets/scripts/print_num_configs.py,sha256=an80znBHmkycQS4ZEHFQTi1fuFop56tDUx9hgguVcvw,971
  tensorflow_datasets/scripts/replace_fake_images.py,sha256=9L2m3zY0nntaOmsVlNWy6BRJEEytyrMuu5W0LXzLCpA,5223
@@ -1979,11 +1979,11 @@ tensorflow_datasets/scripts/cleanup/refactor_dataset_as_folder.py,sha256=VpEc2Us
  tensorflow_datasets/scripts/cleanup/url_filename_recorder.py,sha256=iLcsT8UgbyNUw00N7bVBC0zCqEuIQ2ndeCCcb4B-OEc,4490
  tensorflow_datasets/scripts/cleanup/url_status_checker.py,sha256=Tr3LtLnGhI8ElDAS-ejmuAU3rs1lmqmYlU4figoVQg0,1967
  tensorflow_datasets/scripts/cli/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
- tensorflow_datasets/scripts/cli/build.py,sha256=jZp7CaP62D2Usi4l-o9oCUqTHhnigX15PNUr9pOd4Wo,14961
- tensorflow_datasets/scripts/cli/build_test.py,sha256=xlFYScPSMcsUR27GQ-W5wdGdLdkXu_n0hM1rl20WWW8,10542
+ tensorflow_datasets/scripts/cli/build.py,sha256=Gwemm1V9qNHrgEVxcO2R8SU-_naNtFXKctIdR6U2sLo,15433
+ tensorflow_datasets/scripts/cli/build_test.py,sha256=Zh9TTkGW3_Gvl6Lm6_4E4rIWUcjJ0bq-Ymd_SYS5jnY,10585
  tensorflow_datasets/scripts/cli/builder_templates.py,sha256=99SvH3skigkc2Qg737BV2OzhXL_Rgu4az8eVHsxKCLk,7985
  tensorflow_datasets/scripts/cli/builder_templates_test.py,sha256=HBNB-v2zlImKULPI8Webs9hXCkeFmWT29urxav-tDe8,2062
- tensorflow_datasets/scripts/cli/cli_utils.py,sha256=zE-jLQw0dn_98PHOTLX6pMoFqjSCBOD7lh5dytJcphE,14049
+ tensorflow_datasets/scripts/cli/cli_utils.py,sha256=IXH0POQyoM5wZs4RK3Crb8jq86nlk_mkK4Hc-kynwW0,12497
  tensorflow_datasets/scripts/cli/conftest.py,sha256=3PNh_BbR013G4HyLAZOleUXsQ9mICrD03NaKwdHFMXs,1291
  tensorflow_datasets/scripts/cli/convert_format.py,sha256=02RDZQQCuXf_XFFpx0gmRVkYyJg534kY0fZwGKxtUL4,4197
  tensorflow_datasets/scripts/cli/convert_format_utils.py,sha256=U_q5WVgMNrjBkOc166U4Y_eca5KOS3Xb3jSDjp4XdK4,29078
@@ -2468,10 +2468,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
  tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
  tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
- tfds_nightly-4.9.9.dev202508110045.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
- tfds_nightly-4.9.9.dev202508110045.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
- tfds_nightly-4.9.9.dev202508110045.dist-info/METADATA,sha256=Xr0YCoYhfCImcmxnROqE4vp203B8XMiktUrh1hrarRw,11694
- tfds_nightly-4.9.9.dev202508110045.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- tfds_nightly-4.9.9.dev202508110045.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
- tfds_nightly-4.9.9.dev202508110045.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
- tfds_nightly-4.9.9.dev202508110045.dist-info/RECORD,,
+ tfds_nightly-4.9.9.dev202508120044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
+ tfds_nightly-4.9.9.dev202508120044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+ tfds_nightly-4.9.9.dev202508120044.dist-info/METADATA,sha256=GPNy1y0jxT2XuJB5EIzhzDryXPOnpRsInh2iFU6COyg,11694
+ tfds_nightly-4.9.9.dev202508120044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ tfds_nightly-4.9.9.dev202508120044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
+ tfds_nightly-4.9.9.dev202508120044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
+ tfds_nightly-4.9.9.dev202508120044.dist-info/RECORD,,