tfds-nightly 4.9.9.dev202508110045-py3-none-any.whl → 4.9.9.dev202508120044-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorflow_datasets/scripts/cli/build.py +125 -124
- tensorflow_datasets/scripts/cli/build_test.py +2 -1
- tensorflow_datasets/scripts/cli/cli_utils.py +121 -216
- tensorflow_datasets/scripts/download_and_prepare.py +6 -6
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/METADATA +1 -1
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/RECORD +11 -11
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/WHEEL +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/entry_points.txt +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/licenses/AUTHORS +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/licenses/LICENSE +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/top_level.txt +0 -0

tensorflow_datasets/scripts/cli/build.py

@@ -17,121 +17,121 @@

 import argparse
 from collections.abc import Iterator
+import dataclasses
 import functools
 import importlib
 import itertools
 import json
 import multiprocessing
 import os
+import typing
 from typing import Any, Type

 from absl import logging
+import simple_parsing
 import tensorflow_datasets as tfds
 from tensorflow_datasets.scripts.cli import cli_utils

-# pylint: disable=logging-fstring-interpolation

-
-
-  """
-
-
-
-
-
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args:
+  """CLI arguments for building datasets.
+
+  Attributes:
+    positional_datasets: Name(s) of the dataset(s) to build. Default to current
+      dir. See https://www.tensorflow.org/datasets/cli for accepted values.
+    datasets: Datasets can also be provided as keyword argument.
+    debug: Debug & tests options. Use --pdb to enter post-mortem debugging mode
+      if an exception is raised.
+    paths: Path options.
+    generation: Generation options.
+    publishing: Publishing options.
+    automation: Automation options.
   """
-
-
-
-  build_parser.add_argument(
-      'datasets',  # Positional arguments
-      type=str,
+
+  positional_datasets: list[str] = simple_parsing.field(
+      positional=True,
       nargs='*',
-
-
-
-      ),
-  )
-  build_parser.add_argument(  # Also accept keyword arguments
-      '--datasets',
-      type=str,
-      nargs='+',
-      dest='datasets_keyword',
-      help='Datasets can also be provided as keyword argument.',
+      default_factory=list,
+      # Need to explicitly set metavar for command-line help.
+      metavar='datasets',
   )
+  datasets: list[str] = simple_parsing.field(nargs='*', default_factory=list)

-  cli_utils.
-  cli_utils.
-
-
-
-
-  automation_group = build_parser.add_argument_group(
-      'Automation', description='Used by automated scripts.'
+  debug: cli_utils.DebugOptions = cli_utils.DebugOptions()
+  paths: cli_utils.PathOptions = simple_parsing.field(
+      default_factory=cli_utils.PathOptions
+  )
+  generation: cli_utils.GenerationOptions = simple_parsing.field(
+      default_factory=cli_utils.GenerationOptions
   )
-
-
-      type=str,
-      help=(
-          'If set, generate all datasets except the one defined here. '
-          'Comma separated list of datasets to exclude. '
-      ),
+  publishing: cli_utils.PublishingOptions = simple_parsing.field(
+      default_factory=cli_utils.PublishingOptions
   )
-
-
-      action='store_true',
-      help=(
-          'Build the latest Version(experiments=...) available rather than '
-          'default version.'
-      ),
+  automation: cli_utils.AutomationOptions = simple_parsing.field(
+      default_factory=cli_utils.AutomationOptions
   )

-
+  def execute(self) -> None:
+    """Build the given datasets."""
+    # Eventually register additional datasets imports
+    if self.generation.imports:
+      list(
+          importlib.import_module(m) for m in self.generation.imports.split(',')
+      )

+    # Select datasets to generate
+    datasets = self.positional_datasets + self.datasets
+    if (
+        self.automation.exclude_datasets
+    ):  # Generate all datasets if `--exclude_datasets` set
+      if datasets:
+        raise ValueError("--exclude_datasets can't be used with `datasets`")
+      datasets = set(tfds.list_builders(with_community_datasets=False)) - set(
+          self.automation.exclude_datasets.split(',')
+      )
+      datasets = sorted(datasets)  # `set` is not deterministic
+    else:
+      datasets = datasets or ['']  # Empty string for default
+
+    # Import builder classes
+    builders_cls_and_kwargs = [
+        _get_builder_cls_and_kwargs(
+            dataset, has_imports=bool(self.generation.imports)
+        )
+        for dataset in datasets
+    ]
+
+    # Parallelize datasets generation.
+    builders = itertools.chain(*(
+        _make_builders(self, builder_cls, builder_kwargs)
+        for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
+    ))
+    process_builder_fn = functools.partial(
+        _download if self.generation.download_only else _download_and_prepare,
+        self,
+    )

-
-
-
-
-
+    if self.generation.num_processes == 1:
+      for builder in builders:
+        process_builder_fn(builder)
+    else:
+      with multiprocessing.Pool(self.generation.num_processes) as pool:
+        pool.map(process_builder_fn, builders)

-  # Select datasets to generate
-  datasets = (args.datasets or []) + (args.datasets_keyword or [])
-  if args.exclude_datasets:  # Generate all datasets if `--exclude_datasets` set
-    if datasets:
-      raise ValueError("--exclude_datasets can't be used with `datasets`")
-    datasets = set(tfds.list_builders(with_community_datasets=False)) - set(
-        args.exclude_datasets.split(',')
-    )
-    datasets = sorted(datasets)  # `set` is not deterministic
-  else:
-    datasets = datasets or ['']  # Empty string for default
-
-  # Import builder classes
-  builders_cls_and_kwargs = [
-      _get_builder_cls_and_kwargs(dataset, has_imports=bool(args.imports))
-      for dataset in datasets
-  ]
-
-  # Parallelize datasets generation.
-  builders = itertools.chain(*(
-      _make_builders(args, builder_cls, builder_kwargs)
-      for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
-  ))
-  process_builder_fn = functools.partial(
-      _download if args.download_only else _download_and_prepare, args
-  )

-
-
-
-
-
-
+def register_subparser(parsers: argparse._SubParsersAction) -> None:  # pylint: disable=protected-access
+  """Add subparser for `build` command."""
+  parser = parsers.add_parser(
+      'build', help='Commands for downloading and preparing datasets.'
+  )
+  parser = typing.cast(simple_parsing.ArgumentParser, parser)
+  parser.add_arguments(Args, dest='args')
+  parser.set_defaults(subparser_fn=lambda args: args.args.execute())


 def _make_builders(
-    args:
+    args: Args,
     builder_cls: Type[tfds.core.DatasetBuilder],
     builder_kwargs: dict[str, Any],
 ) -> Iterator[tfds.core.DatasetBuilder]:

@@ -146,7 +146,7 @@ def _make_builders(
     Initialized dataset builders.
   """
   # Eventually overwrite version
-  if args.experimental_latest_version:
+  if args.automation.experimental_latest_version:
     if 'version' in builder_kwargs:
       raise ValueError(
           "Can't have both `--experimental_latest` and version set (`:1.0.0`)"

@@ -157,19 +157,19 @@ def _make_builders(
     builder_kwargs['config'] = _get_config_name(
         builder_cls=builder_cls,
         config_kwarg=builder_kwargs.get('config'),
-        config_name=args.config,
-        config_idx=args.config_idx,
+        config_name=args.generation.config,
+        config_idx=args.generation.config_idx,
     )

-  if args.file_format:
-    builder_kwargs['file_format'] = args.file_format
+  if args.generation.file_format:
+    builder_kwargs['file_format'] = args.generation.file_format

   make_builder = functools.partial(
       _make_builder,
       builder_cls,
-      overwrite=args.overwrite,
-      fail_if_exists=args.fail_if_exists,
-      data_dir=args.data_dir,
+      overwrite=args.debug.overwrite,
+      fail_if_exists=args.debug.fail_if_exists,
+      data_dir=args.paths.data_dir,
       **builder_kwargs,
   )

@@ -203,7 +203,7 @@ def _get_builder_cls_and_kwargs(
   if not has_imports:
     path = _search_script_path(ds_to_build)
     if path is not None:
-      logging.info(
+      logging.info('Loading dataset %s from path: %s', ds_to_build, path)
       # Dynamically load user dataset script
       # When possible, load from the parent's parent, so module is named
       # "foo.foo_dataset_builder".

@@ -228,7 +228,9 @@ def _get_builder_cls_and_kwargs(
   name, builder_kwargs = tfds.core.naming.parse_builder_name_kwargs(ds_to_build)
   builder_cls = tfds.builder_cls(str(name))
   logging.info(
-
+      'Loading dataset %s from imports: %s',
+      ds_to_build,
+      builder_cls.__module__,
   )
   return builder_cls, builder_kwargs

@@ -308,7 +310,7 @@ def _make_builder(


 def _download(
-    args:
+    args: Args,
     builder: tfds.core.DatasetBuilder,
 ) -> None:
   """Downloads all files of the given builder."""

@@ -330,7 +332,7 @@ def _download(
   if builder.MAX_SIMULTANEOUS_DOWNLOADS is not None:
     max_simultaneous_downloads = builder.MAX_SIMULTANEOUS_DOWNLOADS

-  download_dir = args.download_dir or os.path.join(
+  download_dir = args.paths.download_dir or os.path.join(
       builder._data_dir_root, 'downloads'  # pylint: disable=protected-access
   )
   dl_manager = tfds.download.DownloadManager(

@@ -352,39 +354,39 @@ def _download(


 def _download_and_prepare(
-    args:
+    args: Args,
     builder: tfds.core.DatasetBuilder,
 ) -> None:
   """Generate a single builder."""
   cli_utils.download_and_prepare(
       builder=builder,
       download_config=_make_download_config(args, dataset_name=builder.name),
-      download_dir=args.download_dir,
-      publish_dir=args.publish_dir,
-      skip_if_published=args.skip_if_published,
-      overwrite=args.overwrite,
-      beam_pipeline_options=args.beam_pipeline_options,
-      nondeterministic_order=args.nondeterministic_order,
+      download_dir=args.paths.download_dir,
+      publish_dir=args.publishing.publish_dir,
+      skip_if_published=args.publishing.skip_if_published,
+      overwrite=args.debug.overwrite,
+      beam_pipeline_options=args.generation.beam_pipeline_options,
+      nondeterministic_order=args.generation.nondeterministic_order,
   )


 def _make_download_config(
-    args:
+    args: Args,
     dataset_name: str,
 ) -> tfds.download.DownloadConfig:
   """Generate the download and prepare configuration."""
   # Load the download config
-  manual_dir = args.manual_dir
-  if args.add_name_to_manual_dir:
+  manual_dir = args.paths.manual_dir
+  if args.paths.add_name_to_manual_dir:
     manual_dir = manual_dir / dataset_name

   kwargs = {}
-  if args.max_shard_size_mb:
-    kwargs['max_shard_size'] = args.max_shard_size_mb << 20
-  if args.num_shards:
-    kwargs['num_shards'] = args.num_shards
-  if args.download_config:
-    kwargs.update(json.loads(args.download_config))
+  if args.generation.max_shard_size_mb:
+    kwargs['max_shard_size'] = args.generation.max_shard_size_mb << 20
+  if args.generation.num_shards:
+    kwargs['num_shards'] = args.generation.num_shards
+  if args.generation.download_config:
+    kwargs.update(json.loads(args.generation.download_config))

   if 'download_mode' in kwargs:
     kwargs['download_mode'] = tfds.download.GenerateMode(

@@ -392,15 +394,15 @@ def _make_download_config(
     )
   else:
     kwargs['download_mode'] = tfds.download.GenerateMode.REUSE_DATASET_IF_EXISTS
-  if args.update_metadata_only:
+  if args.generation.update_metadata_only:
     kwargs['download_mode'] = tfds.download.GenerateMode.UPDATE_DATASET_INFO

   return tfds.download.DownloadConfig(
-      extract_dir=args.extract_dir,
+      extract_dir=args.paths.extract_dir,
       manual_dir=manual_dir,
-      max_examples_per_split=args.max_examples_per_split,
-      register_checksums=args.register_checksums,
-      force_checksums_validation=args.force_checksums_validation,
+      max_examples_per_split=args.debug.max_examples_per_split,
+      register_checksums=args.generation.register_checksums,
+      force_checksums_validation=args.generation.force_checksums_validation,
       **kwargs,
   )

@@ -445,11 +447,10 @@ def _get_config_name(
     else:
       return config_name
   elif config_idx is not None:  # `--config_idx 123`
-    if config_idx
+    if config_idx >= len(builder_cls.BUILDER_CONFIGS):
       raise ValueError(
-          f'--config_idx {config_idx} greater than number '
-          f'
-          f'{builder_cls.name}.'
+          f'--config_idx {config_idx} greater than number of configs '
+          f'{len(builder_cls.BUILDER_CONFIGS)} for {builder_cls.name}.'
       )
     else:
      # Use `config.name` to avoid

tensorflow_datasets/scripts/cli/build_test.py

@@ -311,7 +311,8 @@ def test_download_only(build):
 )
 def test_make_download_config(args: str, download_config_kwargs):
   args = main._parse_flags(f'tfds build x {args}'.split())
-
+  cmd_args: build_lib.Args = args.args
+  actual = build_lib._make_download_config(cmd_args, dataset_name='x')
   # Ignore the beam runner
   actual = actual.replace(beam_runner=None)
   expected = tfds.download.DownloadConfig(**download_config_kwargs)

tensorflow_datasets/scripts/cli/cli_utils.py

@@ -127,232 +127,137 @@ class DatasetInfo:
     self.ds_import = ds_import


-
-
-
-      'Debug & tests',
-      description=(
-          '--pdb Enter post-mortem debugging mode if an exception is raised.'
-      ),
-  )
-  debug_group.add_argument(
-      '--overwrite',
-      action='store_true',
-      help='Delete pre-existing dataset if it exists.',
-  )
-  debug_group.add_argument(
-      '--fail_if_exists',
-      action='store_true',
-      default=False,
-      help='Fails the program if there is a pre-existing dataset.',
-  )
-  debug_group.add_argument(
-      '--max_examples_per_split',
-      type=int,
-      nargs='?',
-      const=1,
-      help=(
-          'When set, only generate the first X examples (default to 1), rather'
-          ' than the full dataset.If set to 0, only execute the'
-          ' `_split_generators` (which download the original data), but skip'
-          ' `_generator_examples`'
-      ),
-  )
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class DebugOptions:
+  """Debug & tests options.

+  Attributes:
+    overwrite: If True, delete pre-existing dataset if it exists.
+    fail_if_exists: If True, fails the program if there is a pre-existing
+      dataset.
+    max_examples_per_split: When set, only generate the first X examples
+      (default to 1), rather than the full dataset. If set to 0, only execute
+      the `_split_generators` (which download the original data), but skip
+      `_generator_examples`.
+  """

-
-
-
-
-      '--data_dir',
-      type=epath.Path,
-      default=epath.Path(constants.DATA_DIR),
-      help=(
-          'Where to place datasets. Default to '
-          '`~/tensorflow_datasets/` or `TFDS_DATA_DIR` environement variable.'
-      ),
-  )
-  path_group.add_argument(
-      '--download_dir',
-      type=epath.Path,
-      help='Where to place downloads. Default to `<data_dir>/downloads/`.',
-  )
-  path_group.add_argument(
-      '--extract_dir',
-      type=epath.Path,
-      help='Where to extract files. Default to `<download_dir>/extracted/`.',
-  )
-  path_group.add_argument(
-      '--manual_dir',
-      type=epath.Path,
-      help=(
-          'Where to manually download data (required for some datasets). '
-          'Default to `<download_dir>/manual/`.'
-      ),
-  )
-  path_group.add_argument(
-      '--add_name_to_manual_dir',
-      action='store_true',
-      help=(
-          'If true, append the dataset name to the `manual_dir` (e.g. '
-          '`<download_dir>/manual/<dataset_name>/`. Useful to avoid collisions '
-          'if many datasets are generated.'
-      ),
+  overwrite: bool = simple_parsing.flag(default=False)
+  fail_if_exists: bool = simple_parsing.flag(default=False)
+  max_examples_per_split: int | None = simple_parsing.field(
+      default=None, nargs='?', const=1
   )


-
-
-
-  generation_group.add_argument(
-      '--download_only',
-      action='store_true',
-      help=(
-          'If True, download all files but do not prepare the dataset. Uses the'
-          ' checksum.tsv to find out what to download. Therefore, this does not'
-          ' work in combination with --register_checksums.'
-      ),
-  )
-  generation_group.add_argument(
-      '--config',
-      '-c',
-      type=str,
-      help=(
-          'Config name to build. Build all configs if not set. Can also be a'
-          ' json of the kwargs forwarded to the config `__init__` (for custom'
-          ' configs).'
-      ),
-  )
-  # We are forced to have 2 flags to avoid ambiguity when config name is
-  # a number (e.g. `voc/2017`)
-  generation_group.add_argument(
-      '--config_idx',
-      type=int,
-      help=(
-          'Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`). '
-          'Mutually exclusive with `--config`.'
-      ),
-  )
-  generation_group.add_argument(
-      '--update_metadata_only',
-      action='store_true',
-      default=False,
-      help=(
-          'If True, existing dataset_info.json is updated with metadata defined'
-          ' in Builder class(es). Datasets must already have been prepared.'
-      ),
-  )
-  generation_group.add_argument(
-      '--download_config',
-      type=str,
-      help=(
-          'A json of the kwargs forwarded to the config `__init__` (for custom'
-          ' DownloadConfigs).'
-      ),
-  )
-  generation_group.add_argument(
-      '--imports',
-      '-i',
-      type=str,
-      help='Comma separated list of module to import to register datasets.',
-  )
-  generation_group.add_argument(
-      '--register_checksums',
-      action='store_true',
-      help='If True, store size and checksum of downloaded files.',
-  )
-  generation_group.add_argument(
-      '--force_checksums_validation',
-      action='store_true',
-      help='If True, raise an error if the checksums are not found.',
-  )
-  # For compatibility with absl.flags (which generates --foo and --nofoo).
-  generation_group.add_argument(
-      '--noforce_checksums_validation',
-      dest='force_checksums_validation',
-      action='store_false',
-      help='If specified, bypass the checks on the checksums.',
-  )
-  generation_group.add_argument(
-      '--beam_pipeline_options',
-      type=str,
-      # nargs='+',
-      help=(
-          'A (comma-separated) list of flags to pass to `PipelineOptions` when'
-          ' preparing with Apache Beam. (see:'
-          ' https://www.tensorflow.org/datasets/beam_datasets). Example:'
-          ' `--beam_pipeline_options=job_name=my-job,project=my-project`'
-      ),
-  )
-  format_values = [f.value for f in file_adapters.FileFormat]
-  generation_group.add_argument(
-      '--file_format',
-      type=str,
-      help=(
-          'File format to which generate the tf-examples. '
-          f'Available values: {format_values} (see `tfds.core.FileFormat`).'
-      ),
-  )
-  generation_group.add_argument(
-      '--max_shard_size_mb', type=int, help='The max shard size in megabytes.'
-  )
-  generation_group.add_argument(
-      '--num_shards', type=int, help='The number of shards to write to.'
-  )
-  generation_group.add_argument(
-      '--num-processes',
-      type=int,
-      default=1,
-      help='Number of parallel build processes.',
-  )
-  generation_group.add_argument(
-      '--nondeterministic_order',
-      action='store_true',
-      default=False,
-      help=(
-          'If True, it will not assure deterministic ordering when writing'
-          ' examples to disk. This might result in quicker dataset preparation.'
-      ),
-  )
-  # For compatibility with absl.flags (which generates --foo and --nofoo).
-  generation_group.add_argument(
-      '--nonondeterministic_order',
-      dest='nondeterministic_order',
-      action='store_false',
-      help=(
-          'If specified, it will assure deterministic ordering when writing'
-          ' examples to disk.'
-      ),
-  )
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class PathOptions:
+  """Path options.

+  Attributes:
+    data_dir: Where to place datasets. Default to `~/tensorflow_datasets/` or
+      `TFDS_DATA_DIR` environement variable.
+    download_dir: Where to place downloads. Default to `<data_dir>/downloads/`.
+    extract_dir: Where to extract files. Default to `<download_dir>/extracted/`.
+    manual_dir: Where to manually download data (required for some datasets).
+      Default to `<download_dir>/manual/`.
+    add_name_to_manual_dir: If true, append the dataset name to the `manual_dir`
+      (e.g. `<download_dir>/manual/<dataset_name>/`). Useful to avoid collisions
+      if many datasets are generated.
+  """

-
-
-  publish_group = parser.add_argument_group(
-      'Publishing',
-      description='Options for publishing successfully created datasets.',
+  data_dir: epath.Path = simple_parsing.field(
+      default=epath.Path(constants.DATA_DIR)
   )
-
-
-
+  download_dir: epath.Path | None = None
+  extract_dir: epath.Path | None = None
+  manual_dir: epath.Path | None = None
+  add_name_to_manual_dir: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class GenerationOptions:
+  """Generation options.
+
+  Attributes:
+    download_only: If True, download all files but do not prepare the dataset.
+      Uses the checksum.tsv to find out what to download. Therefore, this does
+      not work in combination with --register_checksums.
+    config: Config name to build. Build all configs if not set. Can also be a
+      json of the kwargs forwarded to the config `__init__` (for custom
+      configs).
+    config_idx: Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`).
+      Mutually exclusive with `--config`. We are forced to have 2 flags to avoid
+      ambiguity when `config` is a number (e.g. `voc/2017`).
+    update_metadata_only: If True, existing dataset_info.json is updated with
+      metadata defined in Builder class(es). Datasets must already have been
+      prepared.
+    download_config: A json of the kwargs forwarded to the config `__init__`
+      (for custom DownloadConfigs).
+    imports: Comma separated list of module to import to register datasets.
+    register_checksums: If True, store size and checksum of downloaded files.
+    force_checksums_validation: If True, raise an error if the checksums are not
+      found. Otherwise, bypass the checks on the checksums
+    beam_pipeline_options: A (comma-separated) list of flags to pass to
+      `PipelineOptions` when preparing with Apache Beam. (see:
+      https://www.tensorflow.org/datasets/beam_datasets). Example:
+      `--beam_pipeline_options=job_name=my-job,project=my-project`
+    file_format: File format to which generate the tf-examples.
+    max_shard_size_mb: The max shard size in megabytes.
+    num_shards: The number of shards to write to.
+    num_processes: Number of parallel build processes.
+    nondeterministic_order: If True, it will not assure deterministic ordering
+      when writing examples to disk. This might result in quicker dataset
+      preparation. Otherwise, it will assure deterministic ordering when writing
+      examples to disk
+  """
+
+  download_only: bool = simple_parsing.flag(default=False)
+  config: str | None = simple_parsing.field(default=None, alias='-c')
+  config_idx: int | None = None
+  update_metadata_only: bool = simple_parsing.flag(default=False)
+  download_config: str | None = None
+  imports: str | None = simple_parsing.field(default=None, alias='-i')
+  register_checksums: bool = simple_parsing.flag(default=False)
+  force_checksums_validation: bool = simple_parsing.flag(default=False)
+  beam_pipeline_options: str | None = None
+  file_format: str | None = simple_parsing.choice(
+      *(file_format.value for file_format in file_adapters.FileFormat),
       default=None,
-      required=False,
-      help=(
-          'Where to optionally publish the dataset after it has been '
-          'generated successfully. Should be the root data dir under which'
-          'datasets are stored. '
-          'If unspecified, dataset will not be published'
-      ),
-  )
-  publish_group.add_argument(
-      '--skip_if_published',
-      action='store_true',
-      default=False,
-      help=(
-          'If the dataset with the same version and config is already '
-          'published, then it will not be regenerated.'
-      ),
   )
+  max_shard_size_mb: int | None = None
+  num_shards: int | None = None
+  num_processes: int = simple_parsing.field(default=1, alias='num-processes')
+  nondeterministic_order: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class PublishingOptions:
+  """Publishing options.
+
+  Attributes:
+    publish_dir: Where to optionally publish the dataset after it has been
+      generated successfully. Should be the root data dir under which datasets
+      are stored. If unspecified, dataset will not be published.
+    skip_if_published: If the dataset with the same version and config is
+      already published, then it will not be regenerated.
+  """
+
+  publish_dir: epath.Path | None = None
+  skip_if_published: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class AutomationOptions:
+  """Automation options.
+
+  Attributes:
+    exclude_datasets: If set, generate all datasets except the one defined here.
+      Comma separated list of datasets to exclude.
+    experimental_latest_version: Build the latest Version(experiments=...)
+      available rather than default version.
+  """
+
+  exclude_datasets: str | None = None
+  experimental_latest_version: bool = simple_parsing.flag(default=False)


 def download_and_prepare(
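
Because the former argparse argument groups are now frozen, keyword-only dataclasses, the option groups can also be constructed and copied directly in Python, for example in tests. An illustrative sketch, assuming the classes are importable from cli_utils exactly as defined in the diff:

import dataclasses

from tensorflow_datasets.scripts.cli import cli_utils

# Every field has a default, so the groups can be constructed with no arguments.
debug = cli_utils.DebugOptions()
paths = cli_utils.PathOptions()  # data_dir falls back to the standard TFDS data directory

# The dataclasses are frozen; to change a value, make a modified copy instead.
debug_overwrite = dataclasses.replace(debug, overwrite=True)

print(debug_overwrite.overwrite)  # True
print(paths.download_dir)         # None until a --download_dir is supplied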

tensorflow_datasets/scripts/download_and_prepare.py

@@ -16,12 +16,11 @@
 r"""Wrapper around `tfds build`."""

 import argparse
-from typing import List

 from absl import app
 from absl import flags
 from absl import logging
-
+from tensorflow_datasets.scripts.cli import build
 from tensorflow_datasets.scripts.cli import main as main_cli

 module_import = flags.DEFINE_string('module_import', None, '`--imports` flag.')

@@ -33,7 +32,7 @@ builder_config_id = flags.DEFINE_integer(



-def _parse_flags(argv:
+def _parse_flags(argv: list[str]) -> argparse.Namespace:
   """Command lines flag parsing."""
   return main_cli._parse_flags([argv[0], 'build'] + argv[1:])  # pylint: disable=protected-access

@@ -46,12 +45,13 @@ def main(args: argparse.Namespace) -> None:
   logging.warning(
       '***`tfds build` should be used instead of `download_and_prepare`.***'
   )
+  cmd_args: build.Args = args.args
   if module_import.value:
-
+    cmd_args.generation.imports = module_import.value
   if dataset.value:
-
+    cmd_args.datasets = [dataset.value]
   if builder_config_id.value is not None:
-
+    cmd_args.generation.config_idx = builder_config_id.value
   main_cli.main(args)

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tfds-nightly
-Version: 4.9.9.
+Version: 4.9.9.dev202508120044
 Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
 Home-page: https://github.com/tensorflow/datasets
 Download-URL: https://github.com/tensorflow/datasets/tags

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/RECORD
RENAMED

@@ -1965,7 +1965,7 @@ tensorflow_datasets/robotics/rtx/__init__.py,sha256=T5AMbjr-iztrX4Q7k4QhiMNXLOAK
 tensorflow_datasets/robotics/rtx/rtx.py,sha256=8OEnc0_LNsgEJjaySoMwWDjzgiv4hzeobuploMM1cdo,50084
 tensorflow_datasets/scripts/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
 tensorflow_datasets/scripts/convert_format.py,sha256=Kopn3YbNqH-euJaWFsd1nyo56-HDHgq8fDzRViXdx9A,3604
-tensorflow_datasets/scripts/download_and_prepare.py,sha256=
+tensorflow_datasets/scripts/download_and_prepare.py,sha256=BZMDXUlZIwB74ukiJ59hIx_PSJhLht2h94rAEhpEX-M,1871
 tensorflow_datasets/scripts/freeze_dataset_versions.py,sha256=SKC7raxmREqaD5pUnSuy_NHdu9gxTlRxJIOoPoT3cuw,1244
 tensorflow_datasets/scripts/print_num_configs.py,sha256=an80znBHmkycQS4ZEHFQTi1fuFop56tDUx9hgguVcvw,971
 tensorflow_datasets/scripts/replace_fake_images.py,sha256=9L2m3zY0nntaOmsVlNWy6BRJEEytyrMuu5W0LXzLCpA,5223

@@ -1979,11 +1979,11 @@ tensorflow_datasets/scripts/cleanup/refactor_dataset_as_folder.py,sha256=VpEc2Us
 tensorflow_datasets/scripts/cleanup/url_filename_recorder.py,sha256=iLcsT8UgbyNUw00N7bVBC0zCqEuIQ2ndeCCcb4B-OEc,4490
 tensorflow_datasets/scripts/cleanup/url_status_checker.py,sha256=Tr3LtLnGhI8ElDAS-ejmuAU3rs1lmqmYlU4figoVQg0,1967
 tensorflow_datasets/scripts/cli/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
-tensorflow_datasets/scripts/cli/build.py,sha256=
-tensorflow_datasets/scripts/cli/build_test.py,sha256=
+tensorflow_datasets/scripts/cli/build.py,sha256=Gwemm1V9qNHrgEVxcO2R8SU-_naNtFXKctIdR6U2sLo,15433
+tensorflow_datasets/scripts/cli/build_test.py,sha256=Zh9TTkGW3_Gvl6Lm6_4E4rIWUcjJ0bq-Ymd_SYS5jnY,10585
 tensorflow_datasets/scripts/cli/builder_templates.py,sha256=99SvH3skigkc2Qg737BV2OzhXL_Rgu4az8eVHsxKCLk,7985
 tensorflow_datasets/scripts/cli/builder_templates_test.py,sha256=HBNB-v2zlImKULPI8Webs9hXCkeFmWT29urxav-tDe8,2062
-tensorflow_datasets/scripts/cli/cli_utils.py,sha256=
+tensorflow_datasets/scripts/cli/cli_utils.py,sha256=IXH0POQyoM5wZs4RK3Crb8jq86nlk_mkK4Hc-kynwW0,12497
 tensorflow_datasets/scripts/cli/conftest.py,sha256=3PNh_BbR013G4HyLAZOleUXsQ9mICrD03NaKwdHFMXs,1291
 tensorflow_datasets/scripts/cli/convert_format.py,sha256=02RDZQQCuXf_XFFpx0gmRVkYyJg534kY0fZwGKxtUL4,4197
 tensorflow_datasets/scripts/cli/convert_format_utils.py,sha256=U_q5WVgMNrjBkOc166U4Y_eca5KOS3Xb3jSDjp4XdK4,29078

@@ -2468,10 +2468,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
 tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
-tfds_nightly-4.9.9.
+tfds_nightly-4.9.9.dev202508120044.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
+tfds_nightly-4.9.9.dev202508120044.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+tfds_nightly-4.9.9.dev202508120044.dist-info/METADATA,sha256=GPNy1y0jxT2XuJB5EIzhzDryXPOnpRsInh2iFU6COyg,11694
+tfds_nightly-4.9.9.dev202508120044.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tfds_nightly-4.9.9.dev202508120044.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
+tfds_nightly-4.9.9.dev202508120044.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
+tfds_nightly-4.9.9.dev202508120044.dist-info/RECORD,,

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/WHEEL
RENAMED
File without changes

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/entry_points.txt
RENAMED
File without changes

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/licenses/AUTHORS
RENAMED
File without changes

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/licenses/LICENSE
RENAMED
File without changes

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508120044.dist-info}/top_level.txt
RENAMED
File without changes