tfds-nightly 4.9.9.dev202508110045__py3-none-any.whl → 4.9.9.dev202508130045__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorflow_datasets/scripts/cli/build.py +115 -126
- tensorflow_datasets/scripts/cli/build_test.py +3 -1
- tensorflow_datasets/scripts/cli/cli_utils.py +153 -217
- tensorflow_datasets/scripts/cli/convert_format.py +3 -15
- tensorflow_datasets/scripts/cli/croissant.py +6 -27
- tensorflow_datasets/scripts/cli/main.py +54 -31
- tensorflow_datasets/scripts/cli/new.py +2 -15
- tensorflow_datasets/scripts/download_and_prepare.py +8 -8
- tensorflow_datasets/testing/dataset_builder_testing.py +16 -16
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/METADATA +1 -1
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/RECORD +16 -16
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/WHEEL +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/entry_points.txt +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/licenses/AUTHORS +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/licenses/LICENSE +0 -0
- {tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/top_level.txt +0 -0
tensorflow_datasets/scripts/cli/build.py
CHANGED
@@ -15,8 +15,8 @@
 
 """`tfds build` command."""
 
-import argparse
 from collections.abc import Iterator
+import dataclasses
 import functools
 import importlib
 import itertools
@@ -26,112 +26,100 @@ import os
 from typing import Any, Type
 
 from absl import logging
+import simple_parsing
 import tensorflow_datasets as tfds
 from tensorflow_datasets.scripts.cli import cli_utils
 
-# pylint: disable=logging-fstring-interpolation
 
-…
-  """…
-…
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args(cli_utils.Args):
+  """Commands for downloading and preparing datasets.
+
+  Attributes:
+    positional_datasets: Name(s) of the dataset(s) to build. Default to current
+      dir. See https://www.tensorflow.org/datasets/cli for accepted values.
+    datasets: Datasets can also be provided as keyword argument.
+    debug: Debug & tests options. Use --pdb to enter post-mortem debugging mode
+      if an exception is raised.
+    paths: Path options.
+    generation: Generation options.
+    publishing: Publishing options.
+    automation: Automation options.
   """
-…
-  build_parser.add_argument(
-      'datasets',  # Positional arguments
-      type=str,
+
+  positional_datasets: list[str] = simple_parsing.field(
+      positional=True,
       nargs='*',
-…
-      ),
-  )
-  build_parser.add_argument(  # Also accept keyword arguments
-      '--datasets',
-      type=str,
-      nargs='+',
-      dest='datasets_keyword',
-      help='Datasets can also be provided as keyword argument.',
+      default_factory=list,
+      # Need to explicitly set metavar for command-line help.
+      metavar='datasets',
   )
+  datasets: list[str] = simple_parsing.field(nargs='*', default_factory=list)
 
-  cli_utils.add_debug_argument_group(build_parser)
-  cli_utils.add_path_argument_group(build_parser)
-  cli_utils.add_generation_argument_group(build_parser)
-  cli_utils.add_publish_argument_group(build_parser)
-
-  # **** Automation options ****
-  automation_group = build_parser.add_argument_group(
-      'Automation', description='Used by automated scripts.'
+  debug: cli_utils.DebugOptions = cli_utils.DebugOptions()
+  paths: cli_utils.PathOptions = simple_parsing.field(
+      default_factory=cli_utils.PathOptions
   )
-  automation_group.add_argument(
-      '--exclude_datasets',
-      type=str,
-      help=(
-          'If set, generate all datasets except the one defined here. '
-          'Comma separated list of datasets to exclude. '
-      ),
+  generation: cli_utils.GenerationOptions = simple_parsing.field(
+      default_factory=cli_utils.GenerationOptions
   )
-…
-          'default version.'
-      ),
+  publishing: cli_utils.PublishingOptions = simple_parsing.field(
+      default_factory=cli_utils.PublishingOptions
+  )
+  automation: cli_utils.AutomationOptions = simple_parsing.field(
+      default_factory=cli_utils.AutomationOptions
   )
 
-…
-    list(importlib.import_module(m) for m in args.imports.split(','))
+  def execute(self) -> None:
+    """Build the given datasets."""
+    # Eventually register additional datasets imports
+    if self.generation.imports:
+      list(
+          importlib.import_module(m) for m in self.generation.imports.split(',')
+      )
 
-…
+    # Select datasets to generate
+    datasets = self.positional_datasets + self.datasets
+    if (
+        self.automation.exclude_datasets
+    ):  # Generate all datasets if `--exclude_datasets` set
+      if datasets:
+        raise ValueError("--exclude_datasets can't be used with `datasets`")
+      datasets = set(tfds.list_builders(with_community_datasets=False)) - set(
+          self.automation.exclude_datasets.split(',')
+      )
+      datasets = sorted(datasets)  # `set` is not deterministic
+    else:
+      datasets = datasets or ['']  # Empty string for default
+
+    # Import builder classes
+    builders_cls_and_kwargs = [
+        _get_builder_cls_and_kwargs(
+            dataset, has_imports=bool(self.generation.imports)
+        )
+        for dataset in datasets
+    ]
+
+    # Parallelize datasets generation.
+    builders = itertools.chain(*(
+        _make_builders(self, builder_cls, builder_kwargs)
+        for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
+    ))
+    process_builder_fn = functools.partial(
+        _download if self.generation.download_only else _download_and_prepare,
+        self,
     )
-    datasets = sorted(datasets)  # `set` is not deterministic
-  else:
-    datasets = datasets or ['']  # Empty string for default
-
-  # Import builder classes
-  builders_cls_and_kwargs = [
-      _get_builder_cls_and_kwargs(dataset, has_imports=bool(args.imports))
-      for dataset in datasets
-  ]
-
-  # Parallelize datasets generation.
-  builders = itertools.chain(*(
-      _make_builders(args, builder_cls, builder_kwargs)
-      for (builder_cls, builder_kwargs) in builders_cls_and_kwargs
-  ))
-  process_builder_fn = functools.partial(
-      _download if args.download_only else _download_and_prepare, args
-  )
 
-…
+    if self.generation.num_processes == 1:
+      for builder in builders:
+        process_builder_fn(builder)
+    else:
+      with multiprocessing.Pool(self.generation.num_processes) as pool:
+        pool.map(process_builder_fn, builders)
 
 
 def _make_builders(
-    args: argparse.Namespace,
+    args: Args,
     builder_cls: Type[tfds.core.DatasetBuilder],
     builder_kwargs: dict[str, Any],
 ) -> Iterator[tfds.core.DatasetBuilder]:
@@ -146,7 +134,7 @@ def _make_builders(
     Initialized dataset builders.
   """
   # Eventually overwrite version
-  if args.experimental_latest_version:
+  if args.automation.experimental_latest_version:
    if 'version' in builder_kwargs:
      raise ValueError(
          "Can't have both `--experimental_latest` and version set (`:1.0.0`)"
@@ -157,19 +145,19 @@ def _make_builders(
    builder_kwargs['config'] = _get_config_name(
        builder_cls=builder_cls,
        config_kwarg=builder_kwargs.get('config'),
-        config_name=args.config,
-        config_idx=args.config_idx,
+        config_name=args.generation.config,
+        config_idx=args.generation.config_idx,
    )
 
-  if args.file_format:
-    builder_kwargs['file_format'] = args.file_format
+  if args.generation.file_format:
+    builder_kwargs['file_format'] = args.generation.file_format
 
   make_builder = functools.partial(
       _make_builder,
       builder_cls,
-      overwrite=args.overwrite,
-      fail_if_exists=args.fail_if_exists,
-      data_dir=args.data_dir,
+      overwrite=args.debug.overwrite,
+      fail_if_exists=args.debug.fail_if_exists,
+      data_dir=args.paths.data_dir,
       **builder_kwargs,
   )
 
@@ -203,7 +191,7 @@ def _get_builder_cls_and_kwargs(
   if not has_imports:
     path = _search_script_path(ds_to_build)
     if path is not None:
-      logging.info(…
+      logging.info('Loading dataset %s from path: %s', ds_to_build, path)
       # Dynamically load user dataset script
       # When possible, load from the parent's parent, so module is named
       # "foo.foo_dataset_builder".
@@ -228,7 +216,9 @@ def _get_builder_cls_and_kwargs(
   name, builder_kwargs = tfds.core.naming.parse_builder_name_kwargs(ds_to_build)
   builder_cls = tfds.builder_cls(str(name))
   logging.info(
-…
+      'Loading dataset %s from imports: %s',
+      ds_to_build,
+      builder_cls.__module__,
   )
   return builder_cls, builder_kwargs
 
@@ -308,7 +298,7 @@ def _make_builder(
 
 
 def _download(
-    args: argparse.Namespace,
+    args: Args,
     builder: tfds.core.DatasetBuilder,
 ) -> None:
   """Downloads all files of the given builder."""
@@ -330,7 +320,7 @@ def _download(
   if builder.MAX_SIMULTANEOUS_DOWNLOADS is not None:
     max_simultaneous_downloads = builder.MAX_SIMULTANEOUS_DOWNLOADS
 
-  download_dir = args.download_dir or os.path.join(
+  download_dir = args.paths.download_dir or os.path.join(
      builder._data_dir_root, 'downloads'  # pylint: disable=protected-access
  )
   dl_manager = tfds.download.DownloadManager(
@@ -352,39 +342,39 @@ def _download(
 
 
 def _download_and_prepare(
-    args: argparse.Namespace,
+    args: Args,
     builder: tfds.core.DatasetBuilder,
 ) -> None:
   """Generate a single builder."""
   cli_utils.download_and_prepare(
       builder=builder,
       download_config=_make_download_config(args, dataset_name=builder.name),
-      download_dir=args.download_dir,
-      publish_dir=args.publish_dir,
-      skip_if_published=args.skip_if_published,
-      overwrite=args.overwrite,
-      beam_pipeline_options=args.beam_pipeline_options,
-      nondeterministic_order=args.nondeterministic_order,
+      download_dir=args.paths.download_dir,
+      publish_dir=args.publishing.publish_dir,
+      skip_if_published=args.publishing.skip_if_published,
+      overwrite=args.debug.overwrite,
+      beam_pipeline_options=args.generation.beam_pipeline_options,
+      nondeterministic_order=args.generation.nondeterministic_order,
   )
 
 
 def _make_download_config(
-    args: argparse.Namespace,
+    args: Args,
     dataset_name: str,
 ) -> tfds.download.DownloadConfig:
   """Generate the download and prepare configuration."""
   # Load the download config
-  manual_dir = args.manual_dir
-  if args.add_name_to_manual_dir:
+  manual_dir = args.paths.manual_dir
+  if args.paths.add_name_to_manual_dir:
     manual_dir = manual_dir / dataset_name
 
   kwargs = {}
-  if args.max_shard_size_mb:
-    kwargs['max_shard_size'] = args.max_shard_size_mb << 20
-  if args.num_shards:
-    kwargs['num_shards'] = args.num_shards
-  if args.download_config:
-    kwargs.update(json.loads(args.download_config))
+  if args.generation.max_shard_size_mb:
+    kwargs['max_shard_size'] = args.generation.max_shard_size_mb << 20
+  if args.generation.num_shards:
+    kwargs['num_shards'] = args.generation.num_shards
+  if args.generation.download_config:
+    kwargs.update(json.loads(args.generation.download_config))
 
   if 'download_mode' in kwargs:
     kwargs['download_mode'] = tfds.download.GenerateMode(
@@ -392,15 +382,15 @@ def _make_download_config(
     )
   else:
     kwargs['download_mode'] = tfds.download.GenerateMode.REUSE_DATASET_IF_EXISTS
-  if args.update_metadata_only:
+  if args.generation.update_metadata_only:
     kwargs['download_mode'] = tfds.download.GenerateMode.UPDATE_DATASET_INFO
 
   return tfds.download.DownloadConfig(
-      extract_dir=args.extract_dir,
+      extract_dir=args.paths.extract_dir,
       manual_dir=manual_dir,
-      max_examples_per_split=args.max_examples_per_split,
-      register_checksums=args.register_checksums,
-      force_checksums_validation=args.force_checksums_validation,
+      max_examples_per_split=args.debug.max_examples_per_split,
+      register_checksums=args.generation.register_checksums,
+      force_checksums_validation=args.generation.force_checksums_validation,
       **kwargs,
   )
 
@@ -445,11 +435,10 @@ def _get_config_name(
     else:
       return config_name
   elif config_idx is not None:  # `--config_idx 123`
-    if config_idx…
+    if config_idx >= len(builder_cls.BUILDER_CONFIGS):
      raise ValueError(
-          f'--config_idx {config_idx} greater than number '
-          f'…
-          f'{builder_cls.name}.'
+          f'--config_idx {config_idx} greater than number of configs '
+          f'{len(builder_cls.BUILDER_CONFIGS)} for {builder_cls.name}.'
      )
    else:
      # Use `config.name` to avoid
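
The hunks above replace build.py's argparse-based subparser registration with a frozen `Args` dataclass whose fields are declared with `simple_parsing`. A minimal standalone sketch of the same declaration pattern (hypothetical `DemoArgs`, not part of the package; it reuses only the `simple_parsing` calls visible in the diff):

import dataclasses

import simple_parsing


@dataclasses.dataclass(frozen=True, kw_only=True)
class DemoArgs:
  """Hypothetical command mirroring the field styles used by build.Args."""

  # Positional, repeatable argument, like `positional_datasets` above.
  datasets: list[str] = simple_parsing.field(
      positional=True, nargs='*', default_factory=list
  )
  # Boolean flag, like `overwrite` in cli_utils.DebugOptions.
  overwrite: bool = simple_parsing.flag(default=False)


parser = simple_parsing.ArgumentParser()
parser.add_arguments(DemoArgs, dest='args')
args = parser.parse_args(['mnist', 'cifar10', '--overwrite']).args
print(args.datasets, args.overwrite)  # ['mnist', 'cifar10'] True
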

tensorflow_datasets/scripts/cli/build_test.py
CHANGED
@@ -19,6 +19,7 @@ import dataclasses
 import functools
 import multiprocessing
 import os
+import typing
 from unittest import mock
 
 from etils import epath
@@ -311,7 +312,8 @@ def test_download_only(build):
 )
 def test_make_download_config(args: str, download_config_kwargs):
   args = main._parse_flags(f'tfds build x {args}'.split())
-  actual = build_lib._make_download_config(args, dataset_name='x')
+  cmd_args = typing.cast(build_lib.Args, args.command)
+  actual = build_lib._make_download_config(cmd_args, dataset_name='x')
   # Ignore the beam runner
   actual = actual.replace(beam_runner=None)
   expected = tfds.download.DownloadConfig(**download_config_kwargs)
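
The updated test parses a full command line through the top-level CLI and casts the selected subcommand back to `build.Args`. A sketch of the same round trip, reusing the `main` and `build_lib` import aliases from the test file (the flag value and assertion are illustrative assumptions):

import typing

from tensorflow_datasets.scripts.cli import build as build_lib
from tensorflow_datasets.scripts.cli import main

# The chosen subcommand dataclass lands in `args.command`.
args = main._parse_flags('tfds build x --max_shard_size_mb 1'.split())
cmd_args = typing.cast(build_lib.Args, args.command)

# _make_download_config converts megabytes to bytes with `<< 20`
# (see the build.py hunk above).
actual = build_lib._make_download_config(cmd_args, dataset_name='x')
assert actual.max_shard_size == 1 << 20
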

tensorflow_datasets/scripts/cli/cli_utils.py
CHANGED
@@ -15,11 +15,13 @@
 
 """Utility functions for TFDS CLI."""
 
+import abc
 import argparse
-from collections.abc import Sequence
+from collections.abc import Callable, Sequence
 import dataclasses
 import itertools
 import pathlib
+from typing import TypeVar
 
 from absl import logging
 from absl.flags import argparse_flags
@@ -33,6 +35,8 @@ from tensorflow_datasets.core import naming
 from tensorflow_datasets.core.utils import file_utils
 from tensorflow_datasets.scripts.utils import flag_utils
 
+_DataclassT = TypeVar('_DataclassT')
+
 
 class ArgumentParser(
     argparse_flags.ArgumentParser, simple_parsing.ArgumentParser
@@ -77,6 +81,33 @@ class ArgumentParser(
     return super().parse_known_args(args, namespace)
 
 
+def make_flags_parser(
+    args_dataclass: type[_DataclassT], description: str
+) -> Callable[[list[str]], _DataclassT]:
+  """Returns a function that parses flags and returns the dataclass instance."""
+
+  def _parse_flags(argv: list[str]) -> _DataclassT:
+    """Command lines flag parsing."""
+    parser = ArgumentParser(
+        description=description,
+        allow_abbrev=False,
+    )
+    parser.add_arguments(args_dataclass, dest='args')
+    return parser.parse_args(argv[1:]).args
+
+  return _parse_flags
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args(abc.ABC):
+  """CLI arguments for TFDS CLI commands."""
+
+  @abc.abstractmethod
+  def execute(self) -> None:
+    """Execute the CLI command."""
+    ...
+
+
 @dataclasses.dataclass
 class DatasetInfo:
   """Structure for common string used for formatting.
@@ -127,232 +158,137 @@ class DatasetInfo:
     self.ds_import = ds_import
 
 
-…
-      'Debug & tests',
-      description=(
-          '--pdb Enter post-mortem debugging mode if an exception is raised.'
-      ),
-  )
-  debug_group.add_argument(
-      '--overwrite',
-      action='store_true',
-      help='Delete pre-existing dataset if it exists.',
-  )
-  debug_group.add_argument(
-      '--fail_if_exists',
-      action='store_true',
-      default=False,
-      help='Fails the program if there is a pre-existing dataset.',
-  )
-  debug_group.add_argument(
-      '--max_examples_per_split',
-      type=int,
-      nargs='?',
-      const=1,
-      help=(
-          'When set, only generate the first X examples (default to 1), rather'
-          ' than the full dataset.If set to 0, only execute the'
-          ' `_split_generators` (which download the original data), but skip'
-          ' `_generator_examples`'
-      ),
-  )
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class DebugOptions:
+  """Debug & tests options.
 
+  Attributes:
+    overwrite: If True, delete pre-existing dataset if it exists.
+    fail_if_exists: If True, fails the program if there is a pre-existing
+      dataset.
+    max_examples_per_split: When set, only generate the first X examples
+      (default to 1), rather than the full dataset. If set to 0, only execute
+      the `_split_generators` (which download the original data), but skip
+      `_generator_examples`.
+  """
 
-…
-      '--data_dir',
-      type=epath.Path,
-      default=epath.Path(constants.DATA_DIR),
-      help=(
-          'Where to place datasets. Default to '
-          '`~/tensorflow_datasets/` or `TFDS_DATA_DIR` environement variable.'
-      ),
-  )
-  path_group.add_argument(
-      '--download_dir',
-      type=epath.Path,
-      help='Where to place downloads. Default to `<data_dir>/downloads/`.',
-  )
-  path_group.add_argument(
-      '--extract_dir',
-      type=epath.Path,
-      help='Where to extract files. Default to `<download_dir>/extracted/`.',
-  )
-  path_group.add_argument(
-      '--manual_dir',
-      type=epath.Path,
-      help=(
-          'Where to manually download data (required for some datasets). '
-          'Default to `<download_dir>/manual/`.'
-      ),
-  )
-  path_group.add_argument(
-      '--add_name_to_manual_dir',
-      action='store_true',
-      help=(
-          'If true, append the dataset name to the `manual_dir` (e.g. '
-          '`<download_dir>/manual/<dataset_name>/`. Useful to avoid collisions '
-          'if many datasets are generated.'
-      ),
+  overwrite: bool = simple_parsing.flag(default=False)
+  fail_if_exists: bool = simple_parsing.flag(default=False)
+  max_examples_per_split: int | None = simple_parsing.field(
+      default=None, nargs='?', const=1
   )
 
 
-…
-  generation_group.add_argument(
-      '--download_only',
-      action='store_true',
-      help=(
-          'If True, download all files but do not prepare the dataset. Uses the'
-          ' checksum.tsv to find out what to download. Therefore, this does not'
-          ' work in combination with --register_checksums.'
-      ),
-  )
-  generation_group.add_argument(
-      '--config',
-      '-c',
-      type=str,
-      help=(
-          'Config name to build. Build all configs if not set. Can also be a'
-          ' json of the kwargs forwarded to the config `__init__` (for custom'
-          ' configs).'
-      ),
-  )
-  # We are forced to have 2 flags to avoid ambiguity when config name is
-  # a number (e.g. `voc/2017`)
-  generation_group.add_argument(
-      '--config_idx',
-      type=int,
-      help=(
-          'Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`). '
-          'Mutually exclusive with `--config`.'
-      ),
-  )
-  generation_group.add_argument(
-      '--update_metadata_only',
-      action='store_true',
-      default=False,
-      help=(
-          'If True, existing dataset_info.json is updated with metadata defined'
-          ' in Builder class(es). Datasets must already have been prepared.'
-      ),
-  )
-  generation_group.add_argument(
-      '--download_config',
-      type=str,
-      help=(
-          'A json of the kwargs forwarded to the config `__init__` (for custom'
-          ' DownloadConfigs).'
-      ),
-  )
-  generation_group.add_argument(
-      '--imports',
-      '-i',
-      type=str,
-      help='Comma separated list of module to import to register datasets.',
-  )
-  generation_group.add_argument(
-      '--register_checksums',
-      action='store_true',
-      help='If True, store size and checksum of downloaded files.',
-  )
-  generation_group.add_argument(
-      '--force_checksums_validation',
-      action='store_true',
-      help='If True, raise an error if the checksums are not found.',
-  )
-  # For compatibility with absl.flags (which generates --foo and --nofoo).
-  generation_group.add_argument(
-      '--noforce_checksums_validation',
-      dest='force_checksums_validation',
-      action='store_false',
-      help='If specified, bypass the checks on the checksums.',
-  )
-  generation_group.add_argument(
-      '--beam_pipeline_options',
-      type=str,
-      # nargs='+',
-      help=(
-          'A (comma-separated) list of flags to pass to `PipelineOptions` when'
-          ' preparing with Apache Beam. (see:'
-          ' https://www.tensorflow.org/datasets/beam_datasets). Example:'
-          ' `--beam_pipeline_options=job_name=my-job,project=my-project`'
-      ),
-  )
-  format_values = [f.value for f in file_adapters.FileFormat]
-  generation_group.add_argument(
-      '--file_format',
-      type=str,
-      help=(
-          'File format to which generate the tf-examples. '
-          f'Available values: {format_values} (see `tfds.core.FileFormat`).'
-      ),
-  )
-  generation_group.add_argument(
-      '--max_shard_size_mb', type=int, help='The max shard size in megabytes.'
-  )
-  generation_group.add_argument(
-      '--num_shards', type=int, help='The number of shards to write to.'
-  )
-  generation_group.add_argument(
-      '--num-processes',
-      type=int,
-      default=1,
-      help='Number of parallel build processes.',
-  )
-  generation_group.add_argument(
-      '--nondeterministic_order',
-      action='store_true',
-      default=False,
-      help=(
-          'If True, it will not assure deterministic ordering when writing'
-          ' examples to disk. This might result in quicker dataset preparation.'
-      ),
-  )
-  # For compatibility with absl.flags (which generates --foo and --nofoo).
-  generation_group.add_argument(
-      '--nonondeterministic_order',
-      dest='nondeterministic_order',
-      action='store_false',
-      help=(
-          'If specified, it will assure deterministic ordering when writing'
-          ' examples to disk.'
-      ),
-  )
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class PathOptions:
+  """Path options.
 
+  Attributes:
+    data_dir: Where to place datasets. Default to `~/tensorflow_datasets/` or
+      `TFDS_DATA_DIR` environement variable.
+    download_dir: Where to place downloads. Default to `<data_dir>/downloads/`.
+    extract_dir: Where to extract files. Default to `<download_dir>/extracted/`.
+    manual_dir: Where to manually download data (required for some datasets).
+      Default to `<download_dir>/manual/`.
+    add_name_to_manual_dir: If true, append the dataset name to the `manual_dir`
+      (e.g. `<download_dir>/manual/<dataset_name>/`). Useful to avoid collisions
+      if many datasets are generated.
+  """
 
-…
-  publish_group = parser.add_argument_group(
-      'Publishing',
-      description='Options for publishing successfully created datasets.',
+  data_dir: epath.Path = simple_parsing.field(
+      default=epath.Path(constants.DATA_DIR)
   )
-…
+  download_dir: epath.Path | None = None
+  extract_dir: epath.Path | None = None
+  manual_dir: epath.Path | None = None
+  add_name_to_manual_dir: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class GenerationOptions:
+  """Generation options.
+
+  Attributes:
+    download_only: If True, download all files but do not prepare the dataset.
+      Uses the checksum.tsv to find out what to download. Therefore, this does
+      not work in combination with --register_checksums.
+    config: Config name to build. Build all configs if not set. Can also be a
+      json of the kwargs forwarded to the config `__init__` (for custom
+      configs).
+    config_idx: Config id to build (`builder_cls.BUILDER_CONFIGS[config_idx]`).
+      Mutually exclusive with `--config`. We are forced to have 2 flags to avoid
+      ambiguity when `config` is a number (e.g. `voc/2017`).
+    update_metadata_only: If True, existing dataset_info.json is updated with
+      metadata defined in Builder class(es). Datasets must already have been
+      prepared.
+    download_config: A json of the kwargs forwarded to the config `__init__`
+      (for custom DownloadConfigs).
+    imports: Comma separated list of module to import to register datasets.
+    register_checksums: If True, store size and checksum of downloaded files.
+    force_checksums_validation: If True, raise an error if the checksums are not
+      found. Otherwise, bypass the checks on the checksums
+    beam_pipeline_options: A (comma-separated) list of flags to pass to
+      `PipelineOptions` when preparing with Apache Beam. (see:
+      https://www.tensorflow.org/datasets/beam_datasets). Example:
+      `--beam_pipeline_options=job_name=my-job,project=my-project`
+    file_format: File format to which generate the tf-examples.
+    max_shard_size_mb: The max shard size in megabytes.
+    num_shards: The number of shards to write to.
+    num_processes: Number of parallel build processes.
+    nondeterministic_order: If True, it will not assure deterministic ordering
+      when writing examples to disk. This might result in quicker dataset
+      preparation. Otherwise, it will assure deterministic ordering when writing
+      examples to disk
+  """
+
+  download_only: bool = simple_parsing.flag(default=False)
+  config: str | None = simple_parsing.field(default=None, alias='-c')
+  config_idx: int | None = None
+  update_metadata_only: bool = simple_parsing.flag(default=False)
+  download_config: str | None = None
+  imports: str | None = simple_parsing.field(default=None, alias='-i')
+  register_checksums: bool = simple_parsing.flag(default=False)
+  force_checksums_validation: bool = simple_parsing.flag(default=False)
+  beam_pipeline_options: str | None = None
+  file_format: str | None = simple_parsing.choice(
+      *(file_format.value for file_format in file_adapters.FileFormat),
       default=None,
-      required=False,
-      help=(
-          'Where to optionally publish the dataset after it has been '
-          'generated successfully. Should be the root data dir under which'
-          'datasets are stored. '
-          'If unspecified, dataset will not be published'
-      ),
-  )
-  publish_group.add_argument(
-      '--skip_if_published',
-      action='store_true',
-      default=False,
-      help=(
-          'If the dataset with the same version and config is already '
-          'published, then it will not be regenerated.'
-      ),
   )
+  max_shard_size_mb: int | None = None
+  num_shards: int | None = None
+  num_processes: int = simple_parsing.field(default=1, alias='num-processes')
+  nondeterministic_order: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class PublishingOptions:
+  """Publishing options.
+
+  Attributes:
+    publish_dir: Where to optionally publish the dataset after it has been
+      generated successfully. Should be the root data dir under which datasets
+      are stored. If unspecified, dataset will not be published.
+    skip_if_published: If the dataset with the same version and config is
+      already published, then it will not be regenerated.
+  """
+
+  publish_dir: epath.Path | None = None
+  skip_if_published: bool = simple_parsing.flag(default=False)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class AutomationOptions:
+  """Automation options.
+
+  Attributes:
+    exclude_datasets: If set, generate all datasets except the one defined here.
+      Comma separated list of datasets to exclude.
+    experimental_latest_version: Build the latest Version(experiments=...)
+      available rather than default version.
+  """
+
+  exclude_datasets: str | None = None
+  experimental_latest_version: bool = simple_parsing.flag(default=False)
 
 
 def download_and_prepare(
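
`make_flags_parser` packages the new pattern up for absl: it returns a function that builds the combined `argparse_flags`/`simple_parsing` `ArgumentParser`, registers one `Args` dataclass, and hands back the parsed instance, which is the shape `app.run(..., flags_parser=...)` expects. A minimal sketch with a hypothetical command (not part of the package):

import dataclasses

from absl import app
import simple_parsing

from tensorflow_datasets.scripts.cli import cli_utils


@dataclasses.dataclass(frozen=True, kw_only=True)
class HelloArgs(cli_utils.Args):
  """Hypothetical command: greets the given name."""

  name: str = simple_parsing.field(default='world')

  def execute(self) -> None:
    print(f'Hello, {self.name}!')


_parse_flags = cli_utils.make_flags_parser(HelloArgs, description='Demo')


if __name__ == '__main__':
  # absl passes sys.argv to _parse_flags and forwards its return value,
  # so the callback receives a fully typed HelloArgs instance.
  app.run(lambda args: args.execute(), flags_parser=_parse_flags)
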

tensorflow_datasets/scripts/cli/convert_format.py
CHANGED
@@ -25,19 +25,18 @@ tfds convert_format \
 ```
 """
 
-import argparse
 import dataclasses
-import typing
 
 from etils import epath
 import simple_parsing
 from tensorflow_datasets.core import file_adapters
+from tensorflow_datasets.scripts.cli import cli_utils
 from tensorflow_datasets.scripts.cli import convert_format_utils
 
 
 @dataclasses.dataclass(frozen=True, kw_only=True)
-class Args:
-  """…
+class Args(cli_utils.Args):
+  """Converts a dataset from one file format to another format.
 
   Attributes:
     root_data_dir: Root data dir that contains all datasets. All datasets and
@@ -94,14 +93,3 @@ class Args:
         num_workers=self.num_workers,
         fail_on_error=not self.only_log_errors,
     )
-
-
-def register_subparser(parsers: argparse._SubParsersAction) -> None:
-  """Add subparser for `convert_format` command."""
-  parser = parsers.add_parser(
-      'convert_format',
-      help='Converts a dataset from one file format to another format.',
-  )
-  parser = typing.cast(simple_parsing.ArgumentParser, parser)
-  parser.add_arguments(Args, dest='args')
-  parser.set_defaults(subparser_fn=lambda args: args.args.execute())

tensorflow_datasets/scripts/cli/croissant.py
CHANGED
@@ -26,11 +26,9 @@ tfds build_croissant \
 ```
 """
 
-import argparse
 import dataclasses
 import functools
 import json
-import typing
 
 from etils import epath
 import mlcroissant as mlc
@@ -43,8 +41,8 @@ from tensorflow_datasets.scripts.cli import cli_utils
 
 
 @dataclasses.dataclass(frozen=True, kw_only=True)
-class CmdArgs(simple_parsing.helpers.FrozenSerializable):
-  """…
+class CmdArgs(simple_parsing.helpers.FrozenSerializable, cli_utils.Args):
+  """Prepares a Croissant dataset.
 
   Attributes:
     jsonld: Path to the JSONLD file.
@@ -122,18 +120,10 @@ class CmdArgs(simple_parsing.helpers.FrozenSerializable):
         self.overwrite_version or self.dataset.metadata.version or '1.0.0'
     )
 
-…
-      'build_croissant',
-      help='Prepares a croissant dataset',
-  )
-  parser = typing.cast(simple_parsing.ArgumentParser, parser)
-  parser.add_arguments(CmdArgs, dest='args')
-  parser.set_defaults(
-      subparser_fn=lambda args: prepare_croissant_builders(args.args)
-  )
+  def execute(self) -> None:
+    """Creates Croissant Builders and prepares them."""
+    for record_set_id in self.record_set_ids:
+      prepare_croissant_builder(args=self, record_set_id=record_set_id)
 
 
 def prepare_croissant_builder(
@@ -163,14 +153,3 @@ def prepare_croissant_builder(
       beam_pipeline_options=None,
   )
   return builder
-
-
-def prepare_croissant_builders(args: CmdArgs):
-  """Creates Croissant Builders and prepares them.
-
-  Args:
-    args: CLI arguments.
-  """
-  # Generate each config sequentially.
-  for record_set_id in args.record_set_ids:
-    prepare_croissant_builder(args=args, record_set_id=record_set_id)

tensorflow_datasets/scripts/cli/main.py
CHANGED
@@ -21,13 +21,13 @@ TFDS CLI to help creates and build datasets (e.g. `tfds new my_dataset`,
 See: https://www.tensorflow.org/datasets/cli
 """
 
-import argparse
+import dataclasses
 import logging as python_logging
-from typing import List
 
 from absl import app
 from absl import flags
 from absl import logging
+import simple_parsing
 
 import tensorflow_datasets.public_api as tfds
 
@@ -41,33 +41,60 @@ from tensorflow_datasets.scripts.cli import new
 FLAGS = flags.FLAGS
 
 
-…
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class _DummyCommand:
+  """Dummy command to avoid `command is MISSING` error."""
+
+  pass
+
+
+version_field = simple_parsing.field(
+    action='version',
+    version='TensorFlow Datasets: ' + tfds.__version__,
+    help='The version of the TensorFlow Datasets package.',
+)
+
+
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args(cli_utils.Args):
+  """Tensorflow Datasets CLI tool."""
+
+  version: str = version_field
+  """The version of the TensorFlow Datasets package."""
+
+  dry_run: bool = simple_parsing.flag(default=False)
+  """If True, print the parsed arguments and exit."""
+
+  command: build.Args | new.Args | convert_format.Args | croissant.CmdArgs = (
+      simple_parsing.subparsers(
+          {
+              'build': build.Args,
+              'new': new.Args,
+              'convert_format': convert_format.Args,
+              'build_croissant': croissant.CmdArgs,
+          },
+          default_factory=_DummyCommand,
+      )
   )
-…
+  """The command to execute."""
+
+  def execute(self) -> None:
+    """Run the command."""
+    if self.dry_run:
+      print(self)
+    # When no command is given, print the help message.
+    elif isinstance(self.command, _DummyCommand):
+      _parse_flags(['', '--help'])
+    else:
+      self.command.execute()
+
+
+_parse_flags = cli_utils.make_flags_parser(
+    Args, description='Tensorflow Datasets CLI tool'
+)
 
 
-def main(args: argparse.Namespace) -> None:
+def main(args: Args) -> None:
 
   # From the CLI, all datasets are visible
   tfds.core.visibility.set_availables([
@@ -98,11 +125,7 @@ def main(args: argparse.Namespace) -> None:
     new_stream = tfds.core.utils.tqdm_utils.TqdmStream()
     python_handler.setStream(new_stream)
 
-  if args.dry_run:
-    print(args)
-  else:
-    # Launch the subcommand defined in the subparser (or default to print help)
-    args.subparser_fn(args)
+  args.execute()
 
 
 def launch_cli() -> None:
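
The union-typed `command` field replaces argparse sub-parser registration: `simple_parsing.subparsers` maps each subcommand name to a dataclass and stores whichever one the user selected. A standalone sketch of the dispatch pattern (hypothetical `Greet`/`Add` commands, not part of the package):

import dataclasses

import simple_parsing


@dataclasses.dataclass(frozen=True, kw_only=True)
class Greet:
  name: str = 'world'

  def execute(self) -> None:
    print(f'Hello, {self.name}!')


@dataclasses.dataclass(frozen=True, kw_only=True)
class Add:
  x: int = 0
  y: int = 0

  def execute(self) -> None:
    print(self.x + self.y)


@dataclasses.dataclass(frozen=True, kw_only=True)
class Cli:
  # The parsed subcommand instance lands in this single field.
  command: Greet | Add = simple_parsing.subparsers(
      {'greet': Greet, 'add': Add}, default_factory=Greet
  )


parser = simple_parsing.ArgumentParser()
parser.add_arguments(Cli, dest='args')
cli = parser.parse_args(['add', '--x', '2', '--y', '3']).args
cli.command.execute()  # prints 5
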

tensorflow_datasets/scripts/cli/new.py
CHANGED
@@ -15,13 +15,11 @@
 
 """`tfds new` command."""
 
-import argparse
 import dataclasses
 import os
 import pathlib
 import subprocess
 import textwrap
-import typing
 
 import simple_parsing
 from tensorflow_datasets.core import constants
@@ -33,8 +31,8 @@ from tensorflow_datasets.scripts.cli import cli_utils as utils
 
 
 @dataclasses.dataclass(frozen=True, kw_only=True)
-class Args:
-  """…
+class Args(utils.Args):
+  """Creates a new dataset directory from the template.
 
   Attributes:
     dataset_name: Name of the dataset to be created (in snake_case).
@@ -71,17 +69,6 @@ class Args:
   )
 
 
-def register_subparser(parsers: argparse._SubParsersAction) -> None:
-  """Add subparser for `new` command."""
-  parser = parsers.add_parser(
-      'new',
-      help='Creates a new dataset directory from the template.',
-  )
-  parser = typing.cast(simple_parsing.ArgumentParser, parser)
-  parser.add_arguments(Args, dest='args')
-  parser.set_defaults(subparser_fn=lambda args: args.args.execute())
-
-
 def create_dataset_files(
     dataset_name: str,
     dataset_dir: pathlib.Path,

tensorflow_datasets/scripts/download_and_prepare.py
CHANGED
@@ -15,13 +15,12 @@
 
 r"""Wrapper around `tfds build`."""
 
-import argparse
-from typing import List
+import typing
 
 from absl import app
 from absl import flags
 from absl import logging
-
+from tensorflow_datasets.scripts.cli import build
 from tensorflow_datasets.scripts.cli import main as main_cli
 
 module_import = flags.DEFINE_string('module_import', None, '`--imports` flag.')
@@ -33,7 +32,7 @@ builder_config_id = flags.DEFINE_integer(
 
 
-def _parse_flags(argv: List[str]) -> argparse.Namespace:
+def _parse_flags(argv: list[str]) -> main_cli.Args:
   """Command lines flag parsing."""
   return main_cli._parse_flags([argv[0], 'build'] + argv[1:])  # pylint: disable=protected-access
 
@@ -41,17 +40,18 @@ def _parse_flags(argv: List[str]) -> argparse.Namespace:
 _display_warning = True
 
 
-def main(args: argparse.Namespace) -> None:
+def main(args: main_cli.Args) -> None:
   if _display_warning:
     logging.warning(
         '***`tfds build` should be used instead of `download_and_prepare`.***'
     )
+  cmd_args = typing.cast(build.Args, args.command)
   if module_import.value:
-    args.imports = module_import.value
+    cmd_args.generation.imports = module_import.value
   if dataset.value:
-    args.datasets = [dataset.value]
+    cmd_args.datasets = [dataset.value]
   if builder_config_id.value is not None:
-    args.config_idx = builder_config_id.value
+    cmd_args.generation.config_idx = builder_config_id.value
   main_cli.main(args)

tensorflow_datasets/testing/dataset_builder_testing.py
CHANGED
@@ -105,15 +105,19 @@ class DatasetBuilderTestCase(
     BUILDER_CONFIGS from the class will be tested.
   * DL_EXTRACT_RESULT: `dict[str, str]`, the returned result of mocked
     `download_and_extract` method. The values should be the path of files
-    present in the `fake_examples` directory, relative to
-…
+    present in the `fake_examples` (or `dummy_data`) directory, relative to
+    that directory.
+    If not specified, path to `fake_examples` (or `dummy_data`) will always be
+    returned.
   * DL_EXTRACT_ONLY_RESULT: `dict[str, str]`, the returned result of mocked
     `extract` method. The values should be the path of files present in the
-    `fake_examples` directory, relative to that directory.
+    `fake_examples` (or `dummy_data`) directory, relative to that directory.
+    If not specified:
     will call DownloadManager `extract` method.
   * DL_DOWNLOAD_RESULT: `dict[str, str]`, the returned result of mocked
     `download_and_extract` method. The values should be the path of files
-    present in the `fake_examples` directory, relative to…
+    present in the `fake_examples` (or `dummy_data`) directory, relative to
+    that directory.
     If not specified: will use DL_EXTRACT_RESULT (this is due to backwards
     compatibility and will be removed in the future).
   * EXAMPLE_DIR: `str`, the base directory in in which fake examples are
@@ -167,11 +171,9 @@ class DatasetBuilderTestCase(
         "Assign your DatasetBuilder class to %s.DATASET_CLASS." % name
     )
 
-    cls._available_cm = visibility.set_availables_tmp(
-        [
-            visibility.DatasetType.TFDS_PUBLIC,
-        ]
-    )
+    cls._available_cm = visibility.set_availables_tmp([
+        visibility.DatasetType.TFDS_PUBLIC,
+    ])
     cls._available_cm.__enter__()  # pylint: disable=protected-access
 
   @classmethod
@@ -398,9 +400,9 @@ class DatasetBuilderTestCase(
     err_msg = (
         "Did you forget to record checksums with `--register_checksums` ? See"
         " instructions at:"
-        " https://www.tensorflow.org/datasets/add_dataset#…
-        " want to opt-out of checksums validation, please add…
-        " True` to the `DatasetBuilderTestCase`.\n"
+        " https://www.tensorflow.org/datasets/add_dataset#run_the_generation_code"
+        " If you want to opt-out of checksums validation, please add "
+        " `SKIP_CHECKSUMS = True` to the `DatasetBuilderTestCase`.\n"
     )
     url_infos = self.dataset_class.url_infos
     filepath = self.dataset_class._checksums_path  # pylint: disable=protected-access
@@ -574,15 +576,13 @@ class DatasetBuilderTestCase(
 
     # If configs specified, ensure they are all valid
     if builder.builder_config and builder.builder_config.description:
-      err_msg = textwrap.dedent(
-          """\
+      err_msg = textwrap.dedent("""\
       The BuilderConfig description should be a one-line description of
      the config.
      It shouldn't be the same as `builder.info.description` to avoid
      redundancy. Both `config.description` and `builder.info.description`
      will be displayed in the catalog.
-      """
-      )
+      """)
       ratio = difflib.SequenceMatcher(
           None,
           builder.builder_config.description,
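
The clarified docstring states that `DL_EXTRACT_RESULT` (and the related attributes) are resolved relative to the `fake_examples` (or `dummy_data`) directory. A minimal sketch of a test case that uses them (hypothetical `my_dataset` module and file names):

from tensorflow_datasets import testing
from my_project import my_dataset  # hypothetical builder module


class MyDatasetTest(testing.DatasetBuilderTestCase):
  DATASET_CLASS = my_dataset.Builder
  SPLITS = {'train': 3, 'test': 1}
  # Path is relative to the dataset's dummy_data/ directory.
  DL_EXTRACT_RESULT = {'data': 'extracted_archive'}


if __name__ == '__main__':
  testing.test_main()
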

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tfds-nightly
-Version: 4.9.9.dev202508110045
+Version: 4.9.9.dev202508130045
 Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
 Home-page: https://github.com/tensorflow/datasets
 Download-URL: https://github.com/tensorflow/datasets/tags

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/RECORD
RENAMED
@@ -1965,7 +1965,7 @@ tensorflow_datasets/robotics/rtx/__init__.py,sha256=T5AMbjr-iztrX4Q7k4QhiMNXLOAK
 tensorflow_datasets/robotics/rtx/rtx.py,sha256=8OEnc0_LNsgEJjaySoMwWDjzgiv4hzeobuploMM1cdo,50084
 tensorflow_datasets/scripts/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
 tensorflow_datasets/scripts/convert_format.py,sha256=Kopn3YbNqH-euJaWFsd1nyo56-HDHgq8fDzRViXdx9A,3604
-tensorflow_datasets/scripts/download_and_prepare.py,sha256=…
+tensorflow_datasets/scripts/download_and_prepare.py,sha256=LzbjSnFeo53r1D5oaRgTucHtJiabhBBYodmZsKBpt9s,1875
 tensorflow_datasets/scripts/freeze_dataset_versions.py,sha256=SKC7raxmREqaD5pUnSuy_NHdu9gxTlRxJIOoPoT3cuw,1244
 tensorflow_datasets/scripts/print_num_configs.py,sha256=an80znBHmkycQS4ZEHFQTi1fuFop56tDUx9hgguVcvw,971
 tensorflow_datasets/scripts/replace_fake_images.py,sha256=9L2m3zY0nntaOmsVlNWy6BRJEEytyrMuu5W0LXzLCpA,5223
@@ -1979,19 +1979,19 @@ tensorflow_datasets/scripts/cleanup/refactor_dataset_as_folder.py,sha256=VpEc2Us
 tensorflow_datasets/scripts/cleanup/url_filename_recorder.py,sha256=iLcsT8UgbyNUw00N7bVBC0zCqEuIQ2ndeCCcb4B-OEc,4490
 tensorflow_datasets/scripts/cleanup/url_status_checker.py,sha256=Tr3LtLnGhI8ElDAS-ejmuAU3rs1lmqmYlU4figoVQg0,1967
 tensorflow_datasets/scripts/cli/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
-tensorflow_datasets/scripts/cli/build.py,sha256=…
-tensorflow_datasets/scripts/cli/build_test.py,sha256=…
+tensorflow_datasets/scripts/cli/build.py,sha256=_YetKh9ZZJfo3w6brP5sdzsdCKfVM4HnQLUyX4mbrX4,15002
+tensorflow_datasets/scripts/cli/build_test.py,sha256=K7ho7IRtAty1ZNPLj33Th_nZajYBkXRLA4u3dbElQmo,10615
 tensorflow_datasets/scripts/cli/builder_templates.py,sha256=99SvH3skigkc2Qg737BV2OzhXL_Rgu4az8eVHsxKCLk,7985
 tensorflow_datasets/scripts/cli/builder_templates_test.py,sha256=HBNB-v2zlImKULPI8Webs9hXCkeFmWT29urxav-tDe8,2062
-tensorflow_datasets/scripts/cli/cli_utils.py,sha256=…
+tensorflow_datasets/scripts/cli/cli_utils.py,sha256=sARBmqVP9W6FgTNTPcCN8rUpRqoOAd4WdMksBRnu1Tg,13307
 tensorflow_datasets/scripts/cli/conftest.py,sha256=3PNh_BbR013G4HyLAZOleUXsQ9mICrD03NaKwdHFMXs,1291
-tensorflow_datasets/scripts/cli/convert_format.py,sha256=…
+tensorflow_datasets/scripts/cli/convert_format.py,sha256=ZS7CmWJ-oZ0usO4TB8GKDj9TBJ5MyEO0I9QLRg7eQOw,3797
 tensorflow_datasets/scripts/cli/convert_format_utils.py,sha256=U_q5WVgMNrjBkOc166U4Y_eca5KOS3Xb3jSDjp4XdK4,29078
 tensorflow_datasets/scripts/cli/convert_format_utils_test.py,sha256=9JGNu9TvUWzbuhe6DWwnO3V9Lia5S1Is64re-pceAWE,8823
-tensorflow_datasets/scripts/cli/croissant.py,sha256=…
-tensorflow_datasets/scripts/cli/main.py,sha256=…
+tensorflow_datasets/scripts/cli/croissant.py,sha256=0JFcSCc4nuk-jVnG_dFQkvTWiKuNZDx-OUTC4gjqRwA,5568
+tensorflow_datasets/scripts/cli/main.py,sha256=T4MRQGfNm-FLrp8aZoujQcHY6ctkmX2B6qkErFQUVpA,4238
 tensorflow_datasets/scripts/cli/main_test.py,sha256=3zNaS_2FmxxLoZOX05iJ2riuP4Qv8cx6bhAI56tV8YI,1067
-tensorflow_datasets/scripts/cli/new.py,sha256=…
+tensorflow_datasets/scripts/cli/new.py,sha256=fJok7iV0zauRKwV9n3FLVG57qfiVHYUXVBtqjEApNBY,7386
 tensorflow_datasets/scripts/cli/new_test.py,sha256=USr9So-FPtg8UzaQPPacXn0E1ukDIoew9oYkOn45oik,2655
 tensorflow_datasets/scripts/deployment/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
 tensorflow_datasets/scripts/deployment/copy_dataset_info_files.py,sha256=uLuvwOWqvo1SOLAcxAOHIWBvfbyZQJ7nF79v8lTalKQ,2690
@@ -2122,7 +2122,7 @@ tensorflow_datasets/summarization/media_sum/media_sum.py,sha256=CIhR_cfQb1aEfu9B
 tensorflow_datasets/summarization/summscreen/__init__.py,sha256=ADxohrpUPJjug4r2kGCCJEWZzVD4s2S0smqLfjkc8YY,718
 tensorflow_datasets/summarization/summscreen/summscreen.py,sha256=DfwGr3vsRhOC62ODJ1Sp7-v219bPjJ93KK043YReV7I,884
 tensorflow_datasets/testing/__init__.py,sha256=aSwY_kciK-EZXp1D_JRkuuCJwtbFljGZ72c9YNB6yfE,6049
-tensorflow_datasets/testing/dataset_builder_testing.py,sha256=…
+tensorflow_datasets/testing/dataset_builder_testing.py,sha256=t95l1N8exM7G7qdPMHe1oOlF0E7KpptJBNivLXA3Tqo,25155
 tensorflow_datasets/testing/dataset_builder_testing_test.py,sha256=Nf7Ykg5bY5o9ZatQKrRJhr-qGTtNKle4aZph4rt72i4,1283
 tensorflow_datasets/testing/dataset_collection_builder_testing.py,sha256=tUv2l53rc9GEo4sWvM9OP9r-Ze54dcDakeLQBMS7yos,4825
 tensorflow_datasets/testing/dataset_collection_builder_testing_test.py,sha256=Dw5tACaDjVt9CZi0V84tMAh2JJexrRwWF1N3DID1Mbs,1155
@@ -2468,10 +2468,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
 tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
-tfds_nightly-4.9.9.…
+tfds_nightly-4.9.9.dev202508130045.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
+tfds_nightly-4.9.9.dev202508130045.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+tfds_nightly-4.9.9.dev202508130045.dist-info/METADATA,sha256=MRLubuygIcfrej-GxBNv-7IT4Nyueo9Uqa-rh7TrfOQ,11694
+tfds_nightly-4.9.9.dev202508130045.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tfds_nightly-4.9.9.dev202508130045.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
+tfds_nightly-4.9.9.dev202508130045.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
+tfds_nightly-4.9.9.dev202508130045.dist-info/RECORD,,

{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/WHEEL
RENAMED
File without changes
{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/entry_points.txt
RENAMED
File without changes
{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/licenses/AUTHORS
RENAMED
File without changes
{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/licenses/LICENSE
RENAMED
File without changes
{tfds_nightly-4.9.9.dev202508110045.dist-info → tfds_nightly-4.9.9.dev202508130045.dist-info}/top_level.txt
RENAMED
File without changes