tfds-nightly 4.9.9.dev202508060045-py3-none-any.whl → 4.9.9.dev202508080045-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorflow_datasets/scripts/cli/convert_format.py +63 -104
- tensorflow_datasets/scripts/cli/croissant.py +5 -10
- tensorflow_datasets/scripts/cli/main.py +18 -3
- tensorflow_datasets/scripts/cli/new.py +46 -44
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/METADATA +1 -1
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/RECORD +11 -11
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/WHEEL +0 -0
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/entry_points.txt +0 -0
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/licenses/AUTHORS +0 -0
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/licenses/LICENSE +0 -0
- {tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/top_level.txt +0 -0
tensorflow_datasets/scripts/cli/convert_format.py CHANGED
@@ -26,94 +26,74 @@ tfds convert_format \
 """

 import argparse
-[old line 29 not captured in this diff view]
+import dataclasses
+import typing

 from etils import epath
+import simple_parsing
 from tensorflow_datasets.core import file_adapters
 from tensorflow_datasets.scripts.cli import convert_format_utils


-[old lines 36-71: argparse parser setup, not captured in this diff view]
-      help='File format to convert the dataset to.',
-      required=True,
-  )
-  parser.add_argument(
-      '--out_dir',
-      type=str,
-      help=(
-          'Path where the converted dataset will be stored. Should include the'
-          ' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
-          ' specified, the converted shards will be stored in the same'
-          ' directory as the input dataset.'
-      ),
-      default='',
-      required=False,
-  )
-  parser.add_argument(
-      '--overwrite',
-      action='store_true',
-      help='Whether to overwrite the output directory if it already exists.',
-  )
-  parser.add_argument(
-      '--use_beam',
-      action='store_true',
-      help='Use beam to convert the dataset.',
-  )
-  parser.add_argument(
-      '--num_workers',
-      type=int,
-      default=8,
-      help=(
-          'Number of workers to use when not using Beam. If `--use_beam` is'
-          ' set, this flag is ignored. If `--num_workers=1`, the conversion'
-          ' will be done sequentially.'
-      ),
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args:
+  """CLI arguments for converting datasets from one file format to another.
+
+  Attributes:
+    root_data_dir: Root data dir that contains all datasets. All datasets and
+      all their configs and versions that are in this folder will be converted.
+    dataset_dir: Path where the dataset to be converted is located. Converts
+      all configs and versions in this folder.
+    dataset_version_dir: Path where the dataset to be converted is located.
+      Should include config and version. Can also be a comma-separated list of
+      paths. If multiple paths are specified, `--out_dir` should not be
+      specified, since each dataset will be converted in the same directory as
+      the input dataset.
+    out_file_format: File format to convert the dataset to.
+    out_dir: Path where the converted dataset will be stored. Datasets will be
+      stored with the same folder structure as the input folder. If `None`, the
+      converted shards will be stored in the same folder as the input datasets.
+    overwrite: Whether to overwrite the output directory if it already exists.
+    use_beam: Use beam to convert the dataset.
+    num_workers: Number of workers to use when not using Beam. If `--use_beam`
+      is set, this flag is ignored. If `--num_workers=1`, the conversion will
+      be done sequentially.
+    only_log_errors: If set, errors during the conversion will be logged as
+      errors and will not crash the conversion. If you are converting a large
+      number of datasets, you might want to set this flag to true.
+  """
+
+  root_data_dir: epath.Path | None = None
+  dataset_dir: epath.Path | None = None
+  dataset_version_dir: list[epath.Path] = simple_parsing.field(
+      default_factory=list,
+      type=lambda dataset_version_dirs_str: [
+          epath.Path(path) for path in dataset_version_dirs_str.split(',')
+      ],
+      nargs='?',
   )
-[old lines 107-108: start of the `--only_log_errors` definition, not captured in this diff view]
-      action='store_true',
-      default=False,
-      help=(
-          'If set, errors during the conversion will be logged as errors and'
-          ' will not crash the conversion. If you are converting a large number'
-          ' of datasets, you might want to set this flag to true.'
-      ),
+  out_file_format: str = simple_parsing.choice(
+      *(file_format.value for file_format in file_adapters.FileFormat),
   )
+  out_dir: epath.Path | None = None
+  overwrite: bool = False
+  use_beam: bool = False
+  num_workers: int = 8
+  only_log_errors: bool = False
+
+  def execute(self) -> None:
+    """Converts a dataset from one file format to another."""
+    convert_format_utils.convert_dataset(
+        out_dir=self.out_dir,
+        out_file_format=self.out_file_format,
+        dataset_dir=self.dataset_dir,
+        root_data_dir=self.root_data_dir,
+        dataset_version_dir=self.dataset_version_dir,
+        overwrite=self.overwrite,
+        use_beam=self.use_beam,
+        num_workers=self.num_workers,
+        fail_on_error=not self.only_log_errors,
+    )


 def register_subparser(parsers: argparse._SubParsersAction) -> None:
@@ -122,27 +102,6 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
       'convert_format',
       help='Converts a dataset from one file format to another format.',
   )
-
-
-  def _parse_dataset_version_dir(
-      dataset_version_dir: str | None,
-  ) -> Sequence[epath.Path] | None:
-    if not dataset_version_dir:
-      return None
-    return [epath.Path(path) for path in dataset_version_dir.split(',')]
-
-  parser.set_defaults(
-      subparser_fn=lambda args: convert_format_utils.convert_dataset(
-          out_dir=epath.Path(args.out_dir) if args.out_dir else None,
-          out_file_format=args.out_file_format,
-          dataset_dir=args.dataset_dir or None,
-          root_data_dir=args.root_data_dir or None,
-          dataset_version_dir=_parse_dataset_version_dir(
-              args.dataset_version_dir
-          ),
-          overwrite=args.overwrite,
-          use_beam=args.use_beam,
-          num_workers=args.num_workers,
-          fail_on_error=not args.only_log_errors,
-      )
-  )
+  parser = typing.cast(simple_parsing.ArgumentParser, parser)
+  parser.add_arguments(Args, dest='args')
+  parser.set_defaults(subparser_fn=lambda args: args.args.execute())
tensorflow_datasets/scripts/cli/croissant.py CHANGED
@@ -125,16 +125,11 @@ class CmdArgs(simple_parsing.helpers.FrozenSerializable):

 def register_subparser(parsers: argparse._SubParsersAction):
   """Add subparser for `convert_format` command."""
-[old lines 128-132 not captured in this diff view: the try/`_parser_class` patching and the `parsers.add_parser('build_croissant', ...)` call]
-      help='Prepares a croissant dataset',
-  )
-  parser = typing.cast(simple_parsing.ArgumentParser, parser)
-  finally:
-    parsers._parser_class = orig_parser_class  # pylint: disable=protected-access
+  parser = parsers.add_parser(
+      'build_croissant',
+      help='Prepares a croissant dataset',
+  )
+  parser = typing.cast(simple_parsing.ArgumentParser, parser)
   parser.add_arguments(CmdArgs, dest='args')
   parser.set_defaults(
       subparser_fn=lambda args: prepare_croissant_builders(args.args)
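Why the old try/finally patching of `parsers._parser_class` could be dropped: argparse's `add_subparsers()` defaults `parser_class` to the type of the parent parser, so when the root parser is a `simple_parsing.ArgumentParser`, `add_parser()` already returns one; the `typing.cast` exists only to inform the type checker. A small sketch under that assumption (names are illustrative):

import typing

import simple_parsing

root = simple_parsing.ArgumentParser(prog='tfds')
subparsers = root.add_subparsers(dest='command')
# add_parser() instantiates type(root) by default, i.e. the
# simple_parsing ArgumentParser subclass, not a plain argparse one.
parser = subparsers.add_parser('demo', help='demo command')
parser = typing.cast(simple_parsing.ArgumentParser, parser)  # type checker only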
tensorflow_datasets/scripts/cli/main.py CHANGED
@@ -28,8 +28,8 @@ from typing import List
 from absl import app
 from absl import flags
 from absl import logging
-from absl.flags import argparse_flags

+import simple_parsing
 import tensorflow_datasets.public_api as tfds

 # Import commands
@@ -46,7 +46,7 @@ def _parse_flags(argv: List[str]) -> argparse.Namespace:
   """Command lines flag parsing."""
   argv = flag_utils.normalize_flags(argv)  # See b/174043007 for context.

-  parser = argparse_flags.ArgumentParser(
+  parser = simple_parsing.ArgumentParser(
       description='Tensorflow Datasets CLI tool',
       allow_abbrev=False,
   )
@@ -67,7 +67,22 @@ def _parse_flags(argv: List[str]) -> argparse.Namespace:
   new.register_subparser(subparser)
   convert_format.register_subparser(subparser)
   croissant.register_subparser(subparser)
-  return parser.parse_args(argv[1:])
+
+  namespace, remaining_argv = parser.parse_known_args(argv[1:])
+
+  # Manually parse absl flags from the remaining arguments.
+  try:
+    # FLAGS requires the program name as the first argument.
+    positionals = FLAGS(argv[:1] + remaining_argv)
+  except flags.Error as e:
+    parser.error(str(e))
+
+  # There should be no positional arguments left, as they should have been
+  # handled by the sub-commands.
+  if len(positionals) > 1:
+    parser.error(f"unrecognized arguments: {' '.join(positionals[1:])}")
+
+  return namespace


 def main(args: argparse.Namespace) -> None:
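The new `_parse_flags` parses in two stages: the parser consumes the arguments it knows via `parse_known_args`, then absl's `FLAGS` object processes the leftovers, so absl flags such as `--alsologtostderr` keep working. A hedged, standalone sketch of the same hand-off (the `--num_workers` flag here is illustrative):

import argparse

from absl import flags

FLAGS = flags.FLAGS


def parse(argv: list[str]) -> argparse.Namespace:
  parser = argparse.ArgumentParser(allow_abbrev=False)
  parser.add_argument('--num_workers', type=int, default=8)
  namespace, remaining = parser.parse_known_args(argv[1:])
  try:
    # FLAGS(...) expects the program name first; it returns the arguments it
    # did not consume as flags, again starting with the program name.
    positionals = FLAGS(argv[:1] + remaining)
  except flags.Error as e:
    parser.error(str(e))  # never returns; raises SystemExit
  if len(positionals) > 1:
    parser.error(f"unrecognized arguments: {' '.join(positionals[1:])}")
  return namespace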
tensorflow_datasets/scripts/cli/new.py CHANGED
@@ -16,12 +16,14 @@
 """`tfds new` command."""

 import argparse
+import dataclasses
 import os
 import pathlib
 import subprocess
 import textwrap
-[old line 23 not captured in this diff view]
+import typing

+import simple_parsing
 from tensorflow_datasets.core import constants
 from tensorflow_datasets.core import dataset_metadata
 from tensorflow_datasets.core import naming
@@ -30,60 +32,60 @@ from tensorflow_datasets.scripts.cli import builder_templates
 from tensorflow_datasets.scripts.cli import cli_utils as utils


-[old lines 33-41: `register_subparser` and argparse setup, not captured in this diff view]
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args:
+  """CLI arguments for creating a new dataset directory.
+
+  Attributes:
+    dataset_name: Name of the dataset to be created (in snake_case).
+    data_format: Format of the input data, which is used to generate a
+      format-specific template.
+    dir: Path where the dataset directory will be created. Defaults to current
+      directory.
+  """
+
+  dataset_name: str = simple_parsing.field(
+      positional=True,
+      # Need to explicitly set metavar for command-line help.
+      metavar='dataset_name',
   )
-[old lines 43-45 not captured in this diff view]
+  data_format: str = simple_parsing.choice(
+      builder_templates.STANDARD,
+      builder_templates.CONLL,
+      builder_templates.CONLLU,
       default=builder_templates.STANDARD,
-      choices=[
-          builder_templates.STANDARD,
-          builder_templates.CONLL,
-          builder_templates.CONLLU,
-      ],
-      help=(
-          'Optional format of the input data, which is used to generate a '
-          'format-specific template.'
-      ),
-  )
-  new_parser.add_argument(
-      '--dir',
-      type=pathlib.Path,
-      default=pathlib.Path.cwd(),
-      help=(
-          'Path where the dataset directory will be created. '
-          'Defaults to current directory.'
-      ),
   )
-[old lines 66-73 not captured in this diff view]
+  dir: pathlib.Path = simple_parsing.field(default_factory=pathlib.Path.cwd)
+
+  def execute(self) -> None:
+    """Creates the dataset directory."""
+    if not naming.is_valid_dataset_and_class_name(self.dataset_name):
+      raise ValueError(
+          'Invalid dataset name. It should be a valid Python class name.'
+      )
+
+    create_dataset_files(
+        dataset_name=self.dataset_name,
+        dataset_dir=self.dir,
+        data_format=self.data_format,
    )

-[old lines 76-79 not captured in this diff view]
+
+def register_subparser(parsers: argparse._SubParsersAction) -> None:
+  """Add subparser for `new` command."""
+  parser = parsers.add_parser(
+      'new',
+      help='Creates a new dataset directory from the template.',
   )
+  parser = typing.cast(simple_parsing.ArgumentParser, parser)
+  parser.add_arguments(Args, dest='args')
+  parser.set_defaults(subparser_fn=lambda args: args.args.execute())


 def create_dataset_files(
     dataset_name: str,
     dataset_dir: pathlib.Path,
-    data_format: [remainder of old line not captured in this diff view]
+    data_format: str | None = None,
 ) -> None:
   """Creates the dataset files."""
   # Creates the root directory
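Two field styles in the new `Args` are worth noting: `positional=True` turns a dataclass field into a positional CLI argument, and `default_factory` defers `pathlib.Path.cwd()` to parse time instead of import time. A minimal sketch (assuming `simple-parsing` and Python 3.10+; `DemoNewArgs` is illustrative, not TFDS code):

import dataclasses
import pathlib

import simple_parsing


@dataclasses.dataclass(frozen=True, kw_only=True)
class DemoNewArgs:
  # Positional on the command line; metavar keeps --help readable.
  dataset_name: str = simple_parsing.field(
      positional=True, metavar='dataset_name'
  )
  # Evaluated when the parser builds the dataclass, not at import time.
  dir: pathlib.Path = simple_parsing.field(default_factory=pathlib.Path.cwd)


parser = simple_parsing.ArgumentParser()
parser.add_arguments(DemoNewArgs, dest='args')
ns = parser.parse_args(['my_dataset'])
print(ns.args.dataset_name, ns.args.dir)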
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: tfds-nightly
-Version: 4.9.9.dev202508060045
+Version: 4.9.9.dev202508080045
 Summary: tensorflow/datasets is a library of datasets ready to use with TensorFlow.
 Home-page: https://github.com/tensorflow/datasets
 Download-URL: https://github.com/tensorflow/datasets/tags
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/RECORD
RENAMED
@@ -1985,13 +1985,13 @@ tensorflow_datasets/scripts/cli/builder_templates.py,sha256=99SvH3skigkc2Qg737BV
 tensorflow_datasets/scripts/cli/builder_templates_test.py,sha256=HBNB-v2zlImKULPI8Webs9hXCkeFmWT29urxav-tDe8,2062
 tensorflow_datasets/scripts/cli/cli_utils.py,sha256=rMYMcQj1w46OTOeMyp3qf4y9v7ArOGh6u5NaCjBXal8,12313
 tensorflow_datasets/scripts/cli/conftest.py,sha256=cmvCCV-efT5ZXYPkCSGS1OxoKNPAfSsLcFTfYfe61S0,1233
-tensorflow_datasets/scripts/cli/convert_format.py,sha256=[old hash not captured in this diff view]
+tensorflow_datasets/scripts/cli/convert_format.py,sha256=02RDZQQCuXf_XFFpx0gmRVkYyJg534kY0fZwGKxtUL4,4197
 tensorflow_datasets/scripts/cli/convert_format_utils.py,sha256=U_q5WVgMNrjBkOc166U4Y_eca5KOS3Xb3jSDjp4XdK4,29078
 tensorflow_datasets/scripts/cli/convert_format_utils_test.py,sha256=9JGNu9TvUWzbuhe6DWwnO3V9Lia5S1Is64re-pceAWE,8823
-tensorflow_datasets/scripts/cli/croissant.py,sha256=[old hash not captured in this diff view]
-tensorflow_datasets/scripts/cli/main.py,sha256=[old hash not captured in this diff view]
+tensorflow_datasets/scripts/cli/croissant.py,sha256=6jzmOXt_i7aeJHUVX7_zpRRMEXId_PzU24zUDdExRUs,6112
+tensorflow_datasets/scripts/cli/main.py,sha256=FJJwyUtM1N9gNDsxGm850m5ejzzJ9mgESNW9Xz8E9_I,4383
 tensorflow_datasets/scripts/cli/main_test.py,sha256=3zNaS_2FmxxLoZOX05iJ2riuP4Qv8cx6bhAI56tV8YI,1067
-tensorflow_datasets/scripts/cli/new.py,sha256=[old hash not captured in this diff view]
+tensorflow_datasets/scripts/cli/new.py,sha256=x_GQSEVva1XuMvFwL3rANjDxviwZviXKHCICY7P30Jc,7803
 tensorflow_datasets/scripts/cli/new_test.py,sha256=USr9So-FPtg8UzaQPPacXn0E1ukDIoew9oYkOn45oik,2655
 tensorflow_datasets/scripts/deployment/__init__.py,sha256=Z8UWkv0wbzS4AzaLgSpYVGApYv5j57RWY0vN5Z553BQ,613
 tensorflow_datasets/scripts/deployment/copy_dataset_info_files.py,sha256=uLuvwOWqvo1SOLAcxAOHIWBvfbyZQJ7nF79v8lTalKQ,2690
@@ -2468,10 +2468,10 @@ tensorflow_datasets/vision_language/wit/wit_test.py,sha256=PXS8DMNW-MDrT2p5oy4Ic
 tensorflow_datasets/vision_language/wit_kaggle/__init__.py,sha256=vGwSGeM8WE4Q-l0-eEE1sBojmk6YT0l1OO60AWa4Q40,719
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle.py,sha256=q-vX_FBzIwsFxL4sY9vuyQ3UQD2PLM4yhUR4U6l-qao,16903
 tensorflow_datasets/vision_language/wit_kaggle/wit_kaggle_test.py,sha256=ZymHT1NkmD-pUnh3BmM3_g30c5afsWYnmqDD9dVyDSA,1778
-[old lines 2471-2477: the seven tfds_nightly-4.9.9.dev202508060045.dist-info entries, truncated in this diff view]
+tfds_nightly-4.9.9.dev202508080045.dist-info/licenses/AUTHORS,sha256=nvBG4WwfgjuOu1oZkuQKw9kg7X6rve679ObS-YDDmXg,309
+tfds_nightly-4.9.9.dev202508080045.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+tfds_nightly-4.9.9.dev202508080045.dist-info/METADATA,sha256=Za3dYGEQJMn0dTM0uUOEd_2jik8ANJg6Lmtp94-hKQ0,11694
+tfds_nightly-4.9.9.dev202508080045.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tfds_nightly-4.9.9.dev202508080045.dist-info/entry_points.txt,sha256=eHEL7nF5y1uCY2FgkuYIdE062epJXlAQTSdq89px4p4,73
+tfds_nightly-4.9.9.dev202508080045.dist-info/top_level.txt,sha256=bAevmk9209s_oxVZVlN6hSDIVS423qrMQvmcWSvW4do,20
+tfds_nightly-4.9.9.dev202508080045.dist-info/RECORD,,
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/WHEEL
RENAMED (file without changes)
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/entry_points.txt
RENAMED (file without changes)
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/licenses/AUTHORS
RENAMED (file without changes)
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/licenses/LICENSE
RENAMED (file without changes)
{tfds_nightly-4.9.9.dev202508060045.dist-info → tfds_nightly-4.9.9.dev202508080045.dist-info}/top_level.txt
RENAMED (file without changes)