mldataforge 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mldataforge-0.1.3/PKG-INFO +62 -0
- mldataforge-0.1.3/README.md +33 -0
- mldataforge-0.1.3/mldataforge/__main__.py +4 -0
- mldataforge-0.1.3/mldataforge/commands/__init__.py +15 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/jsonl.py +3 -1
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/parquet.py +3 -1
- mldataforge-0.1.3/mldataforge/commands/join.py +64 -0
- mldataforge-0.1.3/mldataforge/commands/split.py +58 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/options.py +41 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/pigz.py +7 -1
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/utils.py +70 -27
- {mldataforge-0.1.1 → mldataforge-0.1.3}/pyproject.toml +6 -1
- mldataforge-0.1.1/PKG-INFO +0 -20
- mldataforge-0.1.1/README.md +0 -2
- mldataforge-0.1.1/mldataforge/__main__.py +0 -12
- mldataforge-0.1.1/mldataforge/commands/__init__.py +0 -3
- {mldataforge-0.1.1 → mldataforge-0.1.3}/.gitignore +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/LICENSE +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/__init__.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/mds.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/mds.py +0 -0
mldataforge-0.1.3/PKG-INFO
ADDED

```diff
@@ -0,0 +1,62 @@
+Metadata-Version: 2.4
+Name: mldataforge
+Version: 0.1.3
+Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
+Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
+Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
+Author: Peter Schneider-Kamp
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.12
+Requires-Dist: click
+Requires-Dist: datasets
+Requires-Dist: mltiming
+Requires-Dist: mosaicml-streaming
+Provides-Extra: all
+Requires-Dist: build; extra == 'all'
+Requires-Dist: pytest; extra == 'all'
+Requires-Dist: pytest-dependency; extra == 'all'
+Requires-Dist: twine; extra == 'all'
+Provides-Extra: dev
+Requires-Dist: build; extra == 'dev'
+Requires-Dist: twine; extra == 'dev'
+Provides-Extra: test
+Requires-Dist: pytest; extra == 'test'
+Requires-Dist: pytest-dependency; extra == 'test'
+Description-Content-Type: text/markdown
+
+# mldatasets
+swiss army knife of scripts for transforming and processing datasets for machine learning
+
+## conversion
+Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaiclML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data processing nodes with many cores.
+
+## splitting
+Currently, mldataforge provides space- and time-efficient splitting of JSONL (with or without compression). The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. The splitting function can take an already splitted dataset and re-split it with a different granularity.
+
+## installation and general usage
+```
+pip install mldataforge
+python -m mldataforge --help
+```
+
+## usage example: converting MosaiclML Dataset (MDS) to Parquet format
+```
+Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                 MDS_DIRECTORIES...
+
+Options:
+  --compression [snappy|gzip|zstd]
+                                  Compress the output file (default: snappy).
+  --overwrite                     Overwrite existing path.
+  --yes                           Assume yes to all prompts. Use with caution
+                                  as it will remove files or even entire
+                                  directories without confirmation.
+  --batch-size INTEGER            Batch size for loading data and writing
+                                  files (default: 65536).
+  --no-bulk                       Use a custom space and time-efficient bulk
+                                  reader (only gzip and no compression).
+  --help                          Show this message and exit.
+```
```
mldataforge-0.1.3/README.md
ADDED

```diff
@@ -0,0 +1,33 @@
+# mldatasets
+swiss army knife of scripts for transforming and processing datasets for machine learning
+
+## conversion
+Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaiclML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data processing nodes with many cores.
+
+## splitting
+Currently, mldataforge provides space- and time-efficient splitting of JSONL (with or without compression). The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. The splitting function can take an already splitted dataset and re-split it with a different granularity.
+
+## installation and general usage
+```
+pip install mldataforge
+python -m mldataforge --help
+```
+
+## usage example: converting MosaiclML Dataset (MDS) to Parquet format
+```
+Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                 MDS_DIRECTORIES...
+
+Options:
+  --compression [snappy|gzip|zstd]
+                                  Compress the output file (default: snappy).
+  --overwrite                     Overwrite existing path.
+  --yes                           Assume yes to all prompts. Use with caution
+                                  as it will remove files or even entire
+                                  directories without confirmation.
+  --batch-size INTEGER            Batch size for loading data and writing
+                                  files (default: 65536).
+  --no-bulk                       Use a custom space and time-efficient bulk
+                                  reader (only gzip and no compression).
+  --help                          Show this message and exit.
+```
```
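The "conversion" section above describes streaming samples from one format into another. Below is a minimal sketch of the JSONL-to-MDS path it refers to, assuming only that `datasets` and `mosaicml-streaming` are installed; the input file name and the all-string column mapping are illustrative, and mldataforge's own `save_mds()` additionally handles shard sizes, pigz recompression, and multi-process workers.

```python
# Minimal sketch of a sample-by-sample JSONL -> MDS conversion (not mldataforge's
# exact implementation). File names and column encodings are placeholders.
from datasets import load_dataset
from streaming import MDSWriter

ds = load_dataset("json", data_files=["input.jsonl.gz"], split="train")

# Treat every field as a string column for simplicity; mldataforge infers encodings.
columns = {key: "str" for key in ds.column_names}

with MDSWriter(out="out-mds", columns=columns, compression="zstd") as writer:
    for sample in ds:
        writer.write({key: str(value) for key, value in sample.items()})
```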
{mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/jsonl.py

```diff
@@ -18,7 +18,8 @@ def jsonl():
 @yes_option()
 @processes_option()
 @buf_size_option()
-def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size):
+@shard_size_option()
+def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size, shard_size):
     check_arguments(output_dir, overwrite, yes, jsonl_files)
     save_mds(
         load_dataset("json", data_files=jsonl_files, split="train"),
@@ -27,6 +28,7 @@ def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_siz
         compression=compression,
         buf_size=buf_size,
         pigz=use_pigz(compression),
+        shard_size=shard_size,
     )

 @jsonl.command()
```
{mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/commands/convert/parquet.py

```diff
@@ -34,7 +34,8 @@ def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
 @overwrite_option()
 @yes_option()
 @buf_size_option()
-def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size):
+@shard_size_option()
+def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size, shard_size):
     check_arguments(output_dir, overwrite, yes, parquet_files)
     save_mds(
         load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -43,4 +44,5 @@ def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_s
         compression=compression,
         buf_size=buf_size,
         pigz=use_pigz(compression),
+        shard_size=shard_size,
     )
```
mldataforge-0.1.3/mldataforge/commands/join.py
ADDED

```diff
@@ -0,0 +1,64 @@
+import click
+from datasets import load_dataset
+
+from ..options import *
+from ..utils import *
+
+__all__ = ["join"]
+
+@click.group()
+def join():
+    pass
+
+@join.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+@processes_option()
+@overwrite_option()
+@yes_option()
+def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
+    check_arguments(output_file, overwrite, yes, jsonl_files)
+    save_jsonl(
+        load_dataset("json", data_files=jsonl_files, split="train"),
+        output_file,
+        compression=compression,
+        processes=processes,
+    )
+
+@join.command()
+@click.argument("output_dir", type=click.Path(exists=False), required=True)
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+@processes_option()
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+@buf_size_option()
+@no_bulk_option()
+def mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk):
+    check_arguments(output_dir, overwrite, yes, mds_directories)
+    save_mds(
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        output_dir,
+        processes=processes,
+        compression=compression,
+        buf_size=buf_size,
+        pigz=use_pigz(compression),
+    )
+
+@join.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("snappy", ["snappy", "gzip", "zstd"])
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+    check_arguments(output_file, overwrite, yes, parquet_files)
+    save_parquet(
+        load_dataset("parquet", data_files=parquet_files, split="train"),
+        output_file,
+        compression=compression,
+        batch_size=batch_size,
+    )
```
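Conceptually, the `jsonl` subcommand above concatenates several JSONL inputs into one output file: it loads all `data_files` as a single `datasets` split and streams it back out through `save_jsonl`. A minimal stdlib-only sketch of the same idea is shown below; the file names are hypothetical, and mldataforge itself additionally supports pigz/bz2/xz compression and multi-process writing.

```python
# Sketch of "join jsonl": merge several gzip-compressed JSONL files into one.
# Not mldataforge's implementation; it illustrates the streaming concatenation.
import gzip
import json

inputs = ["part-0000.jsonl.gz", "part-0001.jsonl.gz"]  # hypothetical inputs

with gzip.open("joined.jsonl.gz", "wt", encoding="utf-8") as out:
    for path in inputs:
        with gzip.open(path, "rt", encoding="utf-8") as f:
            for line in f:
                # re-serialize each record to normalize formatting
                out.write(json.dumps(json.loads(line)) + "\n")
```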
mldataforge-0.1.3/mldataforge/commands/split.py
ADDED

```diff
@@ -0,0 +1,58 @@
+import click
+from datasets import load_dataset
+
+from ..options import *
+from ..utils import *
+
+__all__ = ["split"]
+
+@click.group()
+def split():
+    pass
+
+@split.command()
+@click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
+@prefix_option()
+@output_dir_option()
+@size_hint_option()
+@compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+@processes_option()
+@overwrite_option()
+@yes_option()
+def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, overwrite, yes):
+    save_jsonl(
+        load_dataset("json", data_files=jsonl_files, split="train"),
+        output_file=f"{output_dir}/{prefix}{{part:04d}}.jsonl{extension(compression, jsonl_files[0])}",
+        compression=compression,
+        processes=processes,
+        size_hint=size_hint,
+        overwrite=overwrite,
+        yes=yes,
+    )
+
+@split.command()
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@prefix_option()
+@output_dir_option()
+@size_hint_option()
+@compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+@processes_option()
+@overwrite_option()
+@yes_option()
+@buf_size_option()
+@batch_size_option()
+@no_bulk_option()
+@shard_size_option()
+def mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size):
+    save_mds(
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        output_dir=f"{output_dir}/{prefix}{{part:04d}}",
+        processes=processes,
+        compression=compression,
+        buf_size=buf_size,
+        pigz=use_pigz(compression),
+        shard_size=shard_size,
+        size_hint=size_hint,
+        overwrite=overwrite,
+        yes=yes,
+    )
```
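Both subcommands build an output template containing a literal `{part:04d}` placeholder (the doubled braces in the f-strings above), which the save functions later fill in for each part. A small sketch of how that template expands, using the defaults visible in this diff (prefix `part-`, output dir `.`, and the suffix that `extension()` would return for gzip/pigz input):

```python
# How the split output template expands per part; values mirror the defaults
# shown in the diff, the extension is assumed to come from gzip/pigz input.
output_dir = "."
prefix = "part-"
ext = ".gz"

template = f"{output_dir}/{prefix}{{part:04d}}.jsonl{ext}"

print(template.format(part=0))   # ./part-0000.jsonl.gz
print(template.format(part=17))  # ./part-0017.jsonl.gz
```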
{mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/options.py

```diff
@@ -50,6 +50,17 @@ def compression_option(default, choices):
         help=f"Compress the output file (default: {default}).",
     )

+def output_dir_option(default="."):
+    """
+    Option for specifying the output directory.
+    """
+    return click.option(
+        "--output-dir",
+        default=default,
+        type=click.Path(exists=False),
+        help="Output directory.",
+    )
+
 def overwrite_option():
     """
     Option for specifying whether to overwrite existing files.
@@ -60,6 +71,16 @@ def overwrite_option():
         help="Overwrite existing path.",
     )

+def prefix_option(default="part-"):
+    """
+    Option for specifying the prefix for output files.
+    """
+    return click.option(
+        "--prefix",
+        default=default,
+        help=f"Prefix for output files (default: {default}).",
+    )
+
 def processes_option(default=64):
     """
     Option for specifying the number of processes to use.
@@ -70,6 +91,26 @@ def processes_option(default=64):
         help=f"Number of processes to use (default: {default}).",
     )

+def shard_size_option(default=2**26):
+    """
+    Option for specifying the shard size.
+    """
+    return click.option(
+        "--shard-size",
+        default=default,
+        help=f"Shard size for the dataset (default: {default}).",
+    )
+
+def size_hint_option(default=2**26):
+    """
+    Option for specifying the size hint.
+    """
+    return click.option(
+        "--size-hint",
+        default=default,
+        help=f"Size hint for the dataset (default: {default}).",
+    )
+
 def yes_option():
     """
     Option for specifying whether to assume yes to all prompts.
```
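The new factories follow the existing pattern in options.py: each function returns a preconfigured `click.option`, so several commands can share the same flag and default (here 2**26 bytes, i.e. 64 MiB, for both `--shard-size` and `--size-hint`). A self-contained sketch of the pattern, with a hypothetical `demo` command standing in for the real subcommands:

```python
# Minimal sketch of the reusable click option-factory pattern used above.
# The demo command is hypothetical; the real factories live in mldataforge/options.py.
import click

def size_hint_option(default=2**26):  # 2**26 bytes = 64 MiB
    return click.option(
        "--size-hint",
        default=default,
        help=f"Size hint for the dataset (default: {default}).",
    )

@click.command()
@size_hint_option()
def demo(size_hint):
    click.echo(f"size hint: {size_hint} bytes")

if __name__ == "__main__":
    demo()
```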
{mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/pigz.py

```diff
@@ -16,6 +16,7 @@ class PigzFile(object):
         self.encoding = encoding if self.is_text else None
         self._process = None
         self._fw = None
+        self.offset = 0
         args = ["pigz", "-p", str(self.processes), "-c"]
         if self.is_read:
             args.extend(("-d", self.path))
@@ -28,6 +29,7 @@ class PigzFile(object):
         assert self.is_read
         for line in self._process.stdout:
             assert isinstance(line, str) if self.is_text else isinstance(line, bytes)
+            self.offset += len(line)
             yield line
         self._process.wait()
         assert self._process.returncode == 0
@@ -39,6 +41,7 @@ class PigzFile(object):
         assert self._fw is not None
         assert isinstance(line, str) if self.is_text else isinstance(line, bytes)
         self._process.stdin.write(line)
+        self.offset += len(line)

     def close(self):
         if self._process:
@@ -52,7 +55,10 @@ class PigzFile(object):
             self._process = None
             self._fw.close()
             self._fw = None
-
+
+    def tell(self):
+        return self.offset
+
     def __enter__(self):
         return self

```
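The reason for the new offset tracking: `PigzFile` pipes data through a pigz subprocess, so the size of the uncompressed stream cannot be read back from the output file handle. Counting the bytes passed to `write()` (and yielded on read) gives a `tell()` that the updated `save_jsonl` can compare against its size hint. A minimal, hypothetical counting wrapper with the same idea:

```python
# Sketch of the offset-tracking idea behind PigzFile.tell(); the class name and
# usage are illustrative, not part of mldataforge.
import io

class CountingWriter:
    def __init__(self, raw):
        self.raw = raw      # any object with a write() method
        self.offset = 0     # uncompressed bytes written so far

    def write(self, data):
        self.raw.write(data)
        self.offset += len(data)

    def tell(self):
        return self.offset

w = CountingWriter(io.BytesIO())
w.write(b'{"text": "hello"}\n')
print(w.tell())  # 18
```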
{mldataforge-0.1.1 → mldataforge-0.1.3}/mldataforge/utils.py

```diff
@@ -19,6 +19,7 @@ __all__ = [
     "batch_iterable",
     "check_arguments",
     "confirm_overwrite",
+    "extension",
     "load_mds_directories",
     "save_jsonl",
     "save_mds",
@@ -36,8 +37,8 @@ def batch_iterable(iterable, batch_size):
     if batch:
         yield batch

-def check_arguments(output_path, overwrite, yes, input_paths):
-    if not input_paths:
+def check_arguments(output_path, overwrite, yes, input_paths=None):
+    if input_paths is not None and not input_paths:
         raise click.BadArgumentUsage("No input paths provided.")
     if os.path.exists(output_path):
         if os.path.isfile(output_path):
@@ -70,6 +71,24 @@ def _determine_compression(file_path, compression="infer"):
         compression = None
     return compression

+def extension(compression, file_path):
+    """Get the file extension for the given compression type."""
+    if compression == "infer":
+        compression = _infer_compression(file_path)
+    if compression in ("gzip", "pigz"):
+        return ".gz"
+    if compression == "bz2":
+        return ".bz2"
+    if compression == "xz":
+        return ".xz"
+    if compression == "zip":
+        return ".zip"
+    if compression == "zstd":
+        return ".zst"
+    if compression is None:
+        return ""
+    raise ValueError(f"Unsupported compression type: {compression}")
+
 def _infer_mds_encoding(value):
     """Determine the MDS encoding for a given value."""
     if isinstance(value, str):
@@ -157,43 +176,67 @@ def _pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=F
     if not quiet:
         print(f"Removed {input_file}")

-def save_jsonl(iterable, output_file, compression=None, processes=64):
+def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True):
     compression = _determine_compression(output_file, compression)
-… [removed lines 162-166 are truncated in the source diff]
+    f = None
+    part = 0
+    for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
+        if f is None:
+            part_file = output_file.format(part=part)
+            check_arguments(part_file, overwrite, yes)
+            f= _open_jsonl(part_file, mode="wb", compression=compression, processes=processes)
+        f.write(f"{json.dumps(item)}\n".encode("utf-8"))
+        if size_hint is not None and f.tell() >= size_hint:
+            f.close()
+            part += 1
+            f = None
+    if f is not None:
+        f.close()
+
+def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pigz=False, shard_size=None, size_hint=None, overwrite=True, yes=True):
     if compression == "none" or pigz:
         compression = None
     if compression == "gzip":
         compression = "gz"
     writer = None
+    part = 0
+    files = []
     for sample in tqdm(it, desc="Writing to MDS", unit="sample"):
         if writer is None:
+            part_dir = output_dir.format(part=part)
+            check_arguments(part_dir, overwrite, yes)
+            files.append(part_dir)
             columns = {key: _infer_mds_encoding(value) for key, value in sample.items()}
-            writer = MDSWriter(out=… [remainder of the removed line is truncated in the source diff]
+            writer = MDSWriter(out=part_dir, columns=columns, compression=compression, size_limit=shard_size)
+            offset = 0
+        prev = writer.new_shard_size
         writer.write(sample)
+        offset += (writer.new_shard_size - prev) if prev < writer.new_shard_size else writer.new_shard_size
+        if size_hint is not None and offset >= size_hint:
+            writer.finish()
+            part += 1
+            writer = None
     writer.finish()
     if pigz:
-… [removed lines 179-196 are truncated in the source diff]
+        for output_dir in files:
+            index_path = os.path.join(output_dir, "index.json")
+            index = json.load(open(index_path, "rt"))
+            name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
+            file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
+            assert set(file_names) == set(name2info.keys())
+            for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
+                compressed_file_name = file_name + ".gz"
+                file_path = os.path.join(output_dir, file_name)
+                compressed_file_path = os.path.join(output_dir, compressed_file_name)
+                _pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
+                name2info[file_name]["compression"] = "gz"
+                name2info[file_name]["zip_data"] = {
+                    "basename": compressed_file_name,
+                    "bytes": os.stat(compressed_file_path).st_size,
+                    "hashes": {},
+                }
+            json.dump(index, open(index_path, "wt"))
+            print(f"Compressed {output_dir} with pigz")

 def save_parquet(it, output_file, compression=None, batch_size=2**16):
     writer = None
```
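The core of the new `save_jsonl` is a part-rotation loop: the current part file is opened lazily on the first sample and rotated once `tell()` reports at least `size_hint` bytes. A reduced, stdlib-only sketch of that loop (the template and file names are hypothetical; mldataforge's version goes through `_open_jsonl` and supports pigz/bz2/xz as well):

```python
# Sketch of the size_hint-based part rotation added to save_jsonl().
import gzip
import json

def write_parts(items, template="part-{part:04d}.jsonl.gz", size_hint=2**26):
    f, part = None, 0
    for item in items:
        if f is None:
            # open the next part lazily, on the first sample that needs it
            f = gzip.open(template.format(part=part), "wb")
        f.write(f"{json.dumps(item)}\n".encode("utf-8"))
        # GzipFile.tell() reports the uncompressed offset written so far
        if size_hint is not None and f.tell() >= size_hint:
            f.close()
            part += 1
            f = None
    if f is not None:
        f.close()

write_parts(({"id": i} for i in range(1000)), size_hint=4096)
```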
{mldataforge-0.1.1 → mldataforge-0.1.3}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "mldataforge"
-version = "0.1.1"
+version = "0.1.3"
 authors = [
     { name = "Peter Schneider-Kamp" }
 ]
@@ -25,6 +25,11 @@ dependencies = [
     'mosaicml-streaming'
 ]

+[project.optional-dependencies]
+test = ["pytest", "pytest-dependency"]
+dev = ["build", "twine"]
+all = ["build", "twine", "pytest", "pytest-dependency"]
+
 [project.urls]
 "Homepage" = "https://github.com/schneiderkamplab/mldataforge"
 "Bug Tracker" = "https://github.com/schneiderkamplab/mldataforge/issues"
```
mldataforge-0.1.1/PKG-INFO
DELETED

```diff
@@ -1,20 +0,0 @@
-Metadata-Version: 2.4
-Name: mldataforge
-Version: 0.1.1
-Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
-Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
-Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
-Author: Peter Schneider-Kamp
-License-File: LICENSE
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.12
-Requires-Dist: click
-Requires-Dist: datasets
-Requires-Dist: mltiming
-Requires-Dist: mosaicml-streaming
-Description-Content-Type: text/markdown
-
-# mldatasets
-swiss army knife of scripts for transforming and processing datasets for machine learning
```
mldataforge-0.1.1/README.md
DELETED
(content of the deleted file is not shown in the source diff)

Files without changes between 0.1.1 and 0.1.3: .gitignore, LICENSE, mldataforge/commands/convert/__init__.py, mldataforge/commands/convert/mds.py, mldataforge/mds.py