mldataforge 0.0.5__tar.gz → 0.1.1__tar.gz
This diff shows the changes between two publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
- {mldataforge-0.0.5 → mldataforge-0.1.1}/PKG-INFO +1 -1
- mldataforge-0.1.1/mldataforge/commands/convert/jsonl.py +46 -0
- mldataforge-0.1.1/mldataforge/commands/convert/mds.py +45 -0
- mldataforge-0.1.1/mldataforge/commands/convert/parquet.py +46 -0
- mldataforge-0.1.1/mldataforge/mds.py +97 -0
- mldataforge-0.1.1/mldataforge/options.py +81 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/utils.py +77 -55
- {mldataforge-0.0.5 → mldataforge-0.1.1}/pyproject.toml +1 -1
- mldataforge-0.0.5/mldataforge/commands/convert/jsonl/__init__.py +0 -13
- mldataforge-0.0.5/mldataforge/commands/convert/jsonl/mds.py +0 -59
- mldataforge-0.0.5/mldataforge/commands/convert/jsonl/parquet.py +0 -39
- mldataforge-0.0.5/mldataforge/commands/convert/mds/__init__.py +0 -13
- mldataforge-0.0.5/mldataforge/commands/convert/mds/jsonl.py +0 -23
- mldataforge-0.0.5/mldataforge/commands/convert/mds/parquet.py +0 -26
- mldataforge-0.0.5/mldataforge/commands/convert/parquet/__init__.py +0 -13
- mldataforge-0.0.5/mldataforge/commands/convert/parquet/jsonl.py +0 -25
- mldataforge-0.0.5/mldataforge/commands/convert/parquet/mds.py +0 -43
- {mldataforge-0.0.5 → mldataforge-0.1.1}/.gitignore +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/LICENSE +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/README.md +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/__main__.py +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/commands/__init__.py +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/commands/convert/__init__.py +0 -0
- {mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/pigz.py +0 -0
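
The restructuring summarized above replaces the per-conversion command packages of 0.0.5 (`commands/convert/jsonl/`, `.../mds/`, `.../parquet/`) with one module per input format, backed by shared option decorators in `options.py`, shared load/save helpers in `utils.py`, and a new bulk shard reader in `mds.py`. Assuming the commands are still mounted under a `convert` group as in 0.0.5, a JSONL-to-MDS conversion would be invoked roughly as `python -m mldataforge convert jsonl mds out-mds data/*.jsonl.gz --compression pigz` (paths are placeholders).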
{mldataforge-0.0.5 → mldataforge-0.1.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.0.5
+Version: 0.1.1
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
mldataforge-0.1.1/mldataforge/commands/convert/jsonl.py
@@ -0,0 +1,46 @@
+import click
+from datasets import load_dataset
+
+from ...options import *
+from ...utils import *
+
+__all__ = ["jsonl"]
+
+@click.group()
+def jsonl():
+    pass
+
+@jsonl.command()
+@click.argument('output_dir', type=click.Path(exists=False))
+@click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
+@compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+@overwrite_option()
+@yes_option()
+@processes_option()
+@buf_size_option()
+def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size):
+    check_arguments(output_dir, overwrite, yes, jsonl_files)
+    save_mds(
+        load_dataset("json", data_files=jsonl_files, split="train"),
+        output_dir,
+        processes=processes,
+        compression=compression,
+        buf_size=buf_size,
+        pigz=use_pigz(compression),
+    )
+
+@jsonl.command()
+@click.argument('output_file', type=click.Path(exists=False))
+@click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
+@compression_option("snappy", ["snappy", "gzip", "zstd"])
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
+    check_arguments(output_file, overwrite, yes, jsonl_files)
+    save_parquet(
+        load_dataset("json", data_files=jsonl_files, split="train"),
+        output_file,
+        compression=compression,
+        batch_size=batch_size,
+    )
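
The two commands above are thin wrappers around `load_dataset` and the `save_*` helpers from `utils.py`. A minimal sketch, not part of the diff, of the same JSONL-to-Parquet conversion driven directly from Python (file names are placeholders):

```python
# Sketch only: mirrors the body of the `jsonl parquet` command above.
from datasets import load_dataset
from mldataforge.utils import save_parquet

ds = load_dataset("json", data_files=["data/a.jsonl.gz"], split="train")
save_parquet(ds, "out.parquet", compression="snappy", batch_size=2**16)
```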
mldataforge-0.1.1/mldataforge/commands/convert/mds.py
@@ -0,0 +1,45 @@
+import click
+
+from ...options import *
+from ...utils import *
+
+__all__ = ["mds"]
+
+@click.group()
+def mds():
+    pass
+
+@mds.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+@processes_option()
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+@no_bulk_option()
+def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk):
+    check_arguments(output_file, overwrite, yes, mds_directories)
+    save_jsonl(
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        output_file,
+        compression=compression,
+        processes=processes,
+    )
+
+@mds.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("snappy", ["snappy", "gzip", "zstd"])
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+@no_bulk_option()
+def parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk):
+    check_arguments(output_file, overwrite, yes, mds_directories)
+    save_parquet(
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        output_file,
+        compression=compression,
+        batch_size=batch_size,
+    )
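
A corresponding sketch (again, not from the package itself) of the MDS-to-JSONL path, using the bulk reader that the `--no-bulk` flag would disable; the directory path is a placeholder:

```python
# Sketch only: mirrors the `mds jsonl` command with bulk reading enabled.
from mldataforge.utils import load_mds_directories, save_jsonl

samples = load_mds_directories(["./mds-dataset"], bulk=True)  # returns an MDSBulkReader
save_jsonl(samples, "out.jsonl.gz", compression="gzip", processes=8)
```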
mldataforge-0.1.1/mldataforge/commands/convert/parquet.py
@@ -0,0 +1,46 @@
+import click
+from datasets import load_dataset
+
+from ...options import *
+from ...utils import *
+
+__all__ = ["parquet"]
+
+@click.group()
+def parquet():
+    pass
+
+@parquet.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+@processes_option()
+@overwrite_option()
+@yes_option()
+def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
+    check_arguments(output_file, overwrite, yes, parquet_files)
+    save_jsonl(
+        load_dataset("parquet", data_files=parquet_files, split="train"),
+        output_file,
+        compression=compression,
+        processes=processes,
+    )
+
+@parquet.command()
+@click.argument('output_dir', type=click.Path(exists=False))
+@click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
+@compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+@processes_option()
+@overwrite_option()
+@yes_option()
+@buf_size_option()
+def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size):
+    check_arguments(output_dir, overwrite, yes, parquet_files)
+    save_mds(
+        load_dataset("parquet", data_files=parquet_files, split="train"),
+        output_dir,
+        processes=processes,
+        compression=compression,
+        buf_size=buf_size,
+        pigz=use_pigz(compression),
+    )
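
And the Parquet-to-MDS direction as a plain-Python sketch matching the `parquet mds` command body; paths are placeholders and the helper signatures are taken from the `utils.py` diff further below:

```python
# Sketch only: writes an MDS dataset with zstd-compressed shards.
from datasets import load_dataset
from mldataforge.utils import save_mds, use_pigz

ds = load_dataset("parquet", data_files=["data/part-0.parquet"], split="train")
save_mds(ds, "out-mds", processes=8, compression="zstd", pigz=use_pigz("zstd"))
```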
mldataforge-0.1.1/mldataforge/mds.py
@@ -0,0 +1,97 @@
+import gzip
+import json
+from mltiming import timing
+import numpy as np
+import os
+from streaming.base.format.mds.encodings import mds_decode
+from typing import Any, Optional, Generator
+
+class MDSBulkReader:
+    def __init__(
+        self,
+        dirnames: list[str],
+        split: Optional[str],
+    ) -> None:
+        self.shards = []
+        self.samples = 0
+        for dirname in dirnames:
+            if split is not None:
+                dirname = os.path.join(dirname, split)
+            index = json.load(open(os.path.join(dirname, "index.json"), 'rt'))
+            for shard in index["shards"]:
+                basename = shard['raw_data']['basename'] if shard['zip_data'] is None else shard['zip_data']['basename']
+                filename = os.path.join(dirname, basename)
+                self.shards.append({
+                    "filename": filename,
+                    "compression": shard['compression'],
+                })
+                self.samples += shard['samples']
+
+    def __len__(self) -> int:
+        return self.samples
+
+    def __iter__(self) -> Generator[dict[str, Any], None, None]:
+        for shard in self.shards:
+            with MDSShardReader(**shard) as reader:
+                for sample in reader:
+                    yield sample
+
+class MDSShardReader:
+    def __init__(
+        self,
+        filename: str,
+        compression: Optional[str],
+    ) -> None:
+        if compression is None:
+            _open = open
+        elif compression == 'gz':
+            _open = gzip.open
+        else:
+            raise ValueError(f'Unsupported compression type: {compression}. Supported types: None, gzip.')
+        self.fp = _open(filename, "rb")
+        self.samples = np.frombuffer(self.fp.read(4), np.uint32)[0]
+        self.index = np.frombuffer(self.fp.read((1+self.samples)*4), np.uint32)
+        info = json.loads(self.fp.read(self.index[0]-self.fp.tell()))
+        self.column_encodings = info["column_encodings"]
+        self.column_names = info["column_names"]
+        self.column_sizes = info["column_sizes"]
+        assert self.fp.tell() == self.index[0]
+
+    def decode_sample(self, data: bytes) -> dict[str, Any]:
+        sizes = []
+        idx = 0
+        for key, size in zip(self.column_names, self.column_sizes):
+            if size:
+                sizes.append(size)
+            else:
+                size, = np.frombuffer(data[idx:idx + 4], np.uint32)
+                sizes.append(size)
+                idx += 4
+        sample = {}
+        for key, encoding, size in zip(self.column_names, self.column_encodings, sizes):
+            value = data[idx:idx + size]
+            sample[key] = mds_decode(encoding, value)
+            idx += size
+        return sample
+
+    def get_sample_data(self, idx: int) -> bytes:
+        begin, end = self.index[idx:idx+2]
+        assert self.fp.tell() == begin
+        data = self.fp.read(end - begin)
+        assert self.fp.tell() == end
+        assert data
+        return data
+
+    def get_item(self, idx: int) -> dict[str, Any]:
+        data = self.get_sample_data(idx)
+        return self.decode_sample(data)
+
+    def __iter__(self) -> Generator[dict[str, Any], None, None]:
+        for i in range(self.samples):
+            yield self.get_item(i)
+
+    def __enter__(self) -> "MDSShardReader":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.fp.close()
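
`MDSBulkReader` streams every shard of an MDS dataset front to back, which is why `load_mds_directories` in `utils.py` uses it by default. A minimal usage sketch, assuming `./mds-dataset` is a placeholder for a directory whose shards are uncompressed or gzip-compressed (the only formats this reader handles):

```python
from mldataforge.mds import MDSBulkReader

reader = MDSBulkReader(["./mds-dataset"], split=None)
print(len(reader))           # total number of samples summed over all shards
for sample in reader:        # shards are decoded sequentially, sample by sample
    print(sorted(sample))    # each sample is a dict of decoded columns
    break
```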
mldataforge-0.1.1/mldataforge/options.py
@@ -0,0 +1,81 @@
+import click
+
+__alll__ = [
+    "batch_size_option",
+    "buf_size_option",
+    "compression_option",
+    "overwrite_option",
+    "processes_option",
+    "yes_option",
+]
+
+def batch_size_option(default=2**16):
+    """
+    Option for specifying the batch size.
+    """
+    return click.option(
+        "--batch-size",
+        default=default,
+        help=f"Batch size for loading data and writing files (default: {default}).",
+    )
+
+def buf_size_option(default=2**24):
+    """
+    Option for specifying the buffer size.
+    """
+    return click.option(
+        "--buf-size",
+        default=default,
+        help=f"Buffer size for pigz compression (default: {default}).",
+    )
+
+def no_bulk_option():
+    """
+    Option for specifying whether to use a custom space and time-efficient bulk reader (only gzip and no compression).
+    """
+    return click.option(
+        "--no-bulk",
+        is_flag=True,
+        help="Use a custom space and time-efficient bulk reader (only gzip and no compression).",
+    )
+
+def compression_option(default, choices):
+    """
+    Option for specifying the compression type.
+    """
+    return click.option(
+        "--compression",
+        default=default,
+        type=click.Choice(choices, case_sensitive=False),
+        help=f"Compress the output file (default: {default}).",
+    )
+
+def overwrite_option():
+    """
+    Option for specifying whether to overwrite existing files.
+    """
+    return click.option(
+        "--overwrite",
+        is_flag=True,
+        help="Overwrite existing path.",
+    )
+
+def processes_option(default=64):
+    """
+    Option for specifying the number of processes to use.
+    """
+    return click.option(
+        "--processes",
+        default=default,
+        help=f"Number of processes to use (default: {default}).",
+    )
+
+def yes_option():
+    """
+    Option for specifying whether to assume yes to all prompts.
+    """
+    return click.option(
+        "--yes",
+        is_flag=True,
+        help="Assume yes to all prompts. Use with caution as it will remove files or even entire directories without confirmation.",
+    )
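
These factories return ordinary click decorators, so they can be stacked on any command. A hedged sketch of composing them on a hypothetical new command (`export` is not part of the package):

```python
import click
from mldataforge.options import compression_option, overwrite_option, yes_option

@click.command()
@click.argument("output_file", type=click.Path(exists=False))
@compression_option("snappy", ["snappy", "gzip", "zstd"])
@overwrite_option()
@yes_option()
def export(output_file, compression, overwrite, yes):
    # hypothetical command: the decorators supply --compression, --overwrite, --yes
    click.echo(f"would write {output_file} with {compression} compression")
```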
{mldataforge-0.0.5 → mldataforge-0.1.1}/mldataforge/utils.py
@@ -1,29 +1,28 @@
-import atexit
 import bz2
 import click
-from datasets import concatenate_datasets
+from datasets import concatenate_datasets
 import gzip
+import json
 import lzma
 from mltiming import timing
+import pyarrow as pa
+import pyarrow.parquet as pq
 import os
 import shutil
-from streaming import StreamingDataset
-import tempfile
+from streaming import MDSWriter, StreamingDataset
 from tqdm import tqdm
 
+from .mds import MDSBulkReader
 from .pigz import pigz_open
 
 __all__ = [
-    "
-    "
-    "
-    "infer_mds_encoding",
-    "infer_compression",
-    "load_parquet_files",
+    "batch_iterable",
+    "check_arguments",
+    "confirm_overwrite",
     "load_mds_directories",
-    "
-    "
-    "
+    "save_jsonl",
+    "save_mds",
+    "save_parquet",
     "use_pigz",
 ]
 
@@ -37,7 +36,9 @@ def batch_iterable(iterable, batch_size):
     if batch:
         yield batch
 
-def check_overwrite(output_path, overwrite, yes):
+def check_arguments(output_path, overwrite, yes, input_paths):
+    if not input_paths:
+        raise click.BadArgumentUsage("No input paths provided.")
     if os.path.exists(output_path):
         if os.path.isfile(output_path):
             if not overwrite:
@@ -62,31 +63,14 @@ def confirm_overwrite(message):
     if response.lower() != 'yes':
         raise click.Abort()
 
-def create_temp_file():
-    def _cleanup_file(file_path):
-        try:
-            os.remove(file_path)
-        except OSError:
-            pass
-    # Create a named temp file, don't delete right away
-    temp = tempfile.NamedTemporaryFile(delete=False)
-    temp_name = temp.name
-    # Close so others can open it again without conflicts (especially on Windows)
-    temp.close()
-
-    # Schedule its deletion at exit
-    atexit.register(_cleanup_file, temp_name)
-
-    return temp_name
-
-def determine_compression(file_path, compression="infer"):
+def _determine_compression(file_path, compression="infer"):
     if compression == "infer":
-        compression = infer_compression(file_path)
+        compression = _infer_compression(file_path)
     if compression == "none":
         compression = None
     return compression
 
-def infer_mds_encoding(value):
+def _infer_mds_encoding(value):
     """Determine the MDS encoding for a given value."""
     if isinstance(value, str):
         return 'str'
@@ -98,11 +82,11 @@ def infer_mds_encoding(value):
         return 'bool'
     return 'pkl'
 
-def infer_compression(file_path):
+def _infer_compression(file_path):
     """Infer the compression type from the file extension."""
     extension = os.path.splitext(file_path)[1]
     if extension.endswith('.gz'):
-        if pigz_available():
+        if _pigz_available():
            return 'pigz'
         return 'gzip'
     if extension.endswith('.bz2'):
@@ -115,9 +99,11 @@ def infer_compression(file_path):
         return 'zstd'
     return None
 
-def load_mds_directories(mds_directories, split='.', batch_size=2**16):
+def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True):
+    if bulk:
+        return MDSBulkReader(mds_directories, split=split)
     dss = []
-    for mds_directory in
+    for mds_directory in mds_directories:
         ds = StreamingDataset(
             local=mds_directory,
             remote=None,
@@ -136,21 +122,9 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16):
             ds = concatenate_datasets(dsets=dss)
     return ds
 
-def load_parquet_files(parquet_files):
-    dss = []
-    for parquet_file in tqdm(parquet_files, desc="Loading parquet files", unit="file"):
-        ds = load_dataset("parquet", data_files=parquet_file, split="train")
-        dss.append(ds)
-    if len(dss) == 1:
-        ds = dss[0]
-    else:
-        with timing(message=f"Concatenating {len(dss)} datasets"):
-            ds = concatenate_datasets(dsets=dss)
-    return ds
-
-def open_jsonl(file_path, mode="rt", compression="infer", processes=64):
+def _open_jsonl(file_path, mode="rt", compression="infer", processes=64):
     """Open a JSONL file, handling gzip compression if necessary."""
-    compression = determine_compression(file_path, compression)
+    compression = _determine_compression(file_path, compression)
     if compression == "gzip":
         return gzip.open(file_path, mode)
     if compression == "pigz":
@@ -163,11 +137,11 @@ def open_jsonl(file_path, mode="rt", compression="infer", processes=64):
         return open(file_path, mode)
     raise ValueError(f"Unsupported compression type: {compression}")
 
-def pigz_available():
+def _pigz_available():
     """Check if pigz is available on the system."""
     return shutil.which("pigz") is not None
 
-def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
+def _pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
     """Compress a file using pigz."""
     size = os.stat(input_file).st_size
     num_blocks = (size+buf_size-1) // buf_size
@@ -183,6 +157,54 @@ def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
         if not quiet:
             print(f"Removed {input_file}")
 
+def save_jsonl(iterable, output_file, compression=None, processes=64):
+    compression = _determine_compression(output_file, compression)
+    with _open_jsonl(output_file, mode="wb", compression=compression, processes=processes) as f:
+        for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
+            f.write(f"{json.dumps(item)}\n".encode("utf-8"))
+
+def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pigz=False):
+    if compression == "none" or pigz:
+        compression = None
+    if compression == "gzip":
+        compression = "gz"
+    writer = None
+    for sample in tqdm(it, desc="Writing to MDS", unit="sample"):
+        if writer is None:
+            columns = {key: _infer_mds_encoding(value) for key, value in sample.items()}
+            writer = MDSWriter(out=output_dir, columns=columns, compression=compression)
+        writer.write(sample)
+    writer.finish()
+    if pigz:
+        index_path = os.path.join(output_dir, "index.json")
+        index = json.load(open(index_path, "rt"))
+        name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
+        file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
+        assert set(file_names) == set(name2info.keys())
+        for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
+            compressed_file_name = file_name + ".gz"
+            file_path = os.path.join(output_dir, file_name)
+            compressed_file_path = os.path.join(output_dir, compressed_file_name)
+            _pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
+            name2info[file_name]["compression"] = "gz"
+            name2info[file_name]["zip_data"] = {
+                "basename": compressed_file_name,
+                "bytes": os.stat(compressed_file_path).st_size,
+                "hashes": {},
+            }
+        json.dump(index, open(index_path, "wt"))
+        print(f"Compressed {output_dir} with pigz")
+
+def save_parquet(it, output_file, compression=None, batch_size=2**16):
+    writer = None
+    it = tqdm(it, desc="Writing to Parquet", unit="sample")
+    for batch in batch_iterable(it, batch_size):
+        table = pa.Table.from_pylist(batch)
+        if writer is None:
+            writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
+        writer.write_table(table)
+    writer.close()
+
 def use_pigz(compression):
     """Determine if pigz should be used based on the compression type."""
-    return compression == "pigz" or (compression == "gzip" and pigz_available())
+    return compression == "pigz" or (compression == "gzip" and _pigz_available())
mldataforge-0.0.5/mldataforge/commands/convert/jsonl/mds.py
@@ -1,59 +0,0 @@
-import click
-import json
-import os
-from streaming import MDSWriter
-from tqdm import tqdm
-
-from ....utils import check_overwrite, infer_mds_encoding, open_jsonl, pigz_compress, use_pigz
-
-@click.command()
-@click.argument('output_dir', type=click.Path(exists=False))
-@click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
-@click.option('--compression', type=click.Choice(['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'], case_sensitive=False), default=None, help='Compression type for the output dataset (default: None).')
-@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
-@click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
-def mds(output_dir, jsonl_files, processes, compression, overwrite, yes, buf_size):
-    check_overwrite(output_dir, overwrite, yes)
-    if not jsonl_files:
-        raise click.BadArgumentUsage("No JSONL files provided.")
-    with open_jsonl(jsonl_files[0]) as f:
-        sample = json.loads(f.readline())
-    pigz = use_pigz(compression)
-    if compression == "none" or pigz:
-        compression = None
-    if compression == "gzip":
-        compression = "gz"
-    columns = {key: infer_mds_encoding(value) for key, value in sample.items()}
-    lines = 0
-    with MDSWriter(out=output_dir, columns=columns, compression=compression) as writer:
-        for jsonl_file in tqdm(jsonl_files, desc="Processing JSONL files", unit="file"):
-            with open_jsonl(jsonl_file, compression="infer") as f:
-                for line_num, line in enumerate(f, start=1):
-                    try:
-                        item = json.loads(line)
-                        writer.write(item)
-                    except json.JSONDecodeError as e:
-                        print(f"Skipping line {line_num} in {jsonl_file} due to JSON error: {e}")
-                    lines += 1
-    print(f"Wrote {lines} lines from {len(jsonl_files)} files to MDS files in {output_dir}")
-    if pigz:
-        index_path = os.path.join(output_dir, "index.json")
-        index = json.load(open(index_path, "rt"))
-        name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
-        file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
-        assert set(file_names) == set(name2info.keys())
-        for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
-            compressed_file_name = file_name + ".gz"
-            file_path = os.path.join(output_dir, file_name)
-            compressed_file_path = os.path.join(output_dir, compressed_file_name)
-            pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
-            name2info[file_name]["compression"] = "gz"
-            name2info[file_name]["zip_data"] = {
-                "basename": compressed_file_name,
-                "bytes": os.stat(compressed_file_path).st_size,
-                "hashes": {},
-            }
-        json.dump(index, open(index_path, "wt"))
-        print(f"Compressed {output_dir} with pigz")
mldataforge-0.0.5/mldataforge/commands/convert/jsonl/parquet.py
@@ -1,39 +0,0 @@
-import click
-import json
-import pyarrow as pa
-import pyarrow.parquet as pq
-from tqdm import tqdm
-
-from ....utils import batch_iterable, check_overwrite, open_jsonl
-
-def _iterate(jsonl_files):
-    lines = 0
-    for jsonl_file in tqdm(jsonl_files, desc="Processing JSONL files", unit="file"):
-        with open_jsonl(jsonl_file, compression="infer") as f:
-            for line_num, line in enumerate(f, start=1):
-                try:
-                    item = json.loads(line)
-                    yield item
-                except json.JSONDecodeError as e:
-                    print(f"Skipping line {line_num} in {jsonl_file} due to JSON error: {e}")
-                lines += 1
-    print(f"Wrote {lines} lines from {len(jsonl_files)} files")
-
-@click.command()
-@click.argument('output_file', type=click.Path(exists=False))
-@click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
-@click.option("--compression", default="snappy", type=click.Choice(["snappy", "gzip", "zstd"]), help="Compress the Parquet file (default: snappy).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
-@click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories and writing Parquet files (default: 65536).")
-def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
-    check_overwrite(output_file, overwrite, yes)
-    if not jsonl_files:
-        raise click.BadArgumentUsage("No JSONL files provided.")
-    writer = None
-    for batch in batch_iterable(_iterate(jsonl_files), batch_size):
-        table = pa.Table.from_pylist(batch)
-        if writer is None:
-            writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
-        writer.write_table(table)
-    writer.close()
mldataforge-0.0.5/mldataforge/commands/convert/mds/jsonl.py
@@ -1,23 +0,0 @@
-import click
-import json
-from tqdm import tqdm
-
-from ....utils import check_overwrite, create_temp_file, determine_compression, load_mds_directories, open_jsonl
-
-@click.command()
-@click.argument("output_file", type=click.Path(exists=False), required=True)
-@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
-@click.option("--compression", default="infer", type=click.Choice(["none", "infer", "pigz", "gzip", "bz2", "xz"]), help="Compress the output JSONL file (default: infer; pigz for parallel gzip).")
-@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing JSONL files.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
-@click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories (default: 65536).")
-def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size):
-    check_overwrite(output_file, overwrite, yes)
-    if not mds_directories:
-        raise click.BadArgumentUsage("No MDS files provided.")
-    ds = load_mds_directories(mds_directories, batch_size=batch_size)
-    compression = determine_compression(output_file, compression)
-    with open_jsonl(output_file, mode="wb", compression=compression, processes=processes) as f:
-        for item in tqdm(ds, desc="Writing to JSONL", unit="line"):
-            f.write(f"{json.dumps(item)}\n".encode("utf-8"))
mldataforge-0.0.5/mldataforge/commands/convert/mds/parquet.py
@@ -1,26 +0,0 @@
-import click
-import pyarrow as pa
-import pyarrow.parquet as pq
-from tqdm import tqdm
-
-from ....utils import batch_iterable, check_overwrite, load_mds_directories
-
-@click.command()
-@click.argument("output_file", type=click.Path(exists=False), required=True)
-@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
-@click.option("--compression", default="snappy", type=click.Choice(["snappy", "gzip", "zstd"]), help="Compress the Parquet file (default: snappy).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing Parquet files.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
-@click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories and writing Parquet files (default: 65536).")
-def parquet(output_file, mds_directories, compression, overwrite, yes, batch_size):
-    check_overwrite(output_file, overwrite, yes)
-    if not mds_directories:
-        raise click.BadArgumentUsage("No MDS files provided.")
-    ds = load_mds_directories(mds_directories, batch_size=batch_size)
-    writer = None
-    for batch in tqdm(batch_iterable(ds, batch_size), desc="Writing to Parquet", unit="batch", total=(len(ds)+batch_size-1) // batch_size):
-        table = pa.Table.from_pylist(batch)
-        if writer is None:
-            writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
-        writer.write_table(table)
-    writer.close()
mldataforge-0.0.5/mldataforge/commands/convert/parquet/jsonl.py
@@ -1,25 +0,0 @@
-import click
-from mltiming import timing
-
-from ....utils import check_overwrite, create_temp_file, determine_compression, load_parquet_files, pigz_compress
-
-@click.command()
-@click.argument("output_file", type=click.Path(exists=False), required=True)
-@click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
-@click.option("--compression", default="infer", type=click.Choice(["none", "infer", "pigz", "gzip", "bz2", "xz"]), help="Compress the output JSONL file (default: infer; pigz for parallel gzip).")
-@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing JSONL files.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
-@click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
-def jsonl(output_file, parquet_files, compression, processes, overwrite, yes, buf_size):
-    check_overwrite(output_file, overwrite, yes)
-    if not parquet_files:
-        raise click.BadArgumentUsage("No parquet files provided.")
-    ds = load_parquet_files(parquet_files)
-    compression = determine_compression(output_file, compression)
-    compressed_file = None
-    if compression == "pigz":
-        compression, compressed_file, output_file = None, output_file, create_temp_file()
-    ds.to_json(output_file, num_proc=processes, orient="records", lines=True, compression=compression)
-    if compressed_file is not None:
-        pigz_compress(output_file, compressed_file, processes, buf_size, keep=False)
mldataforge-0.0.5/mldataforge/commands/convert/parquet/mds.py
@@ -1,43 +0,0 @@
-import click
-import json
-import os
-from streaming import MDSWriter
-from tqdm import tqdm
-
-from ....utils import check_overwrite, infer_mds_encoding, load_parquet_files, pigz_compress, use_pigz
-
-@click.command()
-@click.argument('output_dir', type=click.Path(exists=False))
-@click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
-@click.option('--compression', type=click.Choice(['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'], case_sensitive=False), default=None, help='Compression type for the output dataset (default: None).')
-@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
-@click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
-@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
-@click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
-def mds(output_dir, parquet_files, processes, compression, overwrite, yes, buf_size):
-    check_overwrite(output_dir, overwrite, yes)
-    if not parquet_files:
-        raise click.BadArgumentUsage("No parquet files provided.")
-    ds = load_parquet_files(parquet_files)
-    pigz = use_pigz(compression)
-    sample = ds[0]
-    if compression == "none" or pigz:
-        compression = None
-    if compression == "gzip":
-        compression = "gz"
-    columns = {key: infer_mds_encoding(value) for key, value in sample.items()}
-    lines = 0
-    with MDSWriter(out=output_dir, columns=columns, compression=compression) as writer:
-        for item in tqdm(ds, desc="Processing samples", unit="sample"):
-            writer.write(item)
-            lines += 1
-    print(f"Wrote {lines} lines from {len(parquet_files)} files to MDS files in {output_dir}")
-    if pigz:
-        file_paths = []
-        for file in os.listdir(output_dir):
-            if file.endswith(".mds"):
-                file_paths.append(os.path.join(output_dir, file))
-        for file_path in tqdm(file_paths, desc="Compressing with pigz", unit="file"):
-            pigz_compress(file_path, file_path + ".gz", processes, buf_size=buf_size, keep=False, quiet=True)
-        output_dir
-        print(f"Compressed {output_dir} with pigz")

The remaining seven files in the summary above (.gitignore, LICENSE, README.md, mldataforge/__main__.py, mldataforge/commands/__init__.py, mldataforge/commands/convert/__init__.py, mldataforge/pigz.py) are carried over without changes.