mldataforge 0.1.4__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mldataforge
- Version: 0.1.4
+ Version: 0.1.6
  Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
  Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
  Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -10,10 +10,15 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.12
+ Requires-Dist: brotlicffi
  Requires-Dist: click
  Requires-Dist: datasets
+ Requires-Dist: isal
+ Requires-Dist: lz4
  Requires-Dist: mltiming
  Requires-Dist: mosaicml-streaming
+ Requires-Dist: python-snappy
+ Requires-Dist: zstandard
  Provides-Extra: all
  Requires-Dist: build; extra == 'all'
  Requires-Dist: pytest; extra == 'all'
@@ -0,0 +1,82 @@
+ import brotlicffi as brotli
+ import io
+ 
+ __all__ = ["brotli_open"]
+ 
+ def brotli_open(filename, mode='rb', encoding='utf-8', compress_level=11):
+     return BrotliFile(filename, mode=mode, encoding=encoding, compress_level=compress_level)
+ 
+ class BrotliFile:
+     def __init__(self, filename, mode='rb', encoding='utf-8', compress_level=11):
+         self.filename = filename
+         self.mode = mode
+         self.encoding = encoding
+         self.compress_level = compress_level
+ 
+         self.binary = 'b' in mode
+         file_mode = mode.replace('t', 'b')
+         self.file = open(filename, file_mode)
+ 
+         if 'r' in mode:
+             self._decompressor = brotli.Decompressor()
+             self._stream = self._wrap_reader()
+         elif 'w' in mode:
+             self._compressor = brotli.Compressor(quality=compress_level)
+             self._stream = self._wrap_writer()
+         else:
+             raise ValueError("Unsupported mode (use 'rb', 'wb', 'rt', or 'wt')")
+ 
+     def _wrap_reader(self):
+         buffer = io.BytesIO()
+         while True:
+             chunk = self.file.read(8192)
+             if not chunk:
+                 break
+             buffer.write(self._decompressor.process(chunk))
+         buffer.seek(0)
+         return buffer if self.binary else io.TextIOWrapper(buffer, encoding=self.encoding)
+ 
+     def _wrap_writer(self):
+         return self if self.binary else io.TextIOWrapper(self, encoding=self.encoding)
+ 
+     def write(self, data):
+         if isinstance(data, str):
+             data = data.encode(self.encoding)
+         compressed = self._compressor.process(data)
+         self.file.write(compressed)
+         return len(data)
+ 
+     def flush(self):
+         if hasattr(self, '_compressor'):
+             self.file.write(self._compressor.finish())
+         self.file.flush()
+ 
+     def read(self, *args, **kwargs):
+         return self._stream.read(*args, **kwargs)
+ 
+     def readline(self, *args, **kwargs):
+         return self._stream.readline(*args, **kwargs)
+ 
+     def __iter__(self):
+         return iter(self._stream)
+ 
+     def close(self):
+         try:
+             if hasattr(self._stream, 'flush'):
+                 self._stream.flush()
+         finally:
+             self.file.close()
+ 
+     def __enter__(self):
+         return self
+ 
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+ 
+     def tell(self):
+         return self._stream.tell()
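
A minimal usage sketch of the new Brotli wrapper added above. The module path mldataforge.brotli is an assumption based on the relative import `from .brotli import brotli_open` in the new compression module, and the file name is hypothetical:

    from mldataforge.brotli import brotli_open

    # write a couple of JSON lines through the Brotli compressor ...
    with brotli_open("example.jsonl.br", mode="wb") as f:
        f.write(b'{"text": "hello"}\n')
        f.write(b'{"text": "world"}\n')

    # ... and stream them back as decoded text lines
    with brotli_open("example.jsonl.br", mode="rt") as f:
        for line in f:
            print(line.strip())
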
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["jsonl"]
+ __all__ = ["jsonl_to_mds", "jsonl_to_parquet"]

  @click.group()
  def jsonl():
@@ -13,35 +14,40 @@ def jsonl():
  @jsonl.command()
  @click.argument('output_dir', type=click.Path(exists=False))
  @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @processes_option()
  @buf_size_option()
  @shard_size_option()
- def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size, shard_size):
+ @no_pigz_option()
+ def mds(**kwargs):
+     jsonl_to_mds(**kwargs)
+ def jsonl_to_mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, jsonl_files)
      save_mds(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_dir,
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz),
          shard_size=shard_size,
      )

  @jsonl.command()
  @click.argument('output_file', type=click.Path(exists=False))
  @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
- def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
+ def parquet(**kwargs):
+     jsonl_to_parquet(**kwargs)
+ def jsonl_to_parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
      check_arguments(output_file, overwrite, yes, jsonl_files)
      save_parquet(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_file,
          compression=compression,
          batch_size=batch_size,
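
With this refactor the Click callbacks delegate to plain functions (`jsonl_to_mds`, `jsonl_to_parquet`), so conversions can be driven from Python as well as from the CLI. A hedged sketch, assuming the module lives at mldataforge.commands.convert.jsonl (implied by the `from ...` relative imports) and with purely illustrative argument values:

    from mldataforge.commands.convert.jsonl import jsonl_to_mds

    jsonl_to_mds(
        output_dir="out-mds",             # hypothetical output directory
        jsonl_files=["data.jsonl.zst"],   # hypothetical input files
        compression="zstd",               # one of the MDS_COMPRESSIONS choices
        processes=8,
        overwrite=True,
        yes=True,                         # passed through to check_arguments
        buf_size=2**24,
        shard_size=2**26,                 # illustrative; the accepted form comes from shard_size_option()
        no_pigz=False,
    )
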
@@ -1,9 +1,10 @@
  import click

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["mds"]
+ __all__ = ["mds_to_jsonl", "mds_to_parquet"]

  @click.group()
  def mds():
@@ -12,13 +13,15 @@ def mds():
  @mds.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @no_bulk_option()
- def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk):
+ def jsonl(**kwargs):
+     mds_to_jsonl(**kwargs)
+ def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk):
      check_arguments(output_file, overwrite, yes, mds_directories)
      save_jsonl(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -30,12 +33,14 @@ def jsonl(output_file, mds_directories, compression, processes, overwrite, yes,
  @mds.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @no_bulk_option()
- def parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk):
+ def parquet(**kwargs):
+     mds_to_parquet(**kwargs)
+ def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk):
      check_arguments(output_file, overwrite, yes, mds_directories)
      save_parquet(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["parquet"]
+ __all__ = ["parquet_to_jsonl", "parquet_to_mds"]

  @click.group()
  def parquet():
@@ -13,11 +14,13 @@ def parquet():
  @parquet.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
- def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
+ def jsonl(**kwargs):
+     parquet_to_jsonl(**kwargs)
+ def parquet_to_jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
      check_arguments(output_file, overwrite, yes, parquet_files)
      save_jsonl(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -29,13 +32,16 @@ def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
  @parquet.command()
  @click.argument('output_dir', type=click.Path(exists=False))
  @click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @buf_size_option()
  @shard_size_option()
- def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size, shard_size):
+ @no_pigz_option()
+ def mds(**kwargs):
+     parquet_to_mds(**kwargs)
+ def parquet_to_mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, parquet_files)
      save_mds(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -43,6 +49,6 @@ def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_s
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz=no_pigz),
          shard_size=shard_size,
      )
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ..compression import *
  from ..options import *
  from ..utils import *

- __all__ = ["join"]
+ __all__ = ["join_jsonl", "join_mds", "join_parquet"]

  @click.group()
  def join():
@@ -13,14 +14,16 @@ def join():
  @join.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
- def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
+ def jsonl(**kwargs):
+     join_jsonl(**kwargs)
+ def join_jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
      check_arguments(output_file, overwrite, yes, jsonl_files)
      save_jsonl(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_file,
          compression=compression,
          processes=processes,
@@ -29,14 +32,19 @@ def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
  @join.command()
  @click.argument("output_dir", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @buf_size_option()
  @no_bulk_option()
- def mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk):
+ @shard_size_option()
+ @no_pigz_option()
+ def mds(**kwargs):
+     join_mds(**kwargs)
+ def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, mds_directories)
      save_mds(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -44,17 +52,20 @@ def mds(output_dir, mds_directories, compression, processes, overwrite, yes, bat
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         shard_size=shard_size,
+         pigz=use_pigz(compression, no_pigz),
      )

  @join.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
- def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+ def parquet(**kwargs):
+     join_parquet(**kwargs)
+ def join_parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
      check_arguments(output_file, overwrite, yes, parquet_files)
      save_parquet(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ..compression import *
  from ..options import *
  from ..utils import *

- __all__ = ["split"]
+ __all__ = ["split_jsonl", "split_mds", "split_parquet"]

  @click.group()
  def split():
@@ -15,14 +16,14 @@ def split():
  @prefix_option()
  @output_dir_option()
  @size_hint_option()
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, overwrite, yes):
      save_jsonl(
-         load_dataset("json", data_files=jsonl_files, split="train"),
-         output_file=f"{output_dir}/{prefix}{{part:04d}}.jsonl{extension(compression, jsonl_files[0])}",
+         load_jsonl_files(jsonl_files),
+         output_file=f"{output_dir}/{prefix}{{part:04d}}.jsonl{extension_compression(compression, jsonl_files[0])}",
          compression=compression,
          processes=processes,
          size_hint=size_hint,
@@ -35,7 +36,7 @@ def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, ov
  @prefix_option()
  @output_dir_option()
  @size_hint_option()
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
@@ -43,14 +44,15 @@ def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, ov
  @batch_size_option()
  @no_bulk_option()
  @shard_size_option()
- def mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size):
+ @no_pigz_option()
+ def mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz):
      save_mds(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
          output_dir=f"{output_dir}/{prefix}{{part:04d}}",
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz),
          shard_size=shard_size,
          size_hint=size_hint,
          overwrite=overwrite,
@@ -62,7 +64,7 @@ def mds(mds_directories, prefix, output_dir, size_hint, compression, processes,
  @prefix_option()
  @output_dir_option()
  @size_hint_option()
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
@@ -0,0 +1,162 @@
+ import bz2
+ from isal import igzip as gzip
+ import lz4.frame
+ import lzma
+ import os
+ import shutil
+ from tqdm import tqdm
+ import zstandard
+ 
+ from .brotli import brotli_open
+ from .pigz import pigz_open
+ from .snappy import snappy_open
+ 
+ __all__ = [
+     "JSONL_COMPRESSIONS",
+     "MDS_COMPRESSIONS",
+     "PARQUET_COMPRESSIONS",
+     "determine_compression",
+     "extension_compression",
+     "infer_compression",
+     "open_compression",
+     "pigz_available",
+     "pigz_compress",
+     "use_pigz",
+ ]
+ 
+ JSONL_COMPRESSIONS = dict(
+     default="infer",
+     choices=["infer", "none", "bz2", "gzip", "lz4", "lzma", "pigz", "snappy", "xz", "zstd"],
+ )
+ MDS_COMPRESSIONS = dict(
+     default=None,
+     choices=["none", "brotli", "bz2", "gzip", "pigz", "snappy", "zstd", "sample::brotli", "sample::bz2", "sample::gzip", "sample::snappy", "sample::zstd"],
+ )
+ PARQUET_COMPRESSIONS = dict(
+     default="snappy",
+     choices=["snappy", "brotli", "gzip", "lz4", "zstd"],
+ )
+ 
+ def determine_compression(fmt, file_path, compression="infer", no_pigz=False):
+     if compression == "none":
+         return None
+     if fmt == "jsonl":
+         if compression == "infer":
+             compression = infer_compression(file_path)
+         if compression == "brotli":
+             return "br"
+         return compression
+     if fmt == "mds":
+         if compression == "infer":
+             raise ValueError("Compression cannot be inferred for the MDS format")
+         if compression == "pigz" or (not no_pigz and compression == "gzip" and pigz_available()):
+             return None
+         if compression == "gzip":
+             return "gz"
+         if compression == "brotli":
+             return "br"
+         if compression == "sample::gzip":
+             return "gz"
+         if compression == "sample::brotli":
+             return "br"
+         return compression
+     if fmt == "parquet":
+         return compression
+     raise ValueError(f"Unsupported format: {fmt}")
+ 
+ def extension_compression(compression, file_path):
+     """Get the file extension for the given compression type."""
+     if compression == "infer":
+         compression = infer_compression(file_path)
+     if compression == "brotli":
+         return ".br"
+     if compression == "bz2":
+         return ".bz2"
+     if compression in ("gzip", "pigz"):
+         return ".gz"
+     if compression == "lz4":
+         return ".lz4"
+     if compression == "lzma":
+         return ".lzma"
+     if compression == "snappy":
+         return ".snappy"
+     if compression == "xz":
+         return ".xz"
+     if compression == "zstd":
+         return ".zst"
+     if compression is None or compression == "none":
+         return ""
+     raise ValueError(f"Unsupported compression type: {compression}")
+ 
+ def infer_compression(file_path, pigz=True):
+     """Infer the compression type from the file extension."""
+     extension = os.path.splitext(file_path)[1]
+     if extension.endswith('.br'):
+         return 'brotli'
+     if extension.endswith('.bz2'):
+         return 'bz2'
+     if extension.endswith('.gz'):
+         if pigz and pigz_available():
+             return 'pigz'
+         return 'gzip'
+     if extension.endswith('.lz4'):
+         return 'lz4'
+     if extension.endswith('.lzma'):
+         return 'lzma'
+     if extension.endswith('.snappy'):
+         return 'snappy'
+     if extension.endswith('.xz'):
+         return 'xz'
+     if extension.endswith('.zip'):
+         return 'zip'
+     if extension.endswith('.zst'):
+         return 'zstd'
+     return None
+ 
+ def open_compression(file_path, mode="rt", compression="infer", processes=64):
+     """Open a file, handling compression if necessary."""
+     if compression == "infer":
+         compression = infer_compression(file_path)
+     if compression in ("brotli", "br"):
+         return brotli_open(file_path, mode)
+     if compression in ("gzip", "gz"):
+         return gzip.open(file_path, mode)
+     if compression == "pigz":
+         return pigz_open(file_path, mode, processes=processes) if mode[0] == "w" else gzip.open(file_path, mode)
+     if compression == "bz2":
+         return bz2.open(file_path, mode)
+     if compression == "lz4":
+         return lz4.frame.open(file_path, mode)
+     if compression in ("lzma", "xz"):
+         return lzma.open(file_path, mode)
+     if compression == "snappy":
+         return snappy_open(file_path, mode)
+     if compression == "zstd":
+         return zstandard.open(file_path, mode)
+     if compression is None or compression == "none":
+         return open(file_path, mode)
+     raise ValueError(f"Unsupported compression type: {compression}")
+ 
+ def pigz_available():
+     """Check if pigz is available on the system."""
+     return shutil.which("pigz") is not None
+ 
+ def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
+     """Compress a file using pigz."""
+     size = os.stat(input_file).st_size
+     num_blocks = (size+buf_size-1) // buf_size
+     with open(input_file, "rb") as f_in, pigz_open(output_file, "wb", processes=processes) as f_out:
+         for _ in tqdm(range(num_blocks), desc="Compressing with pigz", unit="block", disable=quiet):
+             buf = f_in.read(buf_size)
+             assert buf
+             f_out.write(buf)
+         buf = f_in.read()
+         assert not buf
+     if not keep:
+         os.remove(input_file)
+         if not quiet:
+             print(f"Removed {input_file}")
+ 
+ def use_pigz(compression, no_pigz=False):
+     """Determine if pigz should be used based on the compression type."""
+     return compression == "pigz" or (not no_pigz and compression == "gzip" and pigz_available())
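
A short sketch of the new compression helpers, which the commands pull in via `from ...compression import *`. The module path mldataforge.compression is an assumption based on those relative imports, and the file name is hypothetical:

    from mldataforge.compression import (
        determine_compression, infer_compression, open_compression, use_pigz,
    )

    path = "corpus.jsonl.zst"                    # hypothetical input file
    print(infer_compression(path))               # -> "zstd" (from the .zst extension)
    print(determine_compression("jsonl", path))  # -> "zstd"

    # open_compression dispatches to the matching codec and yields a file-like object
    with open_compression(path, mode="rt") as f:
        first_record = f.readline()

    # gzip output is delegated to pigz only when the binary is on PATH and --no-pigz is not set
    print(use_pigz("gzip", no_pigz=False))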