mldataforge 0.0.5__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. {mldataforge-0.0.5 → mldataforge-0.1.0}/PKG-INFO +1 -1
  2. mldataforge-0.1.0/mldataforge/commands/convert/jsonl.py +46 -0
  3. mldataforge-0.1.0/mldataforge/commands/convert/mds.py +43 -0
  4. mldataforge-0.1.0/mldataforge/commands/convert/parquet.py +46 -0
  5. mldataforge-0.1.0/mldataforge/options.py +71 -0
  6. {mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/utils.py +73 -54
  7. {mldataforge-0.0.5 → mldataforge-0.1.0}/pyproject.toml +1 -1
  8. mldataforge-0.0.5/mldataforge/commands/convert/jsonl/__init__.py +0 -13
  9. mldataforge-0.0.5/mldataforge/commands/convert/jsonl/mds.py +0 -59
  10. mldataforge-0.0.5/mldataforge/commands/convert/jsonl/parquet.py +0 -39
  11. mldataforge-0.0.5/mldataforge/commands/convert/mds/__init__.py +0 -13
  12. mldataforge-0.0.5/mldataforge/commands/convert/mds/jsonl.py +0 -23
  13. mldataforge-0.0.5/mldataforge/commands/convert/mds/parquet.py +0 -26
  14. mldataforge-0.0.5/mldataforge/commands/convert/parquet/__init__.py +0 -13
  15. mldataforge-0.0.5/mldataforge/commands/convert/parquet/jsonl.py +0 -25
  16. mldataforge-0.0.5/mldataforge/commands/convert/parquet/mds.py +0 -43
  17. {mldataforge-0.0.5 → mldataforge-0.1.0}/.gitignore +0 -0
  18. {mldataforge-0.0.5 → mldataforge-0.1.0}/LICENSE +0 -0
  19. {mldataforge-0.0.5 → mldataforge-0.1.0}/README.md +0 -0
  20. {mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/__main__.py +0 -0
  21. {mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/commands/__init__.py +0 -0
  22. {mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/commands/convert/__init__.py +0 -0
  23. {mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/pigz.py +0 -0

{mldataforge-0.0.5 → mldataforge-0.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mldataforge
- Version: 0.0.5
+ Version: 0.1.0
  Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
  Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
  Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues

mldataforge-0.1.0/mldataforge/commands/convert/jsonl.py
@@ -0,0 +1,46 @@
+ import click
+ from datasets import load_dataset
+
+ from ...options import *
+ from ...utils import *
+
+ __all__ = ["jsonl"]
+
+ @click.group()
+ def jsonl():
+     pass
+
+ @jsonl.command()
+ @click.argument('output_dir', type=click.Path(exists=False))
+ @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
+ @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @overwrite_option()
+ @yes_option()
+ @processes_option()
+ @buf_size_option()
+ def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size):
+     check_arguments(output_dir, overwrite, yes, jsonl_files)
+     save_mds(
+         load_dataset("json", data_files=jsonl_files, split="train"),
+         output_dir,
+         processes=processes,
+         compression=compression,
+         buf_size=buf_size,
+         pigz=use_pigz(compression),
+     )
+
+ @jsonl.command()
+ @click.argument('output_file', type=click.Path(exists=False))
+ @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
+ @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
+     check_arguments(output_file, overwrite, yes, jsonl_files)
+     save_parquet(
+         load_dataset("json", data_files=jsonl_files, split="train"),
+         output_file,
+         compression=compression,
+         batch_size=batch_size,
+     )

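A minimal sketch of exercising the new jsonl group programmatically via click's test runner; the file names are illustrative, and input.jsonl must already exist because the arguments are declared with click.Path(exists=True):

    from click.testing import CliRunner
    from mldataforge.commands.convert.jsonl import jsonl

    # Convert a (hypothetical) input.jsonl into a zstd-compressed Parquet file.
    runner = CliRunner()
    result = runner.invoke(jsonl, [
        "parquet", "out.parquet", "input.jsonl",
        "--compression", "zstd",
        "--yes",
    ])
    print(result.exit_code, result.output)
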
mldataforge-0.1.0/mldataforge/commands/convert/mds.py
@@ -0,0 +1,43 @@
+ import click
+
+ from ...options import *
+ from ...utils import *
+
+ __all__ = ["mds"]
+
+ @click.group()
+ def mds():
+     pass
+
+ @mds.command()
+ @click.argument("output_file", type=click.Path(exists=False), required=True)
+ @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @processes_option()
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size):
+     check_arguments(output_file, overwrite, yes, mds_directories)
+     save_jsonl(
+         load_mds_directories(mds_directories, batch_size=batch_size),
+         output_file,
+         compression=compression,
+         processes=processes,
+     )
+
+ @mds.command()
+ @click.argument("output_file", type=click.Path(exists=False), required=True)
+ @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+     check_arguments(output_file, overwrite, yes, parquet_files)
+     save_parquet(
+         load_mds_directories(parquet_files, batch_size=batch_size),
+         output_file,
+         compression=compression,
+         batch_size=batch_size,
+     )

mldataforge-0.1.0/mldataforge/commands/convert/parquet.py
@@ -0,0 +1,46 @@
+ import click
+ from datasets import load_dataset
+
+ from ...options import *
+ from ...utils import *
+
+ __all__ = ["parquet"]
+
+ @click.group()
+ def parquet():
+     pass
+
+ @parquet.command()
+ @click.argument("output_file", type=click.Path(exists=False), required=True)
+ @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @processes_option()
+ @overwrite_option()
+ @yes_option()
+ def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
+     check_arguments(output_file, overwrite, yes, parquet_files)
+     save_jsonl(
+         load_dataset("parquet", data_files=parquet_files, split="train"),
+         output_file,
+         compression=compression,
+         processes=processes,
+     )
+
+ @parquet.command()
+ @click.argument('output_dir', type=click.Path(exists=False))
+ @click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
+ @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @processes_option()
+ @overwrite_option()
+ @yes_option()
+ @buf_size_option()
+ def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size):
+     check_arguments(output_dir, overwrite, yes, parquet_files)
+     save_mds(
+         load_dataset("parquet", data_files=parquet_files, split="train"),
+         output_dir,
+         processes=processes,
+         compression=compression,
+         buf_size=buf_size,
+         pigz=use_pigz(compression),
+     )

mldataforge-0.1.0/mldataforge/options.py
@@ -0,0 +1,71 @@
+ import click
+
+ __alll__ = [
+     "batch_size_option",
+     "buf_size_option",
+     "compression_option",
+     "overwrite_option",
+     "processes_option",
+     "yes_option",
+ ]
+
+ def batch_size_option(default=2**16):
+     """
+     Option for specifying the batch size.
+     """
+     return click.option(
+         "--batch-size",
+         default=default,
+         help=f"Batch size for loading data and writing files (default: {default}).",
+     )
+
+ def buf_size_option(default=2**24):
+     """
+     Option for specifying the buffer size.
+     """
+     return click.option(
+         "--buf-size",
+         default=default,
+         help=f"Buffer size for pigz compression (default: {default}).",
+     )
+
+ def compression_option(default, choices):
+     """
+     Option for specifying the compression type.
+     """
+     return click.option(
+         "--compression",
+         default=default,
+         type=click.Choice(choices, case_sensitive=False),
+         help=f"Compress the output file (default: {default}).",
+     )
+
+ def overwrite_option():
+     """
+     Option for specifying whether to overwrite existing files.
+     """
+     return click.option(
+         "--overwrite",
+         is_flag=True,
+         help="Overwrite existing path.",
+     )
+
+ def processes_option(default=64):
+     """
+     Option for specifying the number of processes to use.
+     """
+     return click.option(
+         "--processes",
+         default=default,
+         help=f"Number of processes to use (default: {default}).",
+     )
+
+ def yes_option():
+     """
+     Option for specifying whether to assume yes to all prompts.
+     """
+     return click.option(
+         "--yes",
+         is_flag=True,
+         help="Assume yes to all prompts. Use with caution as it will remove files or even entire directories without confirmation.",
+     )

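The factories above return ordinary click decorators, so commands stack them directly; the converters under commands/convert/ in this release follow exactly this pattern. A minimal sketch with a hypothetical demo command:

    import click
    from mldataforge.options import compression_option, overwrite_option, yes_option

    @click.command()
    @click.argument("output_file", type=click.Path(exists=False))
    @compression_option("snappy", ["snappy", "gzip", "zstd"])
    @overwrite_option()
    @yes_option()
    def demo(output_file, compression, overwrite, yes):
        # Echo the resolved options instead of writing anything.
        click.echo(f"{output_file}: compression={compression}, overwrite={overwrite}, yes={yes}")

    if __name__ == "__main__":
        demo()
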
{mldataforge-0.0.5 → mldataforge-0.1.0}/mldataforge/utils.py
@@ -1,29 +1,27 @@
- import atexit
  import bz2
  import click
- from datasets import concatenate_datasets, load_dataset
+ from datasets import concatenate_datasets
  import gzip
+ import json
  import lzma
  from mltiming import timing
+ import pyarrow as pa
+ import pyarrow.parquet as pq
  import os
  import shutil
- from streaming import StreamingDataset
- import tempfile
+ from streaming import MDSWriter, StreamingDataset
  from tqdm import tqdm

  from .pigz import pigz_open

  __all__ = [
-     "check_overwrite",
-     "create_temp_file",
-     "determine_compression",
-     "infer_mds_encoding",
-     "infer_compression",
-     "load_parquet_files",
+     "batch_iterable",
+     "check_arguments",
+     "confirm_overwrite",
      "load_mds_directories",
-     "open_jsonl",
-     "pigz_available",
-     "pigz_compress",
+     "save_jsonl",
+     "save_mds",
+     "save_parquet",
      "use_pigz",
  ]

@@ -37,7 +35,9 @@ def batch_iterable(iterable, batch_size):
      if batch:
          yield batch

- def check_overwrite(output_path, overwrite, yes):
+ def check_arguments(output_path, overwrite, yes, input_paths):
+     if not input_paths:
+         raise click.BadArgumentUsage("No input paths provided.")
      if os.path.exists(output_path):
          if os.path.isfile(output_path):
              if not overwrite:
@@ -62,31 +62,14 @@ def confirm_overwrite(message):
      if response.lower() != 'yes':
          raise click.Abort()

- def create_temp_file():
-     def _cleanup_file(file_path):
-         try:
-             os.remove(file_path)
-         except OSError:
-             pass
-     # Create a named temp file, don't delete right away
-     temp = tempfile.NamedTemporaryFile(delete=False)
-     temp_name = temp.name
-     # Close so others can open it again without conflicts (especially on Windows)
-     temp.close()
-
-     # Schedule its deletion at exit
-     atexit.register(_cleanup_file, temp_name)
-
-     return temp_name
-
- def determine_compression(file_path, compression="infer"):
+ def _determine_compression(file_path, compression="infer"):
      if compression == "infer":
-         compression = infer_compression(file_path)
+         compression = _infer_compression(file_path)
      if compression == "none":
          compression = None
      return compression

- def infer_mds_encoding(value):
+ def _infer_mds_encoding(value):
      """Determine the MDS encoding for a given value."""
      if isinstance(value, str):
          return 'str'
@@ -98,11 +81,11 @@ def infer_mds_encoding(value):
          return 'bool'
      return 'pkl'

- def infer_compression(file_path):
+ def _infer_compression(file_path):
      """Infer the compression type from the file extension."""
      extension = os.path.splitext(file_path)[1]
      if extension.endswith('.gz'):
-         if pigz_available():
+         if _pigz_available():
              return 'pigz'
          return 'gzip'
      if extension.endswith('.bz2'):
@@ -117,7 +100,7 @@ def infer_compression(file_path):

  def load_mds_directories(mds_directories, split='.', batch_size=2**16):
      dss = []
-     for mds_directory in tqdm(mds_directories, desc="Loading MDS directories", unit="directory"):
+     for mds_directory in mds_directories:
          ds = StreamingDataset(
              local=mds_directory,
              remote=None,
@@ -136,21 +119,9 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16):
          ds = concatenate_datasets(dsets=dss)
      return ds

- def load_parquet_files(parquet_files):
-     dss = []
-     for parquet_file in tqdm(parquet_files, desc="Loading parquet files", unit="file"):
-         ds = load_dataset("parquet", data_files=parquet_file, split="train")
-         dss.append(ds)
-     if len(dss) == 1:
-         ds = dss[0]
-     else:
-         with timing(message=f"Concatenating {len(dss)} datasets"):
-             ds = concatenate_datasets(dsets=dss)
-     return ds
-
- def open_jsonl(file_path, mode="rt", compression="infer", processes=64):
+ def _open_jsonl(file_path, mode="rt", compression="infer", processes=64):
      """Open a JSONL file, handling gzip compression if necessary."""
-     compression = determine_compression(file_path, compression)
+     compression = _determine_compression(file_path, compression)
      if compression == "gzip":
          return gzip.open(file_path, mode)
      if compression == "pigz":
@@ -163,11 +134,11 @@ def open_jsonl(file_path, mode="rt", compression="infer", processes=64):
          return open(file_path, mode)
      raise ValueError(f"Unsupported compression type: {compression}")

- def pigz_available():
+ def _pigz_available():
      """Check if pigz is available on the system."""
      return shutil.which("pigz") is not None

- def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
+ def _pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
      """Compress a file using pigz."""
      size = os.stat(input_file).st_size
      num_blocks = (size+buf_size-1) // buf_size
@@ -183,6 +154,54 @@ def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=Fa
          if not quiet:
              print(f"Removed {input_file}")

+ def save_jsonl(iterable, output_file, compression=None, processes=64):
+     compression = _determine_compression(output_file, compression)
+     with _open_jsonl(output_file, mode="wb", compression=compression, processes=processes) as f:
+         for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
+             f.write(f"{json.dumps(item)}\n".encode("utf-8"))
+
+ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pigz=False):
+     if compression == "none" or pigz:
+         compression = None
+     if compression == "gzip":
+         compression = "gz"
+     writer = None
+     for sample in tqdm(it, desc="Writing to MDS", unit="sample"):
+         if writer is None:
+             columns = {key: _infer_mds_encoding(value) for key, value in sample.items()}
+             writer = MDSWriter(out=output_dir, columns=columns, compression=compression)
+         writer.write(sample)
+     writer.finish()
+     if pigz:
+         index_path = os.path.join(output_dir, "index.json")
+         index = json.load(open(index_path, "rt"))
+         name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
+         file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
+         assert set(file_names) == set(name2info.keys())
+         for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
+             compressed_file_name = file_name + ".gz"
+             file_path = os.path.join(output_dir, file_name)
+             compressed_file_path = os.path.join(output_dir, compressed_file_name)
+             _pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
+             name2info[file_name]["compression"] = "gz"
+             name2info[file_name]["zip_data"] = {
+                 "basename": compressed_file_name,
+                 "bytes": os.stat(compressed_file_path).st_size,
+                 "hashes": {},
+             }
+         json.dump(index, open(index_path, "wt"))
+         print(f"Compressed {output_dir} with pigz")
+
+ def save_parquet(it, output_file, compression=None, batch_size=2**16):
+     writer = None
+     it = tqdm(it, desc="Writing to Parquet", unit="sample")
+     for batch in batch_iterable(it, batch_size):
+         table = pa.Table.from_pylist(batch)
+         if writer is None:
+             writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
+         writer.write_table(table)
+     writer.close()
+
  def use_pigz(compression):
      """Determine if pigz should be used based on the compression type."""
-     return compression == "pigz" or (compression == "gzip" and pigz_available())
+     return compression == "pigz" or (compression == "gzip" and _pigz_available())

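The new save_jsonl, save_mds, and save_parquet helpers accept any iterable of dicts, not only a Hugging Face dataset. A minimal sketch using save_jsonl and save_parquet with hypothetical toy data (the .gz extension makes "infer" pick gzip, or pigz when the binary is available):

    from mldataforge.utils import save_jsonl, save_parquet

    # Hypothetical toy samples; the CLI normally passes a datasets/StreamingDataset iterable here.
    samples = [{"id": i, "text": f"example {i}"} for i in range(3)]

    # JSONL output, compression inferred from the file extension.
    save_jsonl(samples, "samples.jsonl.gz", compression="infer", processes=4)

    # Parquet output, samples batched and written with snappy compression.
    save_parquet(samples, "samples.parquet", compression="snappy", batch_size=2)
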
{mldataforge-0.0.5 → mldataforge-0.1.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "mldataforge"
- version = "0.0.5"
+ version = "0.1.0"
  authors = [
      { name = "Peter Schneider-Kamp" }
  ]

mldataforge-0.0.5/mldataforge/commands/convert/jsonl/__init__.py
@@ -1,13 +0,0 @@
- import click
-
- from .mds import mds
- from .parquet import parquet
-
- __all__ = ["jsonl"]
-
- @click.group()
- def jsonl():
-     pass
-
- jsonl.add_command(mds)
- jsonl.add_command(parquet)

mldataforge-0.0.5/mldataforge/commands/convert/jsonl/mds.py
@@ -1,59 +0,0 @@
- import click
- import json
- import os
- from streaming import MDSWriter
- from tqdm import tqdm
-
- from ....utils import check_overwrite, infer_mds_encoding, open_jsonl, pigz_compress, use_pigz
-
- @click.command()
- @click.argument('output_dir', type=click.Path(exists=False))
- @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @click.option('--compression', type=click.Choice(['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'], case_sensitive=False), default=None, help='Compression type for the output dataset (default: None).')
- @click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
- @click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
- def mds(output_dir, jsonl_files, processes, compression, overwrite, yes, buf_size):
-     check_overwrite(output_dir, overwrite, yes)
-     if not jsonl_files:
-         raise click.BadArgumentUsage("No JSONL files provided.")
-     with open_jsonl(jsonl_files[0]) as f:
-         sample = json.loads(f.readline())
-     pigz = use_pigz(compression)
-     if compression == "none" or pigz:
-         compression = None
-     if compression == "gzip":
-         compression = "gz"
-     columns = {key: infer_mds_encoding(value) for key, value in sample.items()}
-     lines = 0
-     with MDSWriter(out=output_dir, columns=columns, compression=compression) as writer:
-         for jsonl_file in tqdm(jsonl_files, desc="Processing JSONL files", unit="file"):
-             with open_jsonl(jsonl_file, compression="infer") as f:
-                 for line_num, line in enumerate(f, start=1):
-                     try:
-                         item = json.loads(line)
-                         writer.write(item)
-                     except json.JSONDecodeError as e:
-                         print(f"Skipping line {line_num} in {jsonl_file} due to JSON error: {e}")
-                     lines += 1
-     print(f"Wrote {lines} lines from {len(jsonl_files)} files to MDS files in {output_dir}")
-     if pigz:
-         index_path = os.path.join(output_dir, "index.json")
-         index = json.load(open(index_path, "rt"))
-         name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
-         file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
-         assert set(file_names) == set(name2info.keys())
-         for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
-             compressed_file_name = file_name + ".gz"
-             file_path = os.path.join(output_dir, file_name)
-             compressed_file_path = os.path.join(output_dir, compressed_file_name)
-             pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
-             name2info[file_name]["compression"] = "gz"
-             name2info[file_name]["zip_data"] = {
-                 "basename": compressed_file_name,
-                 "bytes": os.stat(compressed_file_path).st_size,
-                 "hashes": {},
-             }
-         json.dump(index, open(index_path, "wt"))
-         print(f"Compressed {output_dir} with pigz")

mldataforge-0.0.5/mldataforge/commands/convert/jsonl/parquet.py
@@ -1,39 +0,0 @@
- import click
- import json
- import pyarrow as pa
- import pyarrow.parquet as pq
- from tqdm import tqdm
-
- from ....utils import batch_iterable, check_overwrite, open_jsonl
-
- def _iterate(jsonl_files):
-     lines = 0
-     for jsonl_file in tqdm(jsonl_files, desc="Processing JSONL files", unit="file"):
-         with open_jsonl(jsonl_file, compression="infer") as f:
-             for line_num, line in enumerate(f, start=1):
-                 try:
-                     item = json.loads(line)
-                     yield item
-                 except json.JSONDecodeError as e:
-                     print(f"Skipping line {line_num} in {jsonl_file} due to JSON error: {e}")
-                 lines += 1
-     print(f"Wrote {lines} lines from {len(jsonl_files)} files")
-
- @click.command()
- @click.argument('output_file', type=click.Path(exists=False))
- @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @click.option("--compression", default="snappy", type=click.Choice(["snappy", "gzip", "zstd"]), help="Compress the Parquet file (default: snappy).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
- @click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories and writing Parquet files (default: 65536).")
- def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
-     check_overwrite(output_file, overwrite, yes)
-     if not jsonl_files:
-         raise click.BadArgumentUsage("No JSONL files provided.")
-     writer = None
-     for batch in batch_iterable(_iterate(jsonl_files), batch_size):
-         table = pa.Table.from_pylist(batch)
-         if writer is None:
-             writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
-         writer.write_table(table)
-     writer.close()

mldataforge-0.0.5/mldataforge/commands/convert/mds/__init__.py
@@ -1,13 +0,0 @@
- import click
-
- from .jsonl import jsonl
- from .parquet import parquet
-
- __all__ = ["mds"]
-
- @click.group()
- def mds():
-     pass
-
- mds.add_command(jsonl)
- mds.add_command(parquet)

mldataforge-0.0.5/mldataforge/commands/convert/mds/jsonl.py
@@ -1,23 +0,0 @@
- import click
- import json
- from tqdm import tqdm
-
- from ....utils import check_overwrite, create_temp_file, determine_compression, load_mds_directories, open_jsonl
-
- @click.command()
- @click.argument("output_file", type=click.Path(exists=False), required=True)
- @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @click.option("--compression", default="infer", type=click.Choice(["none", "infer", "pigz", "gzip", "bz2", "xz"]), help="Compress the output JSONL file (default: infer; pigz for parallel gzip).")
- @click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing JSONL files.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
- @click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories (default: 65536).")
- def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size):
-     check_overwrite(output_file, overwrite, yes)
-     if not mds_directories:
-         raise click.BadArgumentUsage("No MDS files provided.")
-     ds = load_mds_directories(mds_directories, batch_size=batch_size)
-     compression = determine_compression(output_file, compression)
-     with open_jsonl(output_file, mode="wb", compression=compression, processes=processes) as f:
-         for item in tqdm(ds, desc="Writing to JSONL", unit="line"):
-             f.write(f"{json.dumps(item)}\n".encode("utf-8"))

mldataforge-0.0.5/mldataforge/commands/convert/mds/parquet.py
@@ -1,26 +0,0 @@
- import click
- import pyarrow as pa
- import pyarrow.parquet as pq
- from tqdm import tqdm
-
- from ....utils import batch_iterable, check_overwrite, load_mds_directories
-
- @click.command()
- @click.argument("output_file", type=click.Path(exists=False), required=True)
- @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @click.option("--compression", default="snappy", type=click.Choice(["snappy", "gzip", "zstd"]), help="Compress the Parquet file (default: snappy).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing Parquet files.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
- @click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories and writing Parquet files (default: 65536).")
- def parquet(output_file, mds_directories, compression, overwrite, yes, batch_size):
-     check_overwrite(output_file, overwrite, yes)
-     if not mds_directories:
-         raise click.BadArgumentUsage("No MDS files provided.")
-     ds = load_mds_directories(mds_directories, batch_size=batch_size)
-     writer = None
-     for batch in tqdm(batch_iterable(ds, batch_size), desc="Writing to Parquet", unit="batch", total=(len(ds)+batch_size-1) // batch_size):
-         table = pa.Table.from_pylist(batch)
-         if writer is None:
-             writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
-         writer.write_table(table)
-     writer.close()

mldataforge-0.0.5/mldataforge/commands/convert/parquet/__init__.py
@@ -1,13 +0,0 @@
- import click
-
- from .jsonl import jsonl
- from .mds import mds
-
- __all__ = ["parquet"]
-
- @click.group()
- def parquet():
-     pass
-
- parquet.add_command(jsonl)
- parquet.add_command(mds)

mldataforge-0.0.5/mldataforge/commands/convert/parquet/jsonl.py
@@ -1,25 +0,0 @@
- import click
- from mltiming import timing
-
- from ....utils import check_overwrite, create_temp_file, determine_compression, load_parquet_files, pigz_compress
-
- @click.command()
- @click.argument("output_file", type=click.Path(exists=False), required=True)
- @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
- @click.option("--compression", default="infer", type=click.Choice(["none", "infer", "pigz", "gzip", "bz2", "xz"]), help="Compress the output JSONL file (default: infer; pigz for parallel gzip).")
- @click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing JSONL files.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
- @click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
- def jsonl(output_file, parquet_files, compression, processes, overwrite, yes, buf_size):
-     check_overwrite(output_file, overwrite, yes)
-     if not parquet_files:
-         raise click.BadArgumentUsage("No parquet files provided.")
-     ds = load_parquet_files(parquet_files)
-     compression = determine_compression(output_file, compression)
-     compressed_file = None
-     if compression == "pigz":
-         compression, compressed_file, output_file = None, output_file, create_temp_file()
-     ds.to_json(output_file, num_proc=processes, orient="records", lines=True, compression=compression)
-     if compressed_file is not None:
-         pigz_compress(output_file, compressed_file, processes, buf_size, keep=False)

mldataforge-0.0.5/mldataforge/commands/convert/parquet/mds.py
@@ -1,43 +0,0 @@
- import click
- import json
- import os
- from streaming import MDSWriter
- from tqdm import tqdm
-
- from ....utils import check_overwrite, infer_mds_encoding, load_parquet_files, pigz_compress, use_pigz
-
- @click.command()
- @click.argument('output_dir', type=click.Path(exists=False))
- @click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
- @click.option('--compression', type=click.Choice(['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'], case_sensitive=False), default=None, help='Compression type for the output dataset (default: None).')
- @click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
- @click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
- @click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
- @click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
- def mds(output_dir, parquet_files, processes, compression, overwrite, yes, buf_size):
-     check_overwrite(output_dir, overwrite, yes)
-     if not parquet_files:
-         raise click.BadArgumentUsage("No parquet files provided.")
-     ds = load_parquet_files(parquet_files)
-     pigz = use_pigz(compression)
-     sample = ds[0]
-     if compression == "none" or pigz:
-         compression = None
-     if compression == "gzip":
-         compression = "gz"
-     columns = {key: infer_mds_encoding(value) for key, value in sample.items()}
-     lines = 0
-     with MDSWriter(out=output_dir, columns=columns, compression=compression) as writer:
-         for item in tqdm(ds, desc="Processing samples", unit="sample"):
-             writer.write(item)
-             lines += 1
-     print(f"Wrote {lines} lines from {len(parquet_files)} files to MDS files in {output_dir}")
-     if pigz:
-         file_paths = []
-         for file in os.listdir(output_dir):
-             if file.endswith(".mds"):
-                 file_paths.append(os.path.join(output_dir, file))
-         for file_path in tqdm(file_paths, desc="Compressing with pigz", unit="file"):
-             pigz_compress(file_path, file_path + ".gz", processes, buf_size=buf_size, keep=False, quiet=True)
-         output_dir
-         print(f"Compressed {output_dir} with pigz")