mldataforge 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.0.2
+Version: 0.0.4
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -1,6 +1,7 @@
 import click
 
 from .jsonl import jsonl
+from .mds import mds
 from .parquet import parquet
 
 __all__ = ["convert"]
@@ -9,5 +10,6 @@ __all__ = ["convert"]
 def convert():
     pass
 
-convert.add_command(parquet)
 convert.add_command(jsonl)
+convert.add_command(mds)
+convert.add_command(parquet)
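The hunk above wires a new `mds` group into the top-level `convert` group. A minimal sketch of exercising the extended tree with click's test runner; the import path `mldataforge.commands.convert` is an assumption inferred from the relative imports in this diff, not something the diff shows:

```python
# Sketch only: the import path below is assumed, not confirmed by the diff.
from click.testing import CliRunner
from mldataforge.commands.convert import convert

runner = CliRunner()
result = runner.invoke(convert, ["--help"])
print(result.output)  # should now list the jsonl, mds, and parquet subgroups
```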
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import click
 import json
 import os
@@ -40,11 +39,21 @@ def mds(output_dir, jsonl_files, processes, compression, overwrite, yes, buf_siz
             lines += 1
     print(f"Wrote {lines} lines from {len(jsonl_files)} files to MDS files in {output_dir}")
     if pigz:
-        file_paths = []
-        for file in os.listdir(output_dir):
-            if file.endswith(".mds"):
-                file_paths.append(os.path.join(output_dir, file))
-        for file_path in tqdm(file_paths, desc="Compressing with pigz", unit="file"):
-            pigz_compress(file_path, file_path + ".gz", processes, buf_size=buf_size, keep=False, quiet=True)
-        output_dir
+        index_path = os.path.join(output_dir, "index.json")
+        index = json.load(open(index_path, "rt"))
+        name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
+        file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
+        assert set(file_names) == set(name2info.keys())
+        for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
+            compressed_file_name = file_name + ".gz"
+            file_path = os.path.join(output_dir, file_name)
+            compressed_file_path = os.path.join(output_dir, compressed_file_name)
+            pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
+            name2info[file_name]["compression"] = "gz"
+            name2info[file_name]["zip_data"] = {
+                "basename": compressed_file_name,
+                "bytes": os.stat(compressed_file_path).st_size,
+                "hashes": {},
+            }
+        json.dump(index, open(index_path, "wt"))
     print(f"Compressed {output_dir} with pigz")
@@ -0,0 +1,11 @@
+import click
+
+from .jsonl import jsonl
+
+__all__ = ["mds"]
+
+@click.group()
+def mds():
+    pass
+
+mds.add_command(jsonl)
@@ -0,0 +1,23 @@
+import click
+import json
+from tqdm import tqdm
+
+from ....utils import check_overwrite, create_temp_file, determine_compression, load_mds_directories, open_jsonl
+
+@click.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@click.option("--compression", default="infer", type=click.Choice(["none", "infer", "pigz", "gzip", "bz2", "xz"]), help="Compress the output JSONL file (default: infer; pigz for parallel gzip).")
+@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
+@click.option("--overwrite", is_flag=True, help="Overwrite existing JSONL files.")
+@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove files without confirmation.")
+@click.option("--batch-size", default=2**16, help="Batch size for loading MDS directories (default: 65536).")
+def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size):
+    check_overwrite(output_file, overwrite, yes)
+    if not mds_directories:
+        raise click.BadArgumentUsage("No MDS files provided.")
+    ds = load_mds_directories(mds_directories, batch_size=batch_size)
+    compression = determine_compression(output_file, compression)
+    with open_jsonl(output_file, mode="wb", compression=compression, processes=processes) as f:
+        for item in tqdm(ds, desc="Writing to JSONL", unit="line"):
+            f.write(f"{json.dumps(item)}\n".encode("utf-8"))
@@ -1,6 +1,7 @@
 import click
 
 from .jsonl import jsonl
+from .mds import mds
 
 __all__ = ["parquet"]
 
@@ -9,3 +10,4 @@ def parquet():
     pass
 
 parquet.add_command(jsonl)
+parquet.add_command(mds)
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import click
 from mltiming import timing
 
@@ -0,0 +1,43 @@
+import click
+import json
+import os
+from streaming import MDSWriter
+from tqdm import tqdm
+
+from ....utils import check_overwrite, infer_mds_encoding, load_parquet_files, pigz_compress, use_pigz
+
+@click.command()
+@click.argument('output_dir', type=click.Path(exists=False))
+@click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
+@click.option('--compression', type=click.Choice(['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'], case_sensitive=False), default=None, help='Compression type for the output dataset (default: None).')
+@click.option("--processes", default=64, help="Number of processes to use for pigz compression (default: 64).")
+@click.option("--overwrite", is_flag=True, help="Overwrite existing MDS directory.")
+@click.option("--yes", is_flag=True, help="Assume yes to all prompts. Use with caution as it will remove entire directory trees without confirmation.")
+@click.option("--buf-size", default=2**24, help=f"Buffer size for pigz compression (default: {2**24}).")
+def mds(output_dir, parquet_files, processes, compression, overwrite, yes, buf_size):
+    check_overwrite(output_dir, overwrite, yes)
+    if not parquet_files:
+        raise click.BadArgumentUsage("No parquet files provided.")
+    ds = load_parquet_files(parquet_files)
+    pigz = use_pigz(compression)
+    sample = ds[0]
+    if compression == "none" or pigz:
+        compression = None
+    if compression == "gzip":
+        compression = "gz"
+    columns = {key: infer_mds_encoding(value) for key, value in sample.items()}
+    lines = 0
+    with MDSWriter(out=output_dir, columns=columns, compression=compression) as writer:
+        for item in tqdm(ds, desc="Processing samples", unit="sample"):
+            writer.write(item)
+            lines += 1
+    print(f"Wrote {lines} lines from {len(parquet_files)} files to MDS files in {output_dir}")
+    if pigz:
+        file_paths = []
+        for file in os.listdir(output_dir):
+            if file.endswith(".mds"):
+                file_paths.append(os.path.join(output_dir, file))
+        for file_path in tqdm(file_paths, desc="Compressing with pigz", unit="file"):
+            pigz_compress(file_path, file_path + ".gz", processes, buf_size=buf_size, keep=False, quiet=True)
+        output_dir
+    print(f"Compressed {output_dir} with pigz")
@@ -0,0 +1,60 @@
+import subprocess
+
+__all__ = ["pigz_open"]
+
+def pigz_open(path, mode="rt", processes=64, encoding=None):
+    return PigzFile(path, mode=mode, processes=processes, encoding=encoding)
+
+class PigzFile(object):
+    """A wrapper for pigz to handle gzip compression and decompression."""
+    def __init__(self, path, mode="rt", processes=4, encoding="utf-8"):
+        assert mode in ("rt", "wt", "rb", "wb")
+        self.path = path
+        self.is_read = mode[0] == "r"
+        self.is_text = mode[1] == "t"
+        self.processes = processes
+        self.encoding = encoding if self.is_text else None
+        self._process = None
+        self._fw = None
+        args = ["pigz", "-p", str(self.processes), "-c"]
+        if self.is_read:
+            args.extend(("-d", self.path))
+            self._process = subprocess.Popen(args, stdout=subprocess.PIPE, encoding=self.encoding, text=self.is_text)
+        else:
+            self._fw = open(self.path, "w+")
+            self._process = subprocess.Popen(args, stdout=self._fw, stdin=subprocess.PIPE, encoding=self.encoding, text=self.is_text)
+
+    def __iter__(self):
+        assert self.is_read
+        for line in self._process.stdout:
+            assert isinstance(line, str) if self.is_text else isinstance(line, bytes)
+            yield line
+        self._process.wait()
+        assert self._process.returncode == 0
+        self._process.stdout.close()
+        self._process = None
+
+    def write(self, line):
+        assert not self.is_read
+        assert self._fw is not None
+        assert isinstance(line, str) if self.is_text else isinstance(line, bytes)
+        self._process.stdin.write(line)
+
+    def close(self):
+        if self._process:
+            if self.is_read:
+                self._process.kill()
+                self._process.stdout.close()
+                self._process = None
+            else:
+                self._process.stdin.close()
+                self._process.wait()
+                self._process = None
+                self._fw.close()
+                self._fw = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
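A minimal round-trip sketch for the new wrapper, assuming the pigz binary is on PATH and that the module is importable as `mldataforge.pigz` (a path inferred from the `from .pigz import pigz_open` import in utils.py below, not stated in the diff):

```python
# Sketch only: requires the external pigz binary; import path is inferred.
from mldataforge.pigz import pigz_open

# Write text lines through a background pigz process.
with pigz_open("lines.txt.gz", mode="wt", processes=4) as f:
    for i in range(3):
        f.write(f"line {i}\n")

# Read them back through pigz -d.
with pigz_open("lines.txt.gz", mode="rt", processes=4) as f:
    for line in f:
        print(line, end="")
```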
@@ -7,10 +7,12 @@ import lzma
 from mltiming import timing
 import os
 import shutil
-import subprocess
+from streaming import StreamingDataset
 import tempfile
 from tqdm import tqdm
 
+from .pigz import pigz_open
+
 __all__ = [
     "check_overwrite",
     "create_temp_file",
@@ -18,72 +20,13 @@ __all__ = [
     "infer_mds_encoding",
     "infer_compression",
     "load_parquet_files",
+    "load_mds_directories",
     "open_jsonl",
     "pigz_available",
     "pigz_compress",
     "use_pigz",
 ]
 
-class PigzFile(object):
-    """A wrapper for pigz to handle gzip compression and decompression."""
-    def __init__(self, path, mode="rt", processes=4, encoding=None):
-        if mode not in ("rt", "wt", "rb", "wb"):
-            raise ValueError("Mode must be one of rt, wt, rb, or wb.")
-        self.path = path
-        self.mode = mode
-        self.processes = processes
-        self.encoding = "latin1" if mode[1] == "b" else ("utf-8" if encoding is None else encoding)
-        self._process = None
-        self._fw = None
-        if self.mode[0] == "r":
-            args = ["pigz", "-d", "-c", "-p", str(self.processes), self.path]
-            self._process = subprocess.Popen(
-                args,
-                stdout=subprocess.PIPE,
-                encoding=encoding,
-            )
-        elif self.mode[0] == "w":
-            args = ["pigz", "-p", str(self.processes), "-c"]
-            self._fw = open(self.path, "w+")
-            self._process = subprocess.Popen(
-                args,
-                stdout=self._fw,
-                stdin=subprocess.PIPE,
-                encoding=encoding,
-            )
-
-    def __iter__(self):
-        assert self.mode[0] == "r"
-        for line in self._process.stdout:
-            yield line
-        self._process.wait()
-        assert self._process.returncode == 0
-        self._process.stdout.close()
-        self._process = None
-
-    def write(self, line):
-        assert self.mode[0] == "w"
-        self._process.stdin.write(line if self.mode[1] == "t" else line.encode(self.encoding))
-
-    def close(self):
-        if self._process:
-            if self.mode[0] == "r":
-                self._process.kill()
-                self._process.stdout.close()
-                self._process = None
-            elif self.mode[1] == "w":
-                self._process.stdin.close()
-                self._process.wait()
-                self._process = None
-                self._fw.close()
-                self._fw = None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
 def check_overwrite(output_path, overwrite, yes):
     if os.path.exists(output_path):
         if os.path.isfile(output_path):
@@ -162,6 +105,27 @@ def infer_compression(file_path):
         return 'zstd'
     return None
 
+def load_mds_directories(mds_directories, split='.', batch_size=2**16):
+    dss = []
+    for mds_directory in tqdm(mds_directories, desc="Loading MDS directories", unit="directory"):
+        ds = StreamingDataset(
+            local=mds_directory,
+            remote=None,
+            split=split,
+            shuffle=False,
+            allow_unsafe_types=True,
+            batch_size=batch_size,
+            download_retry=1,
+            validate_hash=False,
+        )
+        dss.append(ds)
+    if len(dss) == 1:
+        ds = dss[0]
+    else:
+        with timing(message=f"Concatenating {len(dss)} datasets"):
+            ds = concatenate_datasets(dsets=dss)
+    return ds
+
 def load_parquet_files(parquet_files):
     dss = []
     for parquet_file in tqdm(parquet_files, desc="Loading parquet files", unit="file"):
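The new `load_mds_directories` builds one `StreamingDataset` per input directory and concatenates them. A hedged round-trip sketch of the MDS format it reads, using the mosaicml-streaming APIs already imported in this diff; the directory name and column spec are illustrative:

```python
# Sketch only: writes a tiny uncompressed MDS dataset and reads it back.
from streaming import MDSWriter, StreamingDataset

with MDSWriter(out="mds-out", columns={"text": "str"}, compression=None) as writer:
    writer.write({"text": "hello"})
    writer.write({"text": "world"})

ds = StreamingDataset(local="mds-out", remote=None, shuffle=False, batch_size=1)
for sample in ds:
    print(sample["text"])
```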
@@ -174,11 +138,13 @@ def load_parquet_files(parquet_files):
             ds = concatenate_datasets(dsets=dss)
     return ds
 
-def open_jsonl(file_path, mode="rt", compression="infer"):
+def open_jsonl(file_path, mode="rt", compression="infer", processes=64):
     """Open a JSONL file, handling gzip compression if necessary."""
     compression = determine_compression(file_path, compression)
-    if compression in ("gzip", "pigz"):
+    if compression == "gzip":
         return gzip.open(file_path, mode)
+    if compression == "pigz":
+        return pigz_open(file_path, mode, processes=processes) if mode[0] == "w" else gzip.open(file_path, mode)
     if compression == "bz2":
         return bz2.open(file_path, mode)
     if compression == "xz":
@@ -195,7 +161,7 @@ def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=Fa
     """Compress a file using pigz."""
     size = os.stat(input_file).st_size
     num_blocks = (size+buf_size-1) // buf_size
-    with open(input_file, "rt", encoding="latin1") as f_in, PigzFile(output_file, "wb", processes=processes) as f_out:
+    with open(input_file, "rb") as f_in, pigz_open(output_file, "wb", processes=processes) as f_out:
         for _ in tqdm(range(num_blocks), desc="Compressing with pigz", unit="block", disable=quiet):
             buf = f_in.read(buf_size)
             assert buf
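A hedged usage sketch of the reworked `pigz_compress`; the module path `mldataforge.utils` is inferred from the relative imports above, and `keep=True` retaining the uncompressed input is implied by the `keep=False` call sites in this diff rather than shown in this hunk:

```python
# Sketch only: import path and keep semantics are inferences, not diff facts.
from mldataforge.utils import pigz_compress

# Compress a shard with 8 pigz workers, keeping the uncompressed input.
pigz_compress("shard.00000.mds", "shard.00000.mds.gz", processes=8, keep=True, quiet=False)
```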
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.0.2"
+version = "0.0.4"
 authors = [
   { name = "Peter Schneider-Kamp" }
 ]