mldataforge 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mldataforge
- Version: 0.1.3
+ Version: 0.1.5
  Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
  Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
  Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -10,10 +10,15 @@ Classifier: License :: OSI Approved :: MIT License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.12
+ Requires-Dist: brotlicffi
  Requires-Dist: click
  Requires-Dist: datasets
+ Requires-Dist: isal
+ Requires-Dist: lz4
  Requires-Dist: mltiming
  Requires-Dist: mosaicml-streaming
+ Requires-Dist: python-snappy
+ Requires-Dist: zstandard
  Provides-Extra: all
  Requires-Dist: build; extra == 'all'
  Requires-Dist: pytest; extra == 'all'
@@ -0,0 +1,82 @@
+ import brotlicffi as brotli
+ import io
+
+ __all__ = ["brotli_open"]
+
+ def brotli_open(filename, mode='rb', encoding='utf-8', compress_level=11):
+     return BrotliFile(filename, mode=mode, encoding=encoding, compress_level=compress_level)
+
+ import brotlicffi as brotli
+ import io
+
+ __all__ = ["brotli_open"]
+
+ class BrotliFile:
+     def __init__(self, filename, mode='rb', encoding='utf-8', compress_level=11):
+         self.filename = filename
+         self.mode = mode
+         self.encoding = encoding
+         self.compress_level = compress_level
+
+         self.binary = 'b' in mode
+         file_mode = mode.replace('t', 'b')
+         self.file = open(filename, file_mode)
+
+         if 'r' in mode:
+             self._decompressor = brotli.Decompressor()
+             self._stream = self._wrap_reader()
+         elif 'w' in mode:
+             self._compressor = brotli.Compressor(quality=compress_level)
+             self._stream = self._wrap_writer()
+         else:
+             raise ValueError("Unsupported mode (use 'rb', 'wb', 'rt', or 'wt')")
+
+     def _wrap_reader(self):
+         buffer = io.BytesIO()
+         while True:
+             chunk = self.file.read(8192)
+             if not chunk:
+                 break
+             buffer.write(self._decompressor.process(chunk))
+         buffer.seek(0)
+         return buffer if self.binary else io.TextIOWrapper(buffer, encoding=self.encoding)
+
+     def _wrap_writer(self):
+         return self if self.binary else io.TextIOWrapper(self, encoding=self.encoding)
+
+     def write(self, data):
+         if isinstance(data, str):
+             data = data.encode(self.encoding)
+         compressed = self._compressor.process(data)
+         self.file.write(compressed)
+         return len(data)
+
+     def flush(self):
+         if hasattr(self, '_compressor'):
+             self.file.write(self._compressor.finish())
+         self.file.flush()
+
+     def read(self, *args, **kwargs):
+         return self._stream.read(*args, **kwargs)
+
+     def readline(self, *args, **kwargs):
+         return self._stream.readline(*args, **kwargs)
+
+     def __iter__(self):
+         return iter(self._stream)
+
+     def close(self):
+         try:
+             if hasattr(self._stream, 'flush'):
+                 self._stream.flush()
+         finally:
+             self.file.close()
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def tell(self):
+         return self._stream.tell()
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["jsonl"]
+ __all__ = ["jsonl_to_mds", "jsonl_to_parquet"]

  @click.group()
  def jsonl():
@@ -13,35 +14,40 @@ def jsonl():
  @jsonl.command()
  @click.argument('output_dir', type=click.Path(exists=False))
  @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @processes_option()
  @buf_size_option()
  @shard_size_option()
- def mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size, shard_size):
+ @no_pigz_option()
+ def mds(**kwargs):
+     jsonl_to_mds(**kwargs)
+ def jsonl_to_mds(output_dir, jsonl_files, compression, processes, overwrite, yes, buf_size, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, jsonl_files)
      save_mds(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_dir,
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz),
          shard_size=shard_size,
      )

  @jsonl.command()
  @click.argument('output_file', type=click.Path(exists=False))
  @click.argument('jsonl_files', nargs=-1, type=click.Path(exists=True))
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
- def parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
+ def parquet(**kwargs):
+     jsonl_to_parquet(**kwargs)
+ def jsonl_to_parquet(output_file, jsonl_files, compression, overwrite, yes, batch_size):
      check_arguments(output_file, overwrite, yes, jsonl_files)
      save_parquet(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_file,
          compression=compression,
          batch_size=batch_size,
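The command bodies are now exposed as plain functions (jsonl_to_mds, jsonl_to_parquet), so they can be called programmatically as well as through the CLI. A small sketch, assuming the module path mldataforge.commands.convert.jsonl (inferred from the three-level relative imports) and illustrative file names:

    # Hypothetical programmatic call to the refactored converter.
    from mldataforge.commands.convert.jsonl import jsonl_to_parquet

    jsonl_to_parquet(
        output_file="corpus.parquet",
        jsonl_files=["part-0000.jsonl.gz", "part-0001.jsonl.zst"],
        compression="snappy",   # default from PARQUET_COMPRESSIONS
        overwrite=True,
        yes=True,
        batch_size=2**16,
    )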
@@ -1,9 +1,10 @@
  import click

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["mds"]
+ __all__ = ["mds_to_jsonl", "mds_to_parquet"]

  @click.group()
  def mds():
@@ -12,13 +13,15 @@ def mds():
  @mds.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @no_bulk_option()
- def jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk):
+ def jsonl(**kwargs):
+     mds_to_jsonl(**kwargs)
+ def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk):
      check_arguments(output_file, overwrite, yes, mds_directories)
      save_jsonl(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -30,12 +33,14 @@ def jsonl(output_file, mds_directories, compression, processes, overwrite, yes,
  @mds.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @no_bulk_option()
- def parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk):
+ def parquet(**kwargs):
+     mds_to_parquet(**kwargs)
+ def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk):
      check_arguments(output_file, overwrite, yes, mds_directories)
      save_parquet(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ...compression import *
  from ...options import *
  from ...utils import *

- __all__ = ["parquet"]
+ __all__ = ["parquet_to_jsonl", "parquet_to_mds"]

  @click.group()
  def parquet():
@@ -13,11 +14,13 @@ def parquet():
  @parquet.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
- def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
+ def jsonl(**kwargs):
+     parquet_to_jsonl(**kwargs)
+ def parquet_to_jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
      check_arguments(output_file, overwrite, yes, parquet_files)
      save_jsonl(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -29,13 +32,16 @@ def jsonl(output_file, parquet_files, compression, processes, overwrite, yes):
  @parquet.command()
  @click.argument('output_dir', type=click.Path(exists=False))
  @click.argument('parquet_files', nargs=-1, type=click.Path(exists=True))
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @buf_size_option()
  @shard_size_option()
- def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size, shard_size):
+ @no_pigz_option()
+ def mds(**kwargs):
+     parquet_to_mds(**kwargs)
+ def parquet_to_mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_size, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, parquet_files)
      save_mds(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -43,6 +49,6 @@ def mds(output_dir, parquet_files, compression, processes, overwrite, yes, buf_s
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz=no_pigz),
          shard_size=shard_size,
      )
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ..compression import *
  from ..options import *
  from ..utils import *

- __all__ = ["join"]
+ __all__ = ["join_jsonl", "join_mds", "join_parquet"]

  @click.group()
  def join():
@@ -13,14 +14,16 @@ def join():
  @join.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
- def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
+ def jsonl(**kwargs):
+     join_jsonl(**kwargs)
+ def join_jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
      check_arguments(output_file, overwrite, yes, jsonl_files)
      save_jsonl(
-         load_dataset("json", data_files=jsonl_files, split="train"),
+         load_jsonl_files(jsonl_files),
          output_file,
          compression=compression,
          processes=processes,
@@ -29,14 +32,19 @@ def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
  @join.command()
  @click.argument("output_dir", type=click.Path(exists=False), required=True)
  @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  @batch_size_option()
  @buf_size_option()
  @no_bulk_option()
- def mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk):
+ @shard_size_option()
+ @no_pigz_option()
+ def mds(**kwargs):
+     print(kwargs)
+     join_mds(**kwargs)
+ def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz):
      check_arguments(output_dir, overwrite, yes, mds_directories)
      save_mds(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
@@ -44,17 +52,20 @@ def mds(output_dir, mds_directories, compression, processes, overwrite, yes, bat
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         shard_size=shard_size,
+         pigz=use_pigz(compression, no_pigz)
      )

  @join.command()
  @click.argument("output_file", type=click.Path(exists=False), required=True)
  @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
- @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @compression_option(PARQUET_COMPRESSIONS)
  @overwrite_option()
  @yes_option()
  @batch_size_option()
- def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+ def parquet(**kwargs):
+     join_parquet(**kwargs)
+ def join_parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
      check_arguments(output_file, overwrite, yes, parquet_files)
      save_parquet(
          load_dataset("parquet", data_files=parquet_files, split="train"),
@@ -1,10 +1,11 @@
  import click
  from datasets import load_dataset

+ from ..compression import *
  from ..options import *
  from ..utils import *

- __all__ = ["split"]
+ __all__ = ["split_jsonl", "split_mds", "split_parquet"]

  @click.group()
  def split():
@@ -15,14 +16,14 @@ def split():
  @prefix_option()
  @output_dir_option()
  @size_hint_option()
- @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @compression_option(JSONL_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
  def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, overwrite, yes):
      save_jsonl(
-         load_dataset("json", data_files=jsonl_files, split="train"),
-         output_file=f"{output_dir}/{prefix}{{part:04d}}.jsonl{extension(compression, jsonl_files[0])}",
+         load_jsonl_files(jsonl_files),
+         output_file=f"{output_dir}/{prefix}{{part:04d}}.jsonl{extension_compression(compression, jsonl_files[0])}",
          compression=compression,
          processes=processes,
          size_hint=size_hint,
@@ -35,7 +36,7 @@ def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, ov
  @prefix_option()
  @output_dir_option()
  @size_hint_option()
- @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @compression_option(MDS_COMPRESSIONS)
  @processes_option()
  @overwrite_option()
  @yes_option()
@@ -43,16 +44,37 @@ def jsonl(jsonl_files, prefix, output_dir, size_hint, compression, processes, ov
  @batch_size_option()
  @no_bulk_option()
  @shard_size_option()
- def mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size):
+ @no_pigz_option()
+ def mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz):
      save_mds(
          load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
          output_dir=f"{output_dir}/{prefix}{{part:04d}}",
          processes=processes,
          compression=compression,
          buf_size=buf_size,
-         pigz=use_pigz(compression),
+         pigz=use_pigz(compression, no_pigz),
          shard_size=shard_size,
          size_hint=size_hint,
          overwrite=overwrite,
          yes=yes,
      )
+
+ @split.command()
+ @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+ @prefix_option()
+ @output_dir_option()
+ @size_hint_option()
+ @compression_option(PARQUET_COMPRESSIONS)
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ def parquet(parquet_files, prefix, output_dir, size_hint, compression, overwrite, yes, batch_size):
+     save_parquet(
+         load_dataset("parquet", data_files=parquet_files, split="train"),
+         output_file=f"{output_dir}/{prefix}{{part:04d}}.parquet",
+         compression=compression,
+         batch_size=batch_size,
+         size_hint=size_hint,
+         overwrite=overwrite,
+         yes=yes,
+     )
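The new `split parquet` command can be exercised without a shell via click's test runner. A sketch, assuming the split group is importable as mldataforge.commands.split and that corpus.parquet exists; both are assumptions for illustration:

    # Hypothetical invocation of the new `split parquet` command.
    from click.testing import CliRunner
    from mldataforge.commands.split import split

    runner = CliRunner()
    # Uses the default prefix/output-dir/size-hint options and zstd column compression.
    result = runner.invoke(split, ["parquet", "corpus.parquet", "--compression", "zstd"])
    print(result.exit_code, result.output)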
@@ -0,0 +1,158 @@
+ import bz2
+ from isal import igzip as gzip
+ import lz4.frame
+ import lzma
+ import os
+ import shutil
+ from tqdm import tqdm
+ import zstandard
+
+ from .brotli import brotli_open
+ from .pigz import pigz_open
+ from .snappy import snappy_open
+
+ __all__ = [
+     "JSONL_COMPRESSIONS",
+     "MDS_COMPRESSIONS",
+     "PARQUET_COMPRESSIONS",
+     "determine_compression",
+     "extension_compression",
+     "infer_compression",
+     "open_compression",
+     "pigz_available",
+     "pigz_compress",
+     "use_pigz",
+ ]
+
+ JSONL_COMPRESSIONS = dict(
+     default="infer",
+     choices=["infer", "none", "bz2", "gzip", "lz4", "lzma", "pigz", "snappy", "xz", "zstd"],
+ )
+ MDS_COMPRESSIONS = dict(
+     default=None,
+     choices=["none", "brotli", "bz2", "gzip", "pigz", "snappy", "zstd"],
+ )
+ PARQUET_COMPRESSIONS = dict(
+     default="snappy",
+     choices=["snappy", "brotli", "gzip", "lz4", "zstd"],
+ )
+
+ def determine_compression(fmt, file_path, compression="infer", no_pigz=False):
+     if compression == "none":
+         return None
+     if fmt == "jsonl":
+         if compression == "infer":
+             compression = infer_compression(file_path)
+         if compression == "brotli":
+             return "br"
+         return compression
+     if fmt == "mds":
+         if compression == "infer":
+             raise ValueError()
+         if compression == "pigz" or (not no_pigz and compression == "gzip" and pigz_available()):
+             return None
+         if compression == "gzip":
+             return "gz"
+         if compression == "brotli":
+             return "br"
+         return compression
+     if fmt == "parquet":
+         return compression
+     raise ValueError(f"Unsupported format: {fmt}")
+
+ def extension_compression(compression, file_path):
+     """Get the file extension for the given compression type."""
+     if compression == "infer":
+         compression = infer_compression(file_path)
+     if compression == "brotli":
+         return ".br"
+     if compression == "bz2":
+         return ".bz2"
+     if compression in ("gzip", "pigz"):
+         return ".gz"
+     if compression == "lz4":
+         return ".lz4"
+     if compression == "lzma":
+         return ".lzma"
+     if compression == "snappy":
+         return ".snappy"
+     if compression == "xz":
+         return ".xz"
+     if compression == "zstd":
+         return ".zst"
+     if compression is None or compression == "none":
+         return ""
+     raise ValueError(f"Unsupported compression type: {compression}")
+
+ def infer_compression(file_path, pigz=True):
+     """Infer the compression type from the file extension."""
+     extension = os.path.splitext(file_path)[1]
+     if extension.endswith('.br'):
+         return 'brotli'
+     if extension.endswith('.bz2'):
+         return 'bz2'
+     if extension.endswith('.gz'):
+         if pigz and pigz_available():
+             return 'pigz'
+         return 'gzip'
+     if extension.endswith('.lz4'):
+         return 'lz4'
+     if extension.endswith('.lzma'):
+         return 'lzma'
+     if extension.endswith('.snappy'):
+         return 'snappy'
+     if extension.endswith('.xz'):
+         return 'xz'
+     if extension.endswith('.zip'):
+         return 'zip'
+     if extension.endswith('.zst'):
+         return 'zstd'
+     return None
+
+ def open_compression(file_path, mode="rt", compression="infer", processes=64):
+     """Open a file, handling compression if necessary."""
+     if compression == "infer":
+         compression = infer_compression(file_path)
+     if compression in ("brotli", "br"):
+         return brotli_open(file_path, mode)
+     if compression in ("gzip", "gz"):
+         return gzip.open(file_path, mode)
+     if compression == "pigz":
+         return pigz_open(file_path, mode, processes=processes) if mode[0] == "w" else gzip.open(file_path, mode)
+     if compression == "bz2":
+         return bz2.open(file_path, mode)
+     if compression == "lz4":
+         return lz4.frame.open(file_path, mode)
+     if compression in ("lzma", "xz"):
+         return lzma.open(file_path, mode)
+     if compression == "snappy":
+         return snappy_open(file_path, mode)
+     if compression == "zstd":
+         return zstandard.open(file_path, mode)
+     if compression is None or compression == "none":
+         return open(file_path, mode)
+     raise ValueError(f"Unsupported compression type: {compression}")
+
+ def pigz_available():
+     """Check if pigz is available on the system."""
+     return shutil.which("pigz") is not None
+
+ def pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
+     """Compress a file using pigz."""
+     size = os.stat(input_file).st_size
+     num_blocks = (size+buf_size-1) // buf_size
+     with open(input_file, "rb") as f_in, pigz_open(output_file, "wb", processes=processes) as f_out:
+         for _ in tqdm(range(num_blocks), desc="Compressing with pigz", unit="block", disable=quiet):
+             buf = f_in.read(buf_size)
+             assert buf
+             f_out.write(buf)
+         buf = f_in.read()
+         assert not buf
+     if not keep:
+         os.remove(input_file)
+         if not quiet:
+             print(f"Removed {input_file}")
+
+ def use_pigz(compression, no_pigz=False):
+     """Determine if pigz should be used based on the compression type."""
+     return compression == "pigz" or (not no_pigz and compression == "gzip" and pigz_available())
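A brief sketch of how the new helpers compose, assuming the module is importable as mldataforge.compression (path inferred from the relative imports in this diff) and an illustrative file name:

    # Hypothetical use of the new compression helpers.
    from mldataforge.compression import (
        determine_compression, extension_compression, infer_compression,
        open_compression, use_pigz,
    )

    path = "data/part-0000.jsonl.zst"

    print(infer_compression(path))               # 'zstd' (from the .zst extension)
    print(extension_compression("infer", path))  # '.zst'
    print(determine_compression("jsonl", path))  # 'zstd'
    print(use_pigz("gzip"))                      # True only if the pigz binary is on PATH

    # Transparently read the compressed JSONL file line by line.
    with open_compression(path, mode="rt") as f:
        for line in f:
            print(line, end="")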
@@ -3,9 +3,13 @@ import json
  from mltiming import timing
  import numpy as np
  import os
+ import snappy
  from streaming.base.format.mds.encodings import mds_decode
  from typing import Any, Optional, Generator

+ from .options import MDS_COMPRESSIONS
+ from .utils import open_compression
+
  class MDSBulkReader:
      def __init__(
          self,
@@ -42,13 +46,7 @@ class MDSShardReader:
          filename: str,
          compression: Optional[str],
      ) -> None:
-         if compression is None:
-             _open = open
-         elif compression == 'gz':
-             _open = gzip.open
-         else:
-             raise ValueError(f'Unsupported compression type: {compression}. Supported types: None, gzip.')
-         self.fp = _open(filename, "rb")
+         self.fp = open_compression(filename, "rb", compression=compression)
          self.samples = np.frombuffer(self.fp.read(4), np.uint32)[0]
          self.index = np.frombuffer(self.fp.read((1+self.samples)*4), np.uint32)
          info = json.loads(self.fp.read(self.index[0]-self.fp.tell()))
@@ -1,11 +1,19 @@
  import click

- __alll__ = [
+ from .compression import JSONL_COMPRESSIONS, MDS_COMPRESSIONS, PARQUET_COMPRESSIONS
+
+ __all__ = [
      "batch_size_option",
      "buf_size_option",
      "compression_option",
+     "no_bulk_option",
+     "no_pigz_option",
+     "output_dir_option",
      "overwrite_option",
      "processes_option",
+     "prefix_option",
+     "shard_size_option",
+     "size_hint_option",
      "yes_option",
  ]

@@ -39,15 +47,25 @@ def no_bulk_option():
          help="Use a custom space and time-efficient bulk reader (only gzip and no compression).",
      )

- def compression_option(default, choices):
+ def no_pigz_option():
+     """
+     Option for specifying whether to use pigz compression.
+     """
+     return click.option(
+         "--no-pigz",
+         is_flag=True,
+         help="Do not use pigz compression.",
+     )
+
+ def compression_option(args):
      """
      Option for specifying the compression type.
      """
      return click.option(
          "--compression",
-         default=default,
-         type=click.Choice(choices, case_sensitive=False),
-         help=f"Compress the output file (default: {default}).",
+         default=args["default"],
+         type=click.Choice(args["choices"], case_sensitive=False),
+         help=f'Compress the output file (default: {args["default"]}).',
      )

  def output_dir_option(default="."):
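compression_option now takes a single dict bundling the default value and the allowed choices. A small sketch of how a command might use it; the example command itself is hypothetical, only the decorator signature comes from this diff:

    import click
    from mldataforge.options import compression_option
    from mldataforge.compression import JSONL_COMPRESSIONS

    @click.command()
    @compression_option(JSONL_COMPRESSIONS)   # adds --compression with default "infer"
    def example(compression):
        click.echo(f"selected compression: {compression}")

    if __name__ == "__main__":
        example()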
@@ -0,0 +1,226 @@
+ import snappy
+ import struct
+ import io
+
+ __all__ = ["snappy_open"]
+
+ _CHUNK_SIZE = 8192  # default read block size
+
+ def snappy_open(filename, mode='rb', encoding='utf-8'):
+     return SnappyFile(filename, mode=mode, encoding=encoding)
+
+ class _SnappyWriteWrapper(io.RawIOBase):
+     def __init__(self, fileobj):
+         self.fileobj = fileobj
+         self.buffer = io.BytesIO()
+
+     def write(self, b):
+         if not isinstance(b, (bytes, bytearray)):
+             raise TypeError("Expected bytes")
+         self.buffer.write(b)
+         return len(b)
+
+     def flush(self):
+         data = self.buffer.getvalue()
+         if data:
+             compressed = snappy.compress(data)
+             length = struct.pack(">I", len(compressed))
+             self.fileobj.write(length + compressed)
+             self.buffer = io.BytesIO()
+         self.fileobj.flush()
+
+     def close(self):
+         self.flush()
+         self.fileobj.close()
+
+     def writable(self):
+         return True
+
+
+ # class _SnappyReadWrapper(io.RawIOBase):
+ #     def __init__(self, fileobj):
+ #         self.fileobj = fileobj
+ #         self.buffer = io.BytesIO()
+ #         self.eof = False
+
+ #     def _fill_buffer_if_needed(self, min_bytes):
+ #         self.buffer.seek(0, io.SEEK_END)
+ #         while not self.eof and self.buffer.tell() < min_bytes:
+ #             length_bytes = self.fileobj.read(4)
+ #             if not length_bytes:
+ #                 self.eof = True
+ #                 break
+ #             if len(length_bytes) < 4:
+ #                 self.eof = True  # mark as EOF even if last chunk is malformed
+ #                 break
+
+ #             try:
+ #                 length = struct.unpack(">I", length_bytes)[0]
+ #                 compressed = self.fileobj.read(length)
+ #                 if len(compressed) < length:
+ #                     self.eof = True
+ #                     break
+
+ #                 decompressed = snappy.decompress(compressed)
+ #                 self.buffer.write(decompressed)
+ #             except Exception:
+ #                 self.eof = True
+ #                 break
+
+ #         self.buffer.seek(0)
+
+ #     def read(self, size=-1):
+ #         if size == -1:
+ #             while not self.eof:
+ #                 self._fill_buffer_if_needed(_CHUNK_SIZE)
+ #             result = self.buffer.read()
+ #             self.buffer = io.BytesIO()
+ #             return result
+
+ #         self._fill_buffer_if_needed(size)
+ #         data = self.buffer.read(size)
+ #         rest = self.buffer.read()
+ #         self.buffer = io.BytesIO()
+ #         self.buffer.write(rest)
+ #         return data
+
+ #     def readable(self):
+ #         return True
+
+ #     def close(self):
+ #         self.fileobj.close()
+
+ class _SnappyReadWrapper(io.RawIOBase):
+     def __init__(self, fileobj):
+         self.fileobj = fileobj
+         self.buffer = io.BytesIO()
+         self.eof = False
+         self._autodetect_format()
+
+     def _autodetect_format(self):
+         self.fileobj.seek(0)
+         preview = self.fileobj.read()
+         try:
+             self._raw_decompressed = snappy.decompress(preview)
+             self._mode = "raw"
+             self.buffer = io.BytesIO(self._raw_decompressed)
+         except Exception:
+             self.fileobj.seek(0)
+             self._mode = "framed"
+
+     def _fill_buffer_if_needed(self, min_bytes):
+         self.buffer.seek(0, io.SEEK_END)
+         while not self.eof and self.buffer.tell() < min_bytes:
+             length_bytes = self.fileobj.read(4)
+             if not length_bytes:
+                 self.eof = True
+                 break
+             if len(length_bytes) < 4:
+                 self.eof = True
+                 break
+             try:
+                 length = struct.unpack(">I", length_bytes)[0]
+                 compressed = self.fileobj.read(length)
+                 if len(compressed) < length:
+                     self.eof = True
+                     break
+                 decompressed = snappy.decompress(compressed)
+                 self.buffer.write(decompressed)
+             except Exception:
+                 self.eof = True
+                 break
+         self.buffer.seek(0)
+
+     def read(self, size=-1):
+         if self._mode == "raw":
+             return self.buffer.read(size)
+         else:
+             if size == -1:
+                 while not self.eof:
+                     self._fill_buffer_if_needed(_CHUNK_SIZE)
+                 result = self.buffer.read()
+                 self.buffer = io.BytesIO()
+                 return result
+             else:
+                 self._fill_buffer_if_needed(size)
+                 data = self.buffer.read(size)
+                 rest = self.buffer.read()
+                 self.buffer = io.BytesIO()
+                 self.buffer.write(rest)
+                 return data
+
+     def readable(self):
+         return True
+
+     def close(self):
+         self.fileobj.close()
+
+     def tell(self):
+         return self.buffer.tell()
+
+     def seek(self, offset, whence=io.SEEK_SET):
+         return self.buffer.seek(offset, whence)
+
+ class SnappyFile:
+     def __init__(self, filename, mode='rb', encoding='utf-8'):
+         self.filename = filename
+         self.mode = mode
+         self.encoding = encoding
+         self.binary = 'b' in mode
+         raw_mode = mode.replace('t', 'b')
+         self.fileobj = open(filename, raw_mode)
+
+         if 'r' in mode:
+             self._stream = self._reader() if self.binary else io.TextIOWrapper(self._reader(), encoding=encoding)
+         elif 'w' in mode:
+             self._stream = self._writer() if self.binary else io.TextIOWrapper(self._writer(), encoding=encoding)
+         else:
+             raise ValueError("Unsupported mode: use 'rb', 'wb', 'rt', or 'wt'")
+
+     def _reader(self):
+         return _SnappyReadWrapper(self.fileobj)
+
+     def _writer(self):
+         return _SnappyWriteWrapper(self.fileobj)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.close()
+
+     def close(self):
+         if hasattr(self._stream, 'flush'):
+             self._stream.flush()
+         self._stream.close()
+
+     def flush(self):
+         if hasattr(self._stream, 'flush'):
+             self._stream.flush()
+
+     def read(self, *args, **kwargs):
+         return self._stream.read(*args, **kwargs)
+
+     def write(self, *args, **kwargs):
+         return self._stream.write(*args, **kwargs)
+
+     def readline(self, *args, **kwargs):
+         return self._stream.readline(*args, **kwargs)
+
+     def tell(self):
+         return self._stream.tell()
+
+     def seek(self, offset, whence=io.SEEK_SET):
+         return self._stream.seek(offset, whence)
+
+     def readable(self):
+         return hasattr(self._stream, "read")
+
+     def writable(self):
+         return hasattr(self._stream, "write")
+
+     def seekable(self):
+         return hasattr(self._stream, "seek")
+
+     def __iter__(self):
+         return iter(self._stream)
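A minimal round-trip sketch for the snappy helper, assuming it is importable as mldataforge.snappy; the file name is illustrative:

    # Hypothetical round-trip through snappy_open.
    from mldataforge.snappy import snappy_open

    # Write length-prefixed snappy blocks in binary mode.
    with snappy_open("sample.jsonl.snappy", mode="wb") as f:
        f.write(b'{"text": "hello"}\n')

    # Read the file back; the reader auto-detects raw vs. block-framed data.
    with snappy_open("sample.jsonl.snappy", mode="rb") as f:
        print(f.read())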
@@ -1,9 +1,6 @@
- import bz2
  import click
- from datasets import concatenate_datasets
- import gzip
+ from datasets import concatenate_datasets, load_dataset
  import json
- import lzma
  from mltiming import timing
  import pyarrow as pa
  import pyarrow.parquet as pq
@@ -12,22 +9,21 @@ import shutil
  from streaming import MDSWriter, StreamingDataset
  from tqdm import tqdm

+ from .compression import determine_compression, open_compression, pigz_compress
  from .mds import MDSBulkReader
  from .pigz import pigz_open

  __all__ = [
-     "batch_iterable",
      "check_arguments",
      "confirm_overwrite",
-     "extension",
+     "load_jsonl_files",
      "load_mds_directories",
      "save_jsonl",
      "save_mds",
      "save_parquet",
-     "use_pigz",
  ]

- def batch_iterable(iterable, batch_size):
+ def _batch_iterable(iterable, batch_size):
      batch = []
      for item in iterable:
          batch.append(item)
@@ -64,31 +60,6 @@ def confirm_overwrite(message):
      if response.lower() != 'yes':
          raise click.Abort()

- def _determine_compression(file_path, compression="infer"):
-     if compression == "infer":
-         compression = _infer_compression(file_path)
-     if compression == "none":
-         compression = None
-     return compression
-
- def extension(compression, file_path):
-     """Get the file extension for the given compression type."""
-     if compression == "infer":
-         compression = _infer_compression(file_path)
-     if compression in ("gzip", "pigz"):
-         return ".gz"
-     if compression == "bz2":
-         return ".bz2"
-     if compression == "xz":
-         return ".xz"
-     if compression == "zip":
-         return ".zip"
-     if compression == "zstd":
-         return ".zst"
-     if compression is None:
-         return ""
-     raise ValueError(f"Unsupported compression type: {compression}")
-
  def _infer_mds_encoding(value):
      """Determine the MDS encoding for a given value."""
      if isinstance(value, str):
@@ -101,22 +72,16 @@ def _infer_mds_encoding(value):
          return 'bool'
      return 'pkl'

- def _infer_compression(file_path):
-     """Infer the compression type from the file extension."""
-     extension = os.path.splitext(file_path)[1]
-     if extension.endswith('.gz'):
-         if _pigz_available():
-             return 'pigz'
-         return 'gzip'
-     if extension.endswith('.bz2'):
-         return 'bz2'
-     if extension.endswith('.xz'):
-         return 'xz'
-     if extension.endswith('.zip'):
-         return 'zip'
-     if extension.endswith('.zst'):
-         return 'zstd'
-     return None
+ def _streaming_jsonl(jsonl_files, compressions):
+     for jsonl_file, compression in tqdm(zip(jsonl_files, compressions), desc="Loading JSONL files", unit="file"):
+         for line in open_compression(jsonl_file, mode="rt", compression=compression):
+             yield json.loads(line)
+
+ def load_jsonl_files(jsonl_files):
+     compressions = [determine_compression("jsonl", jsonl_file) for jsonl_file in jsonl_files]
+     if "br" in compressions or "snappy" in compressions:
+         return _streaming_jsonl(jsonl_files, compressions)
+     return load_dataset("json", data_files=jsonl_files, split="train")

  def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True):
      if bulk:
@@ -141,50 +106,14 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
      ds = concatenate_datasets(dsets=dss)
      return ds

- def _open_jsonl(file_path, mode="rt", compression="infer", processes=64):
-     """Open a JSONL file, handling gzip compression if necessary."""
-     compression = _determine_compression(file_path, compression)
-     if compression == "gzip":
-         return gzip.open(file_path, mode)
-     if compression == "pigz":
-         return pigz_open(file_path, mode, processes=processes) if mode[0] == "w" else gzip.open(file_path, mode)
-     if compression == "bz2":
-         return bz2.open(file_path, mode)
-     if compression == "xz":
-         return lzma.open(file_path, mode)
-     if compression is None:
-         return open(file_path, mode)
-     raise ValueError(f"Unsupported compression type: {compression}")
-
- def _pigz_available():
-     """Check if pigz is available on the system."""
-     return shutil.which("pigz") is not None
-
- def _pigz_compress(input_file, output_file, processes=64, buf_size=2**24, keep=False, quiet=False):
-     """Compress a file using pigz."""
-     size = os.stat(input_file).st_size
-     num_blocks = (size+buf_size-1) // buf_size
-     with open(input_file, "rb") as f_in, pigz_open(output_file, "wb", processes=processes) as f_out:
-         for _ in tqdm(range(num_blocks), desc="Compressing with pigz", unit="block", disable=quiet):
-             buf = f_in.read(buf_size)
-             assert buf
-             f_out.write(buf)
-         buf = f_in.read()
-         assert not buf
-     if not keep:
-         os.remove(input_file)
-         if not quiet:
-             print(f"Removed {input_file}")
-
  def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True):
-     compression = _determine_compression(output_file, compression)
      f = None
      part = 0
      for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
          if f is None:
              part_file = output_file.format(part=part)
              check_arguments(part_file, overwrite, yes)
-             f= _open_jsonl(part_file, mode="wb", compression=compression, processes=processes)
+             f = open_compression(part_file, mode="wb", compression=compression, processes=processes)
          f.write(f"{json.dumps(item)}\n".encode("utf-8"))
          if size_hint is not None and f.tell() >= size_hint:
              f.close()
@@ -193,11 +122,8 @@ def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=
      if f is not None:
          f.close()

- def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pigz=False, shard_size=None, size_hint=None, overwrite=True, yes=True):
-     if compression == "none" or pigz:
-         compression = None
-     if compression == "gzip":
-         compression = "gz"
+ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pigz=True, shard_size=None, size_hint=None, overwrite=True, yes=True):
+     compression = determine_compression("mds", output_dir, compression, no_pigz=not pigz)
      writer = None
      part = 0
      files = []
@@ -216,7 +142,8 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
              writer.finish()
              part += 1
              writer = None
-     writer.finish()
+     if writer is not None:
+         writer.finish()
      if pigz:
          for output_dir in files:
              index_path = os.path.join(output_dir, "index.json")
@@ -228,7 +155,7 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
                  compressed_file_name = file_name + ".gz"
                  file_path = os.path.join(output_dir, file_name)
                  compressed_file_path = os.path.join(output_dir, compressed_file_name)
-                 _pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
+                 pigz_compress(file_path, compressed_file_path, processes, buf_size=buf_size, keep=False, quiet=True)
                  name2info[file_name]["compression"] = "gz"
                  name2info[file_name]["zip_data"] = {
                      "basename": compressed_file_name,
@@ -238,16 +165,23 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
          json.dump(index, open(index_path, "wt"))
          print(f"Compressed {output_dir} with pigz")

- def save_parquet(it, output_file, compression=None, batch_size=2**16):
+ def save_parquet(it, output_file, compression=None, batch_size=2**16, size_hint=None, overwrite=True, yes=True):
+     compression = determine_compression("parquet", output_file, compression)
      writer = None
+     part = 0
      it = tqdm(it, desc="Writing to Parquet", unit="sample")
-     for batch in batch_iterable(it, batch_size):
+     for batch in _batch_iterable(it, batch_size):
          table = pa.Table.from_pylist(batch)
          if writer is None:
-             writer = pq.ParquetWriter(output_file, table.schema, compression=compression)
+             part_file = output_file.format(part=part)
+             check_arguments(part_file, overwrite, yes)
+             writer = pq.ParquetWriter(part_file, table.schema, compression=compression)
+             offset = 0
          writer.write_table(table)
-     writer.close()
-
- def use_pigz(compression):
-     """Determine if pigz should be used based on the compression type."""
-     return compression == "pigz" or (compression == "gzip" and _pigz_available())
+         offset += table.nbytes
+         if size_hint is not None and offset >= size_hint:
+             writer.close()
+             part += 1
+             writer = None
+     if writer is not None:
+         writer.close()
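A short sketch of the reworked I/O helpers, assuming they are importable from mldataforge.utils (as suggested by the module's __all__); all file names are made up:

    # Hypothetical use of load_jsonl_files and save_jsonl.
    from mldataforge.utils import load_jsonl_files, save_jsonl

    # Brotli/snappy inputs are streamed line by line; other formats go through
    # datasets.load_dataset("json", ...).
    samples = load_jsonl_files(["shard-00.jsonl.br", "shard-01.jsonl.gz"])

    # The {part:04d} placeholder starts a new output file whenever size_hint is
    # exceeded, e.g. out-0000.jsonl.zst, out-0001.jsonl.zst, ...
    save_jsonl(
        samples,
        output_file="out-{part:04d}.jsonl.zst",
        compression="zstd",
        size_hint=2**28,
        overwrite=True,
        yes=True,
    )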
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

  [project]
  name = "mldataforge"
- version = "0.1.3"
+ version = "0.1.5"
  authors = [
      { name = "Peter Schneider-Kamp" }
  ]
@@ -19,10 +19,15 @@ classifiers = [
  ]

  dependencies = [
+     'brotlicffi',
      'click',
      'datasets',
+     'isal',
+     'lz4',
      'mltiming',
-     'mosaicml-streaming'
+     'mosaicml-streaming',
+     'python-snappy',
+     'zstandard'
  ]

  [project.optional-dependencies]
3 files without changes