mldataforge 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO +1 -2
  2. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py +1 -6
  3. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py +6 -4
  4. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py +3 -2
  5. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py +3 -2
  6. mldataforge-0.2.3/mldataforge/indexing.py +25 -0
  7. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py +12 -0
  8. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py +4 -23
  9. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py +10 -1
  10. {mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml +1 -2
  11. {mldataforge-0.2.1 → mldataforge-0.2.3}/.gitignore +0 -0
  12. {mldataforge-0.2.1 → mldataforge-0.2.3}/LICENSE +0 -0
  13. {mldataforge-0.2.1 → mldataforge-0.2.3}/README.md +0 -0
  14. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/__main__.py +0 -0
  15. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/__init__.py +0 -0
  16. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/__init__.py +0 -0
  17. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/jsonl.py +0 -0
  18. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/parquet.py +0 -0
  19. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/compression.py +0 -0
  20. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/mds.py +0 -0
  21. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/pigz.py +0 -0
  22. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/snappy.py +0 -0
{mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.2.1
+Version: 0.2.3
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -10,7 +10,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.12
-Requires-Dist: brotlicffi
 Requires-Dist: click
 Requires-Dist: datasets
 Requires-Dist: isal
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py
@@ -1,4 +1,4 @@
-import brotlicffi as brotli
+import brotli
 import io
 
 __all__ = ["brotli_open"]
@@ -6,11 +6,6 @@ __all__ = ["brotli_open"]
 def brotli_open(filename, mode='rb', encoding='utf-8', compress_level=11):
     return BrotliFile(filename, mode=mode, encoding=encoding, compress_level=11)
 
-import brotlicffi as brotli
-import io
-
-__all__ = ["brotli_open"]
-
 class BrotliFile:
     def __init__(self, filename, mode='rb', encoding='utf-8', compress_level=11):
         self.filename = filename
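
For the brotlicffi → brotli switch above, the two packages expose the same module-level API for the calls in question, so the import swap is a drop-in change. A minimal round-trip sketch (the payload is made up):

import brotli

payload = b'{"text": "hello world"}' * 100
compressed = brotli.compress(payload, quality=11)  # quality=11 is brotli's maximum compression setting
assert brotli.decompress(compressed) == payload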
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py
@@ -20,12 +20,13 @@ def mds():
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def jsonl(**kwargs):
     mds_to_jsonl(**kwargs)
-def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_jsonl(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         processes=processes,
@@ -41,12 +42,13 @@ def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def parquet(**kwargs):
     mds_to_parquet(**kwargs)
-def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_parquet(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         batch_size=batch_size,
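
The same shuffle parameter is threaded through the join and split commands below. As a hedged usage sketch, the underlying converter can also be called directly from Python; the argument values here are invented, while the keyword names and the bulk/shuffle interaction come from this diff (see utils.py further down):

from mldataforge.commands.convert.mds import mds_to_jsonl

# Hypothetical example values; only the parameter names are taken from the diff.
mds_to_jsonl(
    output_file="dataset.jsonl.gz",
    mds_directories=["mds/part-0000", "mds/part-0001"],
    compression=None,
    processes=4,
    overwrite=True,
    yes=True,
    batch_size=2**16,
    no_bulk=True,   # shuffling needs the non-bulk reader (bulk + shuffle raises ValueError)
    trafo=None,
    shuffle=42,     # seed for a deterministic permutation; None disables shuffling
)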
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py
@@ -43,13 +43,14 @@ def join_jsonl(output_file, jsonl_files, compression, processes, overwrite, yes,
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(**kwargs):
     print(kwargs)
     join_mds(**kwargs)
-def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo):
+def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     check_arguments(output_dir, overwrite, yes, mds_directories)
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir,
         processes=processes,
         compression=compression,
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py
@@ -50,11 +50,12 @@ def split_jsonl(jsonl_files, prefix, output_dir, size_hint, compression, process
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(*args, **kwargs):
     split_mds(*args, **kwargs)
-def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo):
+def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir=f"{output_dir}/{prefix}{{part:04d}}",
         processes=processes,
         compression=compression,
mldataforge-0.2.3/mldataforge/indexing.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+__all__ = ['IndexedDatasetView', 'shuffle_permutation']
+
+class IndexedDatasetView:
+    def __init__(self, dataset, indices):
+        self.dataset = dataset
+        self.indices = list(indices)  # ensure repeatable access
+
+    def __iter__(self):
+        for idx in self.indices:
+            yield self.dataset[idx]
+
+    def __len__(self):
+        return len(self.indices)
+
+def shuffle_permutation(n, seed=int):
+    rng = np.random.default_rng(seed)
+    return rng.permutation(n)
+
+def reverse_permutation(indices):
+    n = len(indices)
+    reverse_indices = np.empty(n, dtype=int)
+    reverse_indices[indices] = np.arange(n)
+    return reverse_indices
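
A quick, self-contained check of what these helpers do: shuffle_permutation draws a deterministic permutation from a seed, and reverse_permutation computes its inverse, which is what lets a negative --shuffle seed undo an earlier shuffle with the positive seed. This is a sketch with a toy array, not library code:

import numpy as np

def shuffle_permutation(n, seed):
    return np.random.default_rng(seed).permutation(n)

def reverse_permutation(indices):
    rev = np.empty(len(indices), dtype=int)
    rev[indices] = np.arange(len(indices))
    return rev

data = np.arange(5) * 10                       # stands in for a dataset: [0, 10, 20, 30, 40]
idx = shuffle_permutation(len(data), seed=42)
shuffled = data[idx]                           # same order IndexedDatasetView(data, idx) iterates
restored = shuffled[reverse_permutation(idx)]  # the inverse permutation restores the original order
assert (restored == data).all()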
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py
@@ -13,6 +13,7 @@ __all__ = [
     "processes_option",
     "prefix_option",
     "shard_size_option",
+    "shuffle_option",
     "size_hint_option",
     "trafo_option",
     "yes_option",
@@ -120,6 +121,17 @@ def shard_size_option(default=2**26):
         help=f"Shard size for the dataset (default: {default}).",
     )
 
+def shuffle_option():
+    """
+    Option for specifying whether to shuffle the dataset by providing a random seed.
+    """
+    return click.option(
+        "--shuffle",
+        default=None,
+        type=int,
+        help="Shuffle the dataset by providing a random seed.",
+    )
+
 def size_hint_option(default=2**26):
     """
     Option for specifying the size hint.
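
shuffle_option follows the option-factory pattern used throughout options.py: a zero-argument function that returns a click.option decorator, so any command can opt in with a single decorator line. A minimal standalone sketch of that pattern (the demo command is hypothetical):

import click

def shuffle_option():
    # Same pattern as the factory added above: return a reusable click option.
    return click.option(
        "--shuffle",
        default=None,
        type=int,
        help="Shuffle the dataset by providing a random seed.",
    )

@click.command()
@shuffle_option()
def demo(shuffle):
    # shuffle is None when the flag is omitted, otherwise the integer seed.
    click.echo(f"shuffle={shuffle!r}")

if __name__ == "__main__":
    demo()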
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py
@@ -1,18 +1,6 @@
 import re
 
-__all__ = ['IndexedDatasetView', 'Transformation', 'Transformations', 'flatten_json', 'unflatten_json']
-
-class IndexedDatasetView:
-    def __init__(self, dataset, indices):
-        self.dataset = dataset
-        self.indices = list(indices)  # ensure repeatable access
-
-    def __iter__(self):
-        for idx in self.indices:
-            yield self.dataset[idx]
-
-    def __len__(self):
-        return len(self.indices)
+__all__ = ['Transformation', 'Transformations', 'flatten_json', 'identity', 'unflatten_json']
 
 class Transformation:
     def __init__(self, code: str):
@@ -56,33 +44,23 @@ class Transformation:
             return self._last_input_len
         raise TypeError("Length is not available for this transformation.")
 
-
 class Transformations:
     def __init__(self, codes: list[str], indices=None):
         self.pipeline = [Transformation(code) for code in codes]
-        self.indices = indices  # Optional index iterable
 
     def __call__(self, dataset):
-        # Wrap dataset with IndexedDatasetView if indices are provided
-        if self.indices is not None:
-            dataset = IndexedDatasetView(dataset, self.indices)
-
         result = dataset
         for transform in self.pipeline:
             result = transform(result)
         return result
 
     def __len__(self):
-        # Return the input length to the pipeline
         if self.indices is not None:
             return len(self.indices)
         elif hasattr(self.pipeline[0], '_last_input_len') and self.pipeline[0]._last_input_len is not None:
             return self.pipeline[0]._last_input_len
         raise TypeError("Transformations length is not available until __call__ is used on a sized input.")
 
-def identity(obj):
-    return obj
-
 def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
     def escape(key):
         return key.replace(escape_char, escape_char * 2)\
@@ -110,6 +88,9 @@ def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
         items.append((parent_key, obj))
     return dict(items)
 
+def identity(obj):
+    return obj
+
 def unflatten_json(flat_dict, sep='.', escape_char='\\'):
     def check_flat_json(obj):
         assert isinstance(obj, dict), "Input must be a dictionary"
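
For context on the flatten_json/unflatten_json pair that identity now sits between: the signatures suggest dotted-key flattening of nested dicts. A simplified, self-contained illustration of that idea follows (escape handling and list support omitted; this is not the library implementation):

def flatten(obj, parent_key="", sep="."):
    # Recursively map {"a": {"b": 1}} to {"a.b": 1}.
    items = {}
    if isinstance(obj, dict) and obj:
        for key, value in obj.items():
            new_key = f"{parent_key}{sep}{key}" if parent_key else key
            items.update(flatten(value, new_key, sep))
    else:
        items[parent_key] = obj
    return items

def unflatten(flat, sep="."):
    # Rebuild the nested structure from dotted keys.
    nested = {}
    for key, value in flat.items():
        parts = key.split(sep)
        node = nested
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested

doc = {"meta": {"id": 7, "tags": {"lang": "en"}}, "text": "hello"}
assert unflatten(flatten(doc)) == doc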
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py
@@ -10,6 +10,7 @@ from streaming import StreamingDataset
 from tqdm import tqdm
 
 from .compression import determine_compression, open_compression, pigz_compress
+from .indexing import IndexedDatasetView, reverse_permutation, shuffle_permutation
 from .mds import MDSBulkReader, MDSWriter
 from .pigz import pigz_open
 from .trafos import Transformations
@@ -89,7 +90,9 @@ def load_jsonl_files(jsonl_files):
         return _streaming_jsonl(jsonl_files, compressions)
     return load_dataset("json", data_files=jsonl_files, split="train")
 
-def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True):
+def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True, shuffle=None):
+    if bulk and shuffle is not None:
+        raise ValueError("Bulk reader does not support shuffling by design.")
     if bulk:
         return MDSBulkReader(mds_directories, split=split)
     dss = []
@@ -110,6 +113,12 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
     else:
         with timing(message=f"Concatenating {len(dss)} datasets"):
             ds = concatenate_datasets(dsets=dss)
+    if shuffle is not None:
+        with timing(message="Creating shuffle indices"):
+            indices = shuffle_permutation(len(ds), seed=abs(shuffle))
+            if shuffle < 0:
+                indices = reverse_permutation(indices)
+            ds = IndexedDatasetView(ds, indices)
     return ds
 
 def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True, trafo=None):
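
A hedged usage sketch of the new shuffle parameter in load_mds_directories; the directory names are made up, while the signature, the bulk/shuffle ValueError, and the negative-seed behaviour are taken from the hunks above:

from mldataforge.utils import load_mds_directories

# Shuffle with a fixed seed; bulk=False is required because the bulk reader cannot shuffle.
ds = load_mds_directories(["mds/part-0000", "mds/part-0001"], bulk=False, shuffle=42)

# The default bulk reader rejects shuffling outright.
try:
    load_mds_directories(["mds/part-0000"], shuffle=42)  # bulk defaults to True
except ValueError:
    pass  # "Bulk reader does not support shuffling by design."

# A negative seed selects the inverse permutation of abs(seed), so shuffling a dataset
# with shuffle=42 and then processing the result with shuffle=-42 restores the original order.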
{mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.2.1"
+version = "0.2.3"
 authors = [
     { name = "Peter Schneider-Kamp" }
 ]
@@ -19,7 +19,6 @@ classifiers = [
 ]
 
 dependencies = [
-    'brotlicffi',
     'click',
     'datasets',
     'isal',