mldataforge 0.1.6__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.1.6
+Version: 0.1.7
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -1,16 +1,26 @@
+from copy import deepcopy
 import json
 import numpy as np
 import os
 import shutil
 from streaming.base.compression import compress, decompress, get_compression_extension, is_compression
+from streaming.base.format import _readers
+from streaming.base.format.base.reader import FileInfo, JointReader
 from streaming.base.format.index import get_index_basename
-from streaming.base.format.mds.encodings import mds_decode, mds_encode, is_mds_encoding, get_mds_encodings, get_mds_encoded_size
+from streaming.base.format.mds.encodings import mds_decode, mds_encode, is_mds_encoding, is_mds_encoding_safe, get_mds_encodings, get_mds_encoded_size
 from streaming.base.hashing import get_hash, is_hash
 from streaming.base.util import bytes_to_int
 from typing import Any, Optional, Generator, Self, Union
 
 from .utils import open_compression
 
+__all__ = [
+    "MDSBulkReader",
+    "MDSBulkShardReader",
+    "MDSReader",
+    "MDSWriter",
+]
+
 class MDSBulkReader:
     def __init__(
         self,
@@ -37,11 +47,11 @@ class MDSBulkReader:
 
     def __iter__(self) -> Generator[dict[str, Any], None, None]:
         for shard in self.shards:
-            with MDSShardReader(**shard) as reader:
+            with MDSBulkShardReader(**shard) as reader:
                 for sample in reader:
                     yield sample
 
-class MDSShardReader:
+class MDSBulkShardReader:
     def __init__(
         self,
         filename: str,
@@ -94,7 +104,7 @@ class MDSShardReader:
         for i in range(self.samples):
             yield self.get_item(i)
 
-    def __enter__(self) -> "MDSShardReader":
+    def __enter__(self) -> "MDSBulkShardReader":
         return self
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
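
Note (not part of the diff): the shard-level reader has been renamed from MDSShardReader to MDSBulkShardReader, and the public classes are now exported via __all__. A minimal, hedged sketch of the bulk reading path follows; the constructor arguments are assumptions inferred from the load_mds_directories(mds_directories, split='.', bulk=True, ...) signature further down, and only the iteration protocol (one dict per sample) is shown directly by this diff.

# Hedged sketch, not part of the released code.
reader = MDSBulkReader(["dataset/mds"], split=".")  # hypothetical arguments
for sample in reader:                               # MDSBulkReader.__iter__ yields dicts,
    print(sample)                                   # one per sample, shard by shard
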
@@ -315,3 +325,124 @@ class MDSWriter:
 
     def __exit__(self, exc_type, exc, traceback):
         self.finish()
+
+class MDSReader(JointReader):
+
+    def __init__(
+        self,
+        dirname: str,
+        split: Optional[str],
+        column_encodings: list[str],
+        column_names: list[str],
+        column_sizes: list[Optional[int]],
+        compression: Optional[str],
+        hashes: list[str],
+        raw_data: FileInfo,
+        samples: int,
+        size_limit: Optional[Union[int, str]],
+        zip_data: Optional[FileInfo],
+    ) -> None:
+        self.sample_compression = None
+        if compression and compression.startswith("sample::"):
+            compression, self.sample_compression = None, compression.removeprefix("sample::")
+        super().__init__(dirname, split, compression, hashes, raw_data, samples, size_limit,
+                         zip_data)
+        self.column_encodings = column_encodings
+        self.column_names = column_names
+        self.column_sizes = column_sizes
+
+    @classmethod
+    def from_json(cls, dirname: str, split: Optional[str], obj: dict[str, Any]) -> Self:
+        """Initialize from JSON object.
+
+        Args:
+            dirname (str): Local directory containing shards.
+            split (str, optional): Which dataset split to use, if any.
+            obj (Dict[str, Any]): JSON object to load.
+
+        Returns:
+            Self: Loaded MDSReader.
+        """
+        args = deepcopy(obj)
+        args_version = args['version']
+        if args_version != 2:
+            raise ValueError(
+                f'Unsupported streaming data version: {args_version}. Expected version 2.')
+        del args['version']
+        args_format = args['format']
+        if args_format != 'mds':
+            raise ValueError(f'Unsupported data format: {args_format}. Expected to be `mds`.')
+        del args['format']
+        args['dirname'] = dirname
+        args['split'] = split
+        for key in ['raw_data', 'zip_data']:
+            arg = args[key]
+            args[key] = FileInfo(**arg) if arg else None
+        return cls(**args)
+
+    def validate(self, allow_unsafe_types: bool) -> None:
+        """Check whether this shard is acceptable to be part of some Stream.
+
+        Args:
+            allow_unsafe_types (bool): If a shard contains Pickle, which allows arbitrary code
+                execution during deserialization, whether to keep going if ``True`` or raise an
+                error if ``False``.
+        """
+        if not allow_unsafe_types:
+            for column_id, encoding in enumerate(self.column_encodings):
+                if not is_mds_encoding_safe(encoding):
+                    name = self.column_names[column_id]
+                    raise ValueError(f'Column {name} contains an unsafe type: {encoding}. To ' +
+                                     f'proceed anyway, set ``allow_unsafe_types=True``.')
+
+    def decode_sample(self, data: bytes) -> dict[str, Any]:
+        """Decode a sample dict from bytes.
+
+        Args:
+            data (bytes): The sample encoded as bytes.
+
+        Returns:
+            Dict[str, Any]: Sample dict.
+        """
+        sizes = []
+        idx = 0
+        for key, size in zip(self.column_names, self.column_sizes):
+            if size:
+                sizes.append(size)
+            else:
+                size, = np.frombuffer(data[idx:idx + 4], np.uint32)
+                sizes.append(size)
+                idx += 4
+        sample = {}
+        for key, encoding, size in zip(self.column_names, self.column_encodings, sizes):
+            value = data[idx:idx + size]
+            sample[key] = mds_decode(encoding, value)
+            idx += size
+        return sample
+
+    def get_sample_data(self, idx: int) -> bytes:
+        """Get the raw sample data at the index.
+
+        Args:
+            idx (int): Sample index.
+
+        Returns:
+            bytes: Sample data.
+        """
+        filename = os.path.join(self.dirname, self.split, self.raw_data.basename)
+        offset = (1 + idx) * 4
+        with open(filename, 'rb', 0) as fp:
+            fp.seek(offset)
+            pair = fp.read(8)
+            begin, end = np.frombuffer(pair, np.uint32)
+            fp.seek(begin)
+            data = fp.read(end - begin)
+        if not data:
+            raise IndexError(
+                f'Relative sample index {idx} is not present in the {self.raw_data.basename} file.'
+            )
+        if self.sample_compression:
+            data = decompress(self.sample_compression, data)
+        return data
+
+_readers["mds"] = MDSReader
@@ -23,6 +23,11 @@ __all__ = [
     "save_parquet",
 ]
 
+_NO_PROGESS = False
+def set_progress(value):
+    global _NO_PROGESS
+    _NO_PROGESS = value
+
 def _batch_iterable(iterable, batch_size):
     batch = []
     for item in iterable:
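
Note (not part of the diff): the new module-level flag _NO_PROGESS is fed to every tqdm call below via disable=, and set_progress toggles it. Mind the inverted sense: set_progress(True) switches the progress bars off. A hedged usage sketch follows; the module path is an assumption, since the diff does not show file names.

# Hedged usage sketch, not part of the released code.
from mldataforge.utils import save_jsonl, set_progress  # module path is an assumption

samples = [{"text": "hello"}, {"text": "world"}]         # toy samples
set_progress(True)                                       # True disables the tqdm bars
save_jsonl(samples, "out-{part:04d}.jsonl")              # {part} is filled in by save_jsonl
set_progress(False)                                      # restore progress output
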
@@ -73,7 +78,7 @@ def _infer_mds_encoding(value):
     return 'pkl'
 
 def _streaming_jsonl(jsonl_files, compressions):
-    for jsonl_file, compression in tqdm(zip(jsonl_files, compressions), desc="Loading JSONL files", unit="file"):
+    for jsonl_file, compression in tqdm(zip(jsonl_files, compressions), desc="Loading JSONL files", unit="file", disable=_NO_PROGESS):
         for line in open_compression(jsonl_file, mode="rt", compression=compression):
             yield json.loads(line)
 
@@ -109,7 +114,7 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
 def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True):
     f = None
     part = 0
-    for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
+    for item in tqdm(iterable, desc="Writing to JSONL", unit="sample", disable=_NO_PROGESS):
         if f is None:
             part_file = output_file.format(part=part)
             check_arguments(part_file, overwrite, yes)
@@ -127,7 +132,7 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
     writer = None
     part = 0
     files = []
-    for sample in tqdm(it, desc="Writing to MDS", unit="sample"):
+    for sample in tqdm(it, desc="Writing to MDS", unit="sample", disable=_NO_PROGESS):
         if writer is None:
             part_dir = output_dir.format(part=part)
             check_arguments(part_dir, overwrite, yes)
@@ -151,7 +156,7 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
         name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
         file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
         assert set(file_names) == set(name2info.keys())
-        for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
+        for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file", disable=_NO_PROGESS):
             compressed_file_name = file_name + ".gz"
             file_path = os.path.join(output_dir, file_name)
             compressed_file_path = os.path.join(output_dir, compressed_file_name)
@@ -169,7 +174,7 @@ def save_parquet(it, output_file, compression=None, batch_size=2**16, size_hint=
     compression = determine_compression("parquet", output_file, compression)
     writer = None
     part = 0
-    it = tqdm(it, desc="Writing to Parquet", unit="sample")
+    it = tqdm(it, desc="Writing to Parquet", unit="sample", disable=_NO_PROGESS)
     for batch in _batch_iterable(it, batch_size):
         table = pa.Table.from_pylist(batch)
         if writer is None:
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.1.6"
+version = "0.1.7"
 authors = [
     { name = "Peter Schneider-Kamp" }
 ]
File without changes
File without changes
File without changes