replay-rec 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. replay/__init__.py +1 -1
  2. replay/data/dataset.py +11 -0
  3. replay/data/nn/__init__.py +3 -0
  4. replay/data/nn/parquet/__init__.py +22 -0
  5. replay/data/nn/parquet/collate.py +29 -0
  6. replay/data/nn/parquet/constants/__init__.py +0 -0
  7. replay/data/nn/parquet/constants/batches.py +8 -0
  8. replay/data/nn/parquet/constants/device.py +3 -0
  9. replay/data/nn/parquet/constants/filesystem.py +3 -0
  10. replay/data/nn/parquet/constants/metadata.py +5 -0
  11. replay/data/nn/parquet/fixed_batch_dataset.py +157 -0
  12. replay/data/nn/parquet/impl/__init__.py +0 -0
  13. replay/data/nn/parquet/impl/array_1d_column.py +140 -0
  14. replay/data/nn/parquet/impl/array_2d_column.py +160 -0
  15. replay/data/nn/parquet/impl/column_protocol.py +17 -0
  16. replay/data/nn/parquet/impl/indexing.py +123 -0
  17. replay/data/nn/parquet/impl/masking.py +20 -0
  18. replay/data/nn/parquet/impl/named_columns.py +100 -0
  19. replay/data/nn/parquet/impl/numeric_column.py +110 -0
  20. replay/data/nn/parquet/impl/utils.py +17 -0
  21. replay/data/nn/parquet/info/__init__.py +0 -0
  22. replay/data/nn/parquet/info/distributed_info.py +40 -0
  23. replay/data/nn/parquet/info/partitioning.py +132 -0
  24. replay/data/nn/parquet/info/replicas.py +67 -0
  25. replay/data/nn/parquet/info/worker_info.py +43 -0
  26. replay/data/nn/parquet/iterable_dataset.py +119 -0
  27. replay/data/nn/parquet/iterator.py +61 -0
  28. replay/data/nn/parquet/metadata/__init__.py +19 -0
  29. replay/data/nn/parquet/metadata/metadata.py +116 -0
  30. replay/data/nn/parquet/parquet_dataset.py +176 -0
  31. replay/data/nn/parquet/parquet_module.py +178 -0
  32. replay/data/nn/parquet/partitioned_iterable_dataset.py +56 -0
  33. replay/data/nn/parquet/utils/__init__.py +0 -0
  34. replay/data/nn/parquet/utils/compute_length.py +66 -0
  35. replay/data/nn/schema.py +12 -14
  36. replay/data/nn/sequence_tokenizer.py +5 -0
  37. replay/data/nn/sequential_dataset.py +4 -0
  38. replay/data/nn/torch_sequential_dataset.py +5 -0
  39. replay/data/utils/__init__.py +0 -0
  40. replay/data/utils/batching.py +69 -0
  41. replay/data/utils/typing/__init__.py +0 -0
  42. replay/data/utils/typing/dtype.py +65 -0
  43. replay/metrics/torch_metrics_builder.py +20 -14
  44. replay/models/nn/loss/sce.py +2 -7
  45. replay/models/nn/optimizer_utils/__init__.py +6 -1
  46. replay/models/nn/optimizer_utils/optimizer_factory.py +15 -0
  47. replay/models/nn/sequential/bert4rec/dataset.py +70 -29
  48. replay/models/nn/sequential/bert4rec/lightning.py +97 -36
  49. replay/models/nn/sequential/bert4rec/model.py +11 -11
  50. replay/models/nn/sequential/callbacks/prediction_callbacks.py +50 -8
  51. replay/models/nn/sequential/callbacks/validation_callback.py +23 -6
  52. replay/models/nn/sequential/compiled/base_compiled_model.py +12 -4
  53. replay/models/nn/sequential/compiled/bert4rec_compiled.py +15 -5
  54. replay/models/nn/sequential/compiled/sasrec_compiled.py +16 -7
  55. replay/models/nn/sequential/postprocessors/_base.py +5 -0
  56. replay/models/nn/sequential/postprocessors/postprocessors.py +4 -0
  57. replay/models/nn/sequential/sasrec/dataset.py +81 -26
  58. replay/models/nn/sequential/sasrec/lightning.py +86 -24
  59. replay/models/nn/sequential/sasrec/model.py +14 -9
  60. replay/nn/__init__.py +8 -0
  61. replay/nn/agg.py +109 -0
  62. replay/nn/attention.py +158 -0
  63. replay/nn/embedding.py +283 -0
  64. replay/nn/ffn.py +135 -0
  65. replay/nn/head.py +49 -0
  66. replay/nn/lightning/__init__.py +1 -0
  67. replay/nn/lightning/callback/__init__.py +9 -0
  68. replay/nn/lightning/callback/metrics_callback.py +183 -0
  69. replay/nn/lightning/callback/predictions_callback.py +314 -0
  70. replay/nn/lightning/module.py +123 -0
  71. replay/nn/lightning/optimizer.py +60 -0
  72. replay/nn/lightning/postprocessor/__init__.py +2 -0
  73. replay/nn/lightning/postprocessor/_base.py +51 -0
  74. replay/nn/lightning/postprocessor/seen_items.py +83 -0
  75. replay/nn/lightning/scheduler.py +91 -0
  76. replay/nn/loss/__init__.py +22 -0
  77. replay/nn/loss/base.py +197 -0
  78. replay/nn/loss/bce.py +216 -0
  79. replay/nn/loss/ce.py +317 -0
  80. replay/nn/loss/login_ce.py +373 -0
  81. replay/nn/loss/logout_ce.py +230 -0
  82. replay/nn/mask.py +87 -0
  83. replay/nn/normalization.py +9 -0
  84. replay/nn/output.py +37 -0
  85. replay/nn/sequential/__init__.py +9 -0
  86. replay/nn/sequential/sasrec/__init__.py +7 -0
  87. replay/nn/sequential/sasrec/agg.py +53 -0
  88. replay/nn/sequential/sasrec/diff_transformer.py +125 -0
  89. replay/nn/sequential/sasrec/model.py +377 -0
  90. replay/nn/sequential/sasrec/transformer.py +107 -0
  91. replay/nn/sequential/twotower/__init__.py +2 -0
  92. replay/nn/sequential/twotower/model.py +674 -0
  93. replay/nn/sequential/twotower/reader.py +89 -0
  94. replay/nn/transform/__init__.py +22 -0
  95. replay/nn/transform/copy.py +38 -0
  96. replay/nn/transform/grouping.py +39 -0
  97. replay/nn/transform/negative_sampling.py +182 -0
  98. replay/nn/transform/next_token.py +100 -0
  99. replay/nn/transform/rename.py +33 -0
  100. replay/nn/transform/reshape.py +41 -0
  101. replay/nn/transform/sequence_roll.py +48 -0
  102. replay/nn/transform/template/__init__.py +2 -0
  103. replay/nn/transform/template/sasrec.py +53 -0
  104. replay/nn/transform/template/twotower.py +22 -0
  105. replay/nn/transform/token_mask.py +69 -0
  106. replay/nn/transform/trim.py +51 -0
  107. replay/nn/utils.py +28 -0
  108. replay/preprocessing/filters.py +128 -0
  109. replay/preprocessing/label_encoder.py +36 -33
  110. replay/preprocessing/utils.py +209 -0
  111. replay/splitters/__init__.py +1 -0
  112. replay/splitters/random_next_n_splitter.py +224 -0
  113. replay/utils/common.py +10 -4
  114. {replay_rec-0.20.3.dist-info → replay_rec-0.21.0.dist-info}/METADATA +3 -3
  115. replay_rec-0.21.0.dist-info/RECORD +223 -0
  116. replay_rec-0.20.3.dist-info/RECORD +0 -138
  117. {replay_rec-0.20.3.dist-info → replay_rec-0.21.0.dist-info}/WHEEL +0 -0
  118. {replay_rec-0.20.3.dist-info → replay_rec-0.21.0.dist-info}/licenses/LICENSE +0 -0
  119. {replay_rec-0.20.3.dist-info → replay_rec-0.21.0.dist-info}/licenses/NOTICE +0 -0
replay/data/nn/parquet/iterable_dataset.py
@@ -0,0 +1,119 @@
+ from collections.abc import Iterator
+ from typing import Optional
+
+ import torch
+ import torch.utils.data as data
+
+ from replay.data.nn.parquet import DEFAULT_REPLICAS_INFO
+ from replay.data.utils.batching import UniformBatching, uniform_batch_count
+
+ from .impl.named_columns import NamedColumns
+ from .info.partitioning import Partitioning, partitioning_per_replica
+ from .info.replicas import ReplicasInfoProtocol
+
+ Batch = dict[str, torch.Tensor]
+
+
+ def validate_batch_size(batch_size: int) -> int:
+     if batch_size <= 0:
+         msg = f"batch_size must be a positive integer. Got {batch_size=}"
+         raise ValueError(msg)
+     return batch_size
+
+
+ class IterableDataset(data.IterableDataset):
+     """
+     An iterable dataset used for processing a single partition of data.
+     Supports distributed training, where data is divided between replicas, and reproducible random shuffling.
+
+     A replica is a worker or a set of workers to which a unique chunk of data is assigned
+     during distributed training/inference.
+     """
+
+     def __init__(
+         self,
+         named_columns: NamedColumns,
+         batch_size: int,
+         generator: Optional[torch.Generator] = None,
+         replicas_info: ReplicasInfoProtocol = DEFAULT_REPLICAS_INFO,
+     ) -> None:
+         """
+         :param named_columns: Structured data presented as columns.
+         :param batch_size: Batch size.
+         :param generator: Random number generator for batch shuffling.
+             If ``None``, shuffling will be disabled. Default: ``None``.
+         :param replicas_info: A connector object capable of fetching the total replica count and the replica id at runtime.
+             Default: value of ``DEFAULT_REPLICAS_INFO`` - a pre-built connector which assumes standard Torch DDP mode.
+         """
+         super().__init__()
+
+         self.named_columns = named_columns
+         self.generator = generator
+         self.replicas_info = replicas_info
+         self.batch_size = validate_batch_size(batch_size)
+
+     @property
+     def device(self) -> torch.device:
+         """Returns the device containing the dataset."""
+         return self.named_columns.device
+
+     @property
+     def full_length(self) -> int:
+         """Returns the total number of elements in `named_columns`."""
+         return self.named_columns.length
+
+     @property
+     def length_per_replica(self) -> int:
+         """Returns the total number of available elements per replica."""
+         full_length = self.named_columns.length
+         num_replicas = self.replicas_info.num_replicas
+         return partitioning_per_replica(full_length, num_replicas)
+
+     @property
+     def length(self) -> int:
+         """Returns the total number of batches available to the current replica."""
+         batch_size = self.batch_size
+         per_replica = self.length_per_replica
+         return uniform_batch_count(per_replica, batch_size)
+
+     def __len__(self) -> int:
+         """Returns the total number of batches in the dataset."""
+         return self.length
+
+     def get_indices(self) -> torch.LongTensor:
+         """
+         Generates the indices corresponding to the data assigned to the current replica.
+
+         :return: tensor containing the relevant indices.
+         """
+         partitioning = Partitioning(
+             curr_replica=self.replicas_info.curr_replica,
+             num_replicas=self.replicas_info.num_replicas,
+             device=self.named_columns.device,
+             generator=self.generator,
+         )
+         indices = partitioning(self.full_length)
+         assert self.length_per_replica == torch.numel(indices)
+         return indices
+
+     def get_batching(self) -> UniformBatching:
+         """
+         Creates a partitioning object which splits the data into batches.
+
+         :return: The partitioning object.
+         """
+         batching = UniformBatching(
+             length=self.length_per_replica,
+             batch_size=self.batch_size,
+         )
+         assert len(batching) == self.length
+         return batching
+
+     def __iter__(self) -> Iterator[Batch]:
+         """Batched data iterator."""
+         batching = self.get_batching()
+         indices = self.get_indices()
+
+         for first, last in iter(batching):
+             batch_ids = indices[first:last]
+             yield self.named_columns[batch_ids]
replay/data/nn/parquet/iterator.py
@@ -0,0 +1,61 @@
+ from collections.abc import Iterator
+ from typing import Any, Callable, Optional
+
+ import pyarrow.dataset as da
+ import torch
+
+ from replay.data.nn.parquet.constants.device import DEFAULT_DEVICE
+ from replay.data.nn.parquet.impl.masking import DEFAULT_MAKE_MASK_NAME
+
+ from .impl.array_1d_column import to_array_1d_columns
+ from .impl.array_2d_column import to_array_2d_columns
+ from .impl.named_columns import NamedColumns
+ from .impl.numeric_column import to_numeric_columns
+ from .metadata import Metadata
+
+
+ class BatchesIterator:
+     """Iterator for batch-wise extraction of data from a Parquet dataset with conversion to structured columns."""
+
+     def __init__(
+         self,
+         metadata: Metadata,
+         dataset: da.Dataset,
+         batch_size: int,
+         make_mask_name: Callable[[str], str] = DEFAULT_MAKE_MASK_NAME,
+         device: torch.device = DEFAULT_DEVICE,
+         pyarrow_kwargs: Optional[dict[str, Any]] = None,
+     ) -> None:
+         """
+         :param metadata: Metadata describing the structure and types of the input data.
+         :param dataset: PyArrow dataset implementing the ``to_batches`` method.
+         :param batch_size: Batch size sampled from a single partition.
+             The resulting batch will not always match it in size due to mismatches between
+             the target batch size and the partition size.
+         :param make_mask_name: Mask name generation function. Default: value of ``DEFAULT_MAKE_MASK_NAME``.
+         :param device: The device on which the data will be generated. Default: value of ``DEFAULT_DEVICE``.
+         :param pyarrow_kwargs: Additional parameters for the PyArrow dataset's ``to_batches`` method. Default: ``None``.
+         """
+         if pyarrow_kwargs is None:
+             pyarrow_kwargs = {}
+         self.dataset = dataset
+         self.metadata = metadata
+         self.batch_size = batch_size
+         self.make_mask_name = make_mask_name
+         self.device = device
+         self.pyarrow_kwargs = pyarrow_kwargs
+
+     def __iter__(self) -> Iterator[NamedColumns]:
+         for batch in self.dataset.to_batches(
+             batch_size=self.batch_size,
+             columns=list(self.metadata.keys()),
+             **self.pyarrow_kwargs,
+         ):
+             yield NamedColumns(
+                 columns={
+                     **to_numeric_columns(batch, self.metadata, self.device),
+                     **to_array_1d_columns(batch, self.metadata, self.device),
+                     **to_array_2d_columns(batch, self.metadata, self.device),
+                 },
+                 make_mask_name=self.make_mask_name,
+             )
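The two building blocks above fit together as follows: `BatchesIterator` turns partition-sized PyArrow record batches into `NamedColumns`, and `IterableDataset` slices a single `NamedColumns` partition into shuffled batches on the current replica. A minimal orientation sketch (the data path, column names, and the ``shape``/``padding`` metadata keys are illustrative assumptions, not values taken from the package):

import pyarrow.dataset as ds
import torch

from replay.data.nn.parquet.iterable_dataset import IterableDataset
from replay.data.nn.parquet.iterator import BatchesIterator

# Hypothetical encoded Parquet data: a scalar "user_id" column and a padded
# 1-D "item_ids" sequence column of length 100.
metadata = {
    "user_id": {},
    "item_ids": {"shape": 100, "padding": 0},
}

partitions = BatchesIterator(
    metadata=metadata,
    dataset=ds.dataset("/data/train.parquet", format="parquet"),  # hypothetical path
    batch_size=2**17,  # partition-sized record batches
)

for named_columns in partitions:
    # Each partition is further split into shuffled batches by IterableDataset.
    dataset = IterableDataset(
        named_columns=named_columns,
        batch_size=256,
        generator=torch.Generator().manual_seed(42),  # None disables shuffling
    )
    for batch in dataset:  # batch is a dict[str, torch.Tensor]
        ...

In practice `PartitionedIterableDataset` and `ParquetDataset` (further below) wire these pieces together, so direct use like this is rarely needed.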
replay/data/nn/parquet/metadata/__init__.py
@@ -0,0 +1,19 @@
+ from .metadata import (
+     ColumnMetadata,
+     Metadata,
+     get_1d_array_columns,
+     get_2d_array_columns,
+     get_numeric_columns,
+     get_padding,
+     get_shape,
+ )
+
+ __all__ = [
+     "ColumnMetadata",
+     "Metadata",
+     "get_1d_array_columns",
+     "get_2d_array_columns",
+     "get_numeric_columns",
+     "get_padding",
+     "get_shape",
+ ]
replay/data/nn/parquet/metadata/metadata.py
@@ -0,0 +1,116 @@
+ from collections.abc import Callable
+ from typing import Any, Union
+
+ from typing_extensions import TypeAlias
+
+ from replay.data.nn.parquet.constants.metadata import (
+     DEFAULT_PADDING,
+     PADDING_FLAG,
+     SHAPE_FLAG,
+ )
+
+ FieldType: TypeAlias = Union[bool, int, float, str]
+ ColumnMetadata: TypeAlias = dict[str, FieldType]
+ Metadata: TypeAlias = dict[str, ColumnMetadata]
+
+ ColumnCheck: TypeAlias = Callable[[ColumnMetadata], bool]
+ CheckColumn: TypeAlias = Callable[[ColumnCheck], bool]
+ Listing: TypeAlias = Callable[[Metadata], list[str]]
+
+
+ def make_shape_check(dim: int) -> ColumnCheck:
+     """
+     Constructs a function which checks a column's shape.
+
+     :param dim: Target number of dimensions.
+     """
+
+     def function(column_metadata: ColumnMetadata) -> bool:
+         if SHAPE_FLAG in column_metadata:
+             value: Any = column_metadata[SHAPE_FLAG]
+             if dim == 1 and isinstance(value, int):
+                 return True
+             if isinstance(value, list):
+                 result: bool = len(value) == dim
+                 if result:
+
+                     def is_int(v: Any) -> bool:
+                         return isinstance(v, int)
+
+                     result &= all(map(is_int, value))
+                 return result
+         return False
+
+     return function
+
+
+ def make_not_check(check: ColumnCheck) -> ColumnCheck:
+     def function(column_metadata: ColumnMetadata) -> bool:
+         return not check(column_metadata)
+
+     return function
+
+
+ def all_column_checks(*checks: ColumnCheck) -> ColumnCheck:
+     def function(column_metadata: ColumnMetadata) -> bool:
+         def perform_check(check):
+             return check(column_metadata)
+
+         return all(map(perform_check, checks))
+
+     return function
+
+
+ is_array_1d = all_column_checks(make_shape_check(dim=1))
+ is_array_2d = all_column_checks(make_shape_check(dim=2))
+ is_number = all_column_checks(
+     make_not_check(is_array_1d),
+     make_not_check(is_array_2d),
+ )
+
+
+ def make_listing(check: ColumnCheck) -> Listing:
+     """
+     Filtering function for selecting columns that pass the provided check.
+
+     :param check: Check function to validate against.
+     """
+
+     def function(metadata: Metadata) -> list[str]:
+         result: list[str] = []
+         for col_name, col_meta in metadata.items():
+             if check(col_meta):
+                 result.append(col_name)
+         return sorted(result)
+
+     return function
+
+
+ get_1d_array_columns = make_listing(is_array_1d)
+ get_2d_array_columns = make_listing(is_array_2d)
+ get_numeric_columns = make_listing(is_number)
+
+
+ def get_padding(metadata: Metadata, column_name: str) -> Any:
+     if column_name not in metadata:
+         msg = f"Column {column_name} not found in metadata."
+         raise KeyError(msg)
+     return metadata[column_name].get(PADDING_FLAG, DEFAULT_PADDING)
+
+
+ def get_shape(metadata: Metadata, column_name: str) -> list[int]:
+     if column_name not in metadata:
+         msg = f"Column {column_name} not found in metadata."
+         raise KeyError(msg)
+     if is_number(metadata[column_name]):
+         msg = f"Column {column_name} is not an array."
+         raise ValueError(msg)
+     result: Any = metadata[column_name][SHAPE_FLAG]
+
+     array_res: list[Any] = result if isinstance(result, list) else [result]
+
+     for i in range(len(array_res)):
+         if array_res[i] < 1:
+             msg = f"Shape for column {column_name} at position {i} is not a positive integer."
+             raise ValueError(msg)
+     return result
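To illustrate how the listing helpers above classify columns, a small sketch (it assumes the metadata key names ``shape`` and ``padding`` match the ``SHAPE_FLAG`` and ``PADDING_FLAG`` constants from constants/metadata.py, which is not shown in this diff; the column names are hypothetical):

from replay.data.nn.parquet.metadata import (
    get_1d_array_columns,
    get_2d_array_columns,
    get_numeric_columns,
    get_padding,
    get_shape,
)

# Hypothetical column metadata: one scalar column, one padded sequence column,
# and one 2-D feature-matrix column.
metadata = {
    "user_id": {},
    "item_ids": {"shape": 100, "padding": 0},
    "item_features": {"shape": [100, 16], "padding": 0.0},
}

get_numeric_columns(metadata)         # ["user_id"]
get_1d_array_columns(metadata)        # ["item_ids"]
get_2d_array_columns(metadata)        # ["item_features"]
get_shape(metadata, "item_features")  # [100, 16]
get_padding(metadata, "user_id")      # falls back to DEFAULT_PADDING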
replay/data/nn/parquet/parquet_dataset.py
@@ -0,0 +1,176 @@
+ import warnings
+ from collections.abc import Callable, Iterator
+ from typing import Optional, Union, cast
+
+ import pyarrow.dataset as ds
+ import pyarrow.fs as fs
+ import torch
+ from torch.utils.data import IterableDataset
+
+ from replay.data.nn.parquet import DEFAULT_REPLICAS_INFO
+ from replay.data.nn.parquet.constants.batches import GeneralBatch, GeneralCollateFn
+ from replay.data.nn.parquet.constants.device import DEFAULT_DEVICE
+ from replay.data.nn.parquet.constants.filesystem import DEFAULT_FILESYSTEM
+ from replay.data.nn.parquet.impl.masking import (
+     DEFAULT_COLLATE_FN,
+     DEFAULT_MAKE_MASK_NAME,
+ )
+ from replay.data.nn.parquet.info.replicas import ReplicasInfoProtocol
+ from replay.data.nn.parquet.utils.compute_length import compute_fixed_size_length
+
+ from .fixed_batch_dataset import FixedBatchSizeDataset
+ from .iterator import BatchesIterator
+ from .metadata import Metadata
+ from .partitioned_iterable_dataset import PartitionedIterableDataset
+
+
+ class ParquetDataset(IterableDataset):
+     """
+     Combined dataset and sampler for batch-wise reading and processing of Parquet files.
+
+     This implementation allows one to read data using a PyArrow Dataset, convert it into structured columns,
+     split it into partitions, and then into the batches needed for model training.
+     Supports distributed training and reproducible random shuffling.
+
+     During data loader operation, a partition of size ``partition_size`` is read.
+     There may be situations where the size of the read partition is less than
+     ``partition_size`` - this depends on the number of rows in the data fragment.
+     A fragment is a single Parquet file in the file system.
+
+     The partition will be read by every worker, split according to their replica ID,
+     processed, and the result will be returned as a batch of size ``batch_size``.
+     Please note that the resulting batch size may be less than ``batch_size``.
+
+     For maximum efficiency when reading and processing data, as well as improved data shuffling,
+     it is recommended to set ``partition_size`` several times larger than ``batch_size``.
+
+     **Note:**
+
+     * ``ParquetDataset`` supports only numeric values (boolean/integer/float),
+       therefore, the data paths passed as arguments must contain encoded data.
+     * For optimal performance, set the ``OMP_NUM_THREADS`` and ``ARROW_IO_THREADS`` environment variables to match
+       the number of available CPU cores.
+
+     """
+
+     def __init__(
+         self,
+         source: Union[str, list[str]],
+         metadata: Metadata,
+         partition_size: int,
+         batch_size: int,
+         filesystem: Union[str, fs.FileSystem] = DEFAULT_FILESYSTEM,
+         make_mask_name: Callable[[str], str] = DEFAULT_MAKE_MASK_NAME,
+         device: torch.device = DEFAULT_DEVICE,
+         generator: Optional[torch.Generator] = None,
+         replicas_info: ReplicasInfoProtocol = DEFAULT_REPLICAS_INFO,
+         collate_fn: GeneralCollateFn = DEFAULT_COLLATE_FN,
+         **kwargs,
+     ) -> None:
+         """
+         :param source: The path or list of paths to files/directories containing data in Parquet format.
+         :param metadata: Metadata describing the data structure.
+             The structure of each column is defined by the following values:
+
+             ``shape`` - the dimension of the column being read.
+             If the column contains only one value, this parameter does not need to be specified.
+             If the column contains a one-dimensional array, the parameter must be a number or an array
+             containing one number.
+             If the column contains a two-dimensional array, the parameter
+             must be an array containing two numbers.
+
+             ``padding`` - padding value that will fill the arrays if their length is less
+             than that specified in the ``shape`` parameter.
+         :param partition_size: Partition size when reading data from Parquet files.
+         :param batch_size: The size of the batch that will be returned during iteration.
+         :param filesystem: A PyArrow FileSystem object used to access data, or a URI-based path
+             to infer the filesystem from. Default: value of ``DEFAULT_FILESYSTEM``.
+         :param make_mask_name: Mask name generation function. Default: value of ``DEFAULT_MAKE_MASK_NAME``.
+         :param device: The device on which the data will be generated. Default: value of ``DEFAULT_DEVICE``.
+         :param generator: Random number generator for batch shuffling.
+             If ``None``, shuffling will be disabled. Default: ``None``.
+         :param replicas_info: A connector object capable of fetching the total replica count and the replica id at runtime.
+             Default: value of ``DEFAULT_REPLICAS_INFO`` - a pre-built connector which assumes standard Torch DDP mode
+             based on the ``torch.utils.data`` and ``torch.distributed`` modules.
+         :param collate_fn: Collate function for merging batches. Default: value of ``DEFAULT_COLLATE_FN``.
+         """
+         if partition_size // batch_size < 20:
+             msg = (
+                 "Suboptimal parameters: partition to batch size ratio too low. "
+                 "Recommended proportion of partition size to batch size is at least 20:1. "
+                 f"Got: {partition_size=}, {batch_size=}."
+             )
+             warnings.warn(msg, stacklevel=2)
+
+         if (partition_size % batch_size) != 0:
+             msg = (
+                 "Suboptimal parameters: partition size is not a multiple of batch size. "
+                 f"Got: {partition_size=}, {batch_size=}."
+             )
+             warnings.warn(msg, stacklevel=2)
+
+         if isinstance(filesystem, str):
+             filesystem, _ = fs.FileSystem.from_uri(filesystem)
+         assert isinstance(filesystem, fs.FileSystem)
+         self.filesystem = cast(fs.FileSystem, filesystem)
+
+         self.pyarrow_dataset = ds.dataset(
+             source,
+             filesystem=self.filesystem,
+             format="parquet",
+             **kwargs.get("pyarrow_dataset_kwargs", {}),
+         )
+
+         self.batch_size = batch_size
+         self.partition_size = partition_size
+         self.replicas_info = replicas_info
+         self.metadata = metadata
+
+         self.iterator = BatchesIterator(
+             dataset=self.pyarrow_dataset,
+             metadata=self.metadata,
+             batch_size=partition_size,
+             device=device,
+             make_mask_name=make_mask_name,
+             pyarrow_kwargs=kwargs.get("pyarrow_to_batches_kwargs", {}),
+         )
+
+         self.raw_dataset = PartitionedIterableDataset(
+             batch_size=batch_size,
+             iterable=self.iterator,
+             generator=generator,
+             replicas_info=replicas_info,
+         )
+
+         self.dataset = FixedBatchSizeDataset(
+             dataset=self.raw_dataset,
+             batch_size=batch_size,
+             collate_fn=collate_fn,
+         )
+
+         self.do_compute_length = True
+         self.cached_lengths: dict[int, int] = {}
+
+     def compute_length(self) -> int:
+         """Returns the length of the dataset counted in fixed-size batches."""
+         num_replicas = self.replicas_info.num_replicas
+         if num_replicas not in self.cached_lengths:
+             if len(self.cached_lengths) > 0:
+                 msg = "`num_replicas` changed. Unable to reuse cached length."
+                 warnings.warn(msg, stacklevel=2)
+             curr_length = compute_fixed_size_length(
+                 iterable=self.iterator,
+                 num_replicas=num_replicas,
+                 batch_size=self.batch_size,
+             )
+             self.cached_lengths[num_replicas] = curr_length
+         return self.cached_lengths[num_replicas]
+
+     def __len__(self) -> int:
+         if self.do_compute_length:
+             return self.compute_length()
+         msg = "This instance doesn't support the `len()` method. You can enable it by setting `do_compute_length=True`."
+         raise TypeError(msg)
+
+     def __iter__(self) -> Iterator[GeneralBatch]:
+         return iter(self.dataset)
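A minimal usage sketch for `ParquetDataset` (paths, column names, and the ``shape``/``padding`` keys are placeholders; the data is assumed to be already label-encoded, as the docstring requires):

import torch
from torch.utils.data import DataLoader

from replay.data.nn.parquet.parquet_dataset import ParquetDataset

# Hypothetical metadata for an encoded interactions table.
metadata = {
    "user_id": {},
    "item_ids": {"shape": 100, "padding": 0},
}

dataset = ParquetDataset(
    source="/data/train/",            # hypothetical directory of Parquet files
    metadata=metadata,
    partition_size=20 * 512,          # keeps the recommended >= 20:1 ratio
    batch_size=512,
    generator=torch.Generator().manual_seed(0),  # reproducible shuffling
)

# The dataset already yields ready-made batches, so automatic batching is disabled.
loader = DataLoader(dataset, batch_size=None)
for batch in loader:
    ...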
replay/data/nn/parquet/parquet_module.py
@@ -0,0 +1,178 @@
+ import copy
+ import warnings
+ from collections.abc import Iterable
+ from typing import Literal, Optional, Union, get_args
+
+ import lightning as L  # noqa: N812
+ import torch
+ from lightning.pytorch.trainer.states import RunningStage
+ from lightning.pytorch.utilities import CombinedLoader
+ from typing_extensions import TypeAlias, override
+
+ from replay.data.nn.parquet.constants.filesystem import DEFAULT_FILESYSTEM
+ from replay.data.nn.parquet.impl.masking import (
+     DEFAULT_COLLATE_FN,
+     DEFAULT_MAKE_MASK_NAME,
+     DEFAULT_REPLICAS_INFO,
+ )
+ from replay.data.nn.parquet.parquet_dataset import ParquetDataset
+
+ TransformStage: TypeAlias = Literal["train", "validate", "test", "predict"]
+
+ DEFAULT_CONFIG = {"train": {"generator": torch.default_generator}}
+
+
+ class ParquetModule(L.LightningDataModule):
+     """
+     Standardized DataModule with batch-wise support via `ParquetDataset`.
+
+     Allows for unified access to all data splits across the training/inference pipeline without loading
+     the full dataset into memory. See the :ref:`parquet-processing` section for details.
+
+     ParquetModule provides per-batch data loading and preprocessing via transform pipelines.
+     See the :ref:`Transforms` section for information about the available batch transforms.
+
+     **Note:**
+
+     * ``ParquetModule`` supports only numeric values (boolean/integer/float),
+       therefore, the data paths passed as arguments must contain encoded data.
+     * For optimal performance, set the OMP_NUM_THREADS and ARROW_IO_THREADS environment variables to match
+       the number of available CPU cores.
+     * It is possible to use all of the train/validate/test/predict splits; in that case the paths to the splits
+       should be passed as the corresponding arguments of ``ParquetModule``.
+       Alternatively, some of the split paths may be left unspecified,
+       but then remember to configure the PyTorch Lightning Trainer instance accordingly.
+       For example, if you do not want to use validation data, you can omit the ``validate_path`` parameter
+       of ``ParquetModule`` and set ``limit_val_batches=0`` in ``lightning.Trainer``.
+
+     """
+
+     def __init__(
+         self,
+         batch_size: int,
+         metadata: dict,
+         transforms: dict[TransformStage, list[torch.nn.Module]],
+         config: Optional[dict] = None,
+         *,
+         train_path: Optional[str] = None,
+         validate_path: Optional[Union[str, list[str]]] = None,
+         test_path: Optional[Union[str, list[str]]] = None,
+         predict_path: Optional[Union[str, list[str]]] = None,
+     ) -> None:
+         """
+         :param batch_size: Target batch size.
+         :param metadata: A dictionary that maps each data split to a dictionary of feature names,
+             where each feature is associated with its shape and padding value.\n
+             Example: {"train": {"item_id" : {"shape": 100, "padding_value": 7657}}}.\n
+             For details, see the section :ref:`parquet-processing`.
+         :param config: Dict specifying configuration options of ``ParquetDataset`` (generator,
+             filesystem, collate_fn, make_mask_name, replicas_info) for each data split.
+             Default: ``DEFAULT_CONFIG``.\n
+             In most scenarios, the default configuration is sufficient.
+         :param transforms: Dict specifying the sequence of Transform modules for each data split.
+         :param train_path: Path to the Parquet file containing the train data split. Default: ``None``.
+         :param validate_path: Path to the Parquet file or files containing the validation data split. Default: ``None``.
+         :param test_path: Path to the Parquet file or files containing the testing data split. Default: ``None``.
+         :param predict_path: Path to the Parquet file or files containing the prediction data split. Default: ``None``.
+         """
+         if not any([train_path, validate_path, test_path, predict_path]):
+             msg = (
+                 f"{type(self)}.__init__() expects at least one of "
+                 "['train_path', 'validate_path', 'test_path', 'predict_path'], but none were provided."
+             )
+             raise KeyError(msg)
+
+         if train_path and not isinstance(train_path, str) and isinstance(train_path, Iterable):
+             msg = "'train_path' does not support multiple datapaths."
+             raise TypeError(msg)
+
+         super().__init__()
+         if config is None:
+             config = DEFAULT_CONFIG
+
+         self.datapaths = {"train": train_path, "validate": validate_path, "test": test_path, "predict": predict_path}
+         missing_splits = [split_name for split_name, split_path in self.datapaths.items() if split_path is None]
+         if missing_splits:
+             msg = (
+                 f"The following dataset paths aren't provided: {','.join(missing_splits)}. "
+                 "Make sure to disable these stages in your Lightning Trainer configuration."
+             )
+             warnings.warn(msg, stacklevel=2)
+
+         self.metadata = copy.deepcopy(metadata)
+         self.batch_size = batch_size
+         self.config = config
+
+         self.datasets: dict[str, Union[ParquetDataset, CombinedLoader]] = {}
+         self.transforms = transforms
+         self.compiled_transforms = self.prepare_transforms(transforms)
+
+     def prepare_transforms(
+         self, transforms: dict[TransformStage, list[torch.nn.Module]]
+     ) -> dict[TransformStage, torch.nn.Sequential]:
+         """
+         Perform meta adjustments based on the provided transform pipelines,
+         then compile each subset into a `torch.nn.Sequential` module.
+
+         :param transforms: Python dict where keys are stage names (train, validate, test, predict)
+             and values are the corresponding transform pipelines for every stage.
+         :returns: Compiled transform pipelines.
+         """
+         if not any(subset in get_args(TransformStage) for subset in transforms):
+             msg = (
+                 f"Expected transform.keys()={list(transforms.keys())} to contain at least "
+                 f"one of {get_args(TransformStage)}, but none were found."
+             )
+             raise KeyError(msg)
+
+         compiled_transforms = {}
+         for subset, transform_set in transforms.items():
+             compiled_transforms[subset] = torch.nn.Sequential(*transform_set)
+
+         return compiled_transforms
+
+     @override
+     def setup(self, stage):
+         for subset in get_args(TransformStage):
+             subset_datapaths = self.datapaths.get(subset, None)
+             if subset_datapaths is not None:
+                 subset_config = self.config.get(subset, {})
+                 shared_kwargs = {
+                     "metadata": self.metadata[subset],
+                     "batch_size": self.batch_size,
+                     "partition_size": subset_config.get("partition_size", 2**17),
+                     "generator": subset_config.get("generator", None),
+                     "filesystem": subset_config.get("filesystem", DEFAULT_FILESYSTEM),
+                     "make_mask_name": subset_config.get("make_mask_name", DEFAULT_MAKE_MASK_NAME),
+                     "replicas_info": subset_config.get("replicas_info", DEFAULT_REPLICAS_INFO),
+                     "collate_fn": subset_config.get("collate_fn", DEFAULT_COLLATE_FN),
+                 }
+
+                 if isinstance(subset_datapaths, list):
+                     loaders = [ParquetDataset(**{"source": path, **shared_kwargs}) for path in subset_datapaths]
+                     self.datasets[subset] = CombinedLoader(loaders, mode="sequential")
+                 else:
+                     self.datasets[subset] = ParquetDataset(**{"source": subset_datapaths, **shared_kwargs})
+
+     @override
+     def train_dataloader(self):
+         return self.datasets["train"]
+
+     @override
+     def val_dataloader(self):
+         return self.datasets["validate"]
+
+     @override
+     def test_dataloader(self):
+         return self.datasets["test"]
+
+     @override
+     def predict_dataloader(self):
+         return self.datasets["predict"]
+
+     @override
+     def on_after_batch_transfer(self, batch, _dataloader_idx):
+         stage = self.trainer.state.stage
+         target = RunningStage.VALIDATING if stage is RunningStage.SANITY_CHECKING else stage
+
+         return self.compiled_transforms[str(target.value)](batch)
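Finally, a hedged end-to-end sketch of plugging `ParquetModule` into a Lightning `Trainer`. Split paths and feature names are placeholders, and `torch.nn.Identity()` stands in for the real batch transforms shipped under replay/nn/transform/ in this release:

import lightning as L
import torch

from replay.data.nn.parquet.parquet_module import ParquetModule

# Per-split metadata in the format shown in the docstring above (placeholder values).
metadata = {
    "train": {"item_id": {"shape": 100, "padding_value": 0}},
    "validate": {"item_id": {"shape": 100, "padding_value": 0}},
}

data_module = ParquetModule(
    batch_size=512,
    metadata=metadata,
    transforms={
        "train": [torch.nn.Identity()],     # placeholder transform pipelines
        "validate": [torch.nn.Identity()],
    },
    train_path="/data/train.parquet",       # hypothetical paths
    validate_path="/data/validate.parquet",
)

trainer = L.Trainer(max_epochs=1)
# trainer.fit(model, datamodule=data_module)  # `model` is any LightningModule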