replay-rec 0.20.3rc0__tar.gz → 0.21.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/PKG-INFO +3 -3
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/README.md +1 -1
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/pyproject.toml +33 -23
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/__init__.py +1 -1
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/dataset.py +11 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/__init__.py +3 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/__init__.py +22 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/collate.py +29 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/constants/batches.py +8 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/constants/device.py +3 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/constants/filesystem.py +3 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/constants/metadata.py +5 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/fixed_batch_dataset.py +157 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/array_1d_column.py +140 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/array_2d_column.py +160 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/column_protocol.py +17 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/indexing.py +123 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/masking.py +20 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/named_columns.py +100 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/numeric_column.py +110 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/impl/utils.py +17 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/info/distributed_info.py +40 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/info/partitioning.py +132 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/info/replicas.py +67 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/info/worker_info.py +43 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/iterable_dataset.py +119 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/iterator.py +61 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/metadata/__init__.py +19 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/metadata/metadata.py +116 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/parquet_dataset.py +176 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/parquet_module.py +178 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/partitioned_iterable_dataset.py +56 -0
- replay_rec-0.21.0rc0/replay/data/nn/parquet/utils/compute_length.py +66 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/schema.py +12 -14
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/sequence_tokenizer.py +5 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/sequential_dataset.py +4 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/torch_sequential_dataset.py +5 -0
- replay_rec-0.21.0rc0/replay/data/utils/batching.py +69 -0
- replay_rec-0.21.0rc0/replay/data/utils/typing/dtype.py +65 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/extensions/spark_custom_models/als_extension.py +1 -1
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/torch_metrics_builder.py +20 -14
- replay_rec-0.21.0rc0/replay/models/extensions/__init__.py +0 -0
- replay_rec-0.21.0rc0/replay/models/extensions/ann/__init__.py +0 -0
- replay_rec-0.21.0rc0/replay/models/extensions/ann/entities/__init__.py +0 -0
- replay_rec-0.21.0rc0/replay/models/extensions/ann/index_builders/__init__.py +0 -0
- replay_rec-0.21.0rc0/replay/models/extensions/ann/index_inferers/__init__.py +0 -0
- replay_rec-0.21.0rc0/replay/models/extensions/ann/index_stores/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/loss/sce.py +2 -7
- replay_rec-0.21.0rc0/replay/models/nn/optimizer_utils/__init__.py +9 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/optimizer_utils/optimizer_factory.py +15 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/bert4rec/dataset.py +70 -29
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/bert4rec/lightning.py +97 -36
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/bert4rec/model.py +11 -11
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/callbacks/prediction_callbacks.py +50 -8
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/callbacks/validation_callback.py +23 -6
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/compiled/base_compiled_model.py +12 -4
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/compiled/bert4rec_compiled.py +15 -5
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/compiled/sasrec_compiled.py +16 -7
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/postprocessors/_base.py +5 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/postprocessors/postprocessors.py +4 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/sasrec/dataset.py +81 -26
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/sasrec/lightning.py +86 -24
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/sasrec/model.py +14 -9
- replay_rec-0.21.0rc0/replay/nn/__init__.py +8 -0
- replay_rec-0.21.0rc0/replay/nn/agg.py +109 -0
- replay_rec-0.21.0rc0/replay/nn/attention.py +158 -0
- replay_rec-0.21.0rc0/replay/nn/embedding.py +283 -0
- replay_rec-0.21.0rc0/replay/nn/ffn.py +135 -0
- replay_rec-0.21.0rc0/replay/nn/head.py +49 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/__init__.py +1 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/callback/__init__.py +9 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/callback/metrics_callback.py +183 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/callback/predictions_callback.py +314 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/module.py +123 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/optimizer.py +60 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/postprocessor/__init__.py +2 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/postprocessor/_base.py +51 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/postprocessor/seen_items.py +83 -0
- replay_rec-0.21.0rc0/replay/nn/lightning/scheduler.py +91 -0
- replay_rec-0.21.0rc0/replay/nn/loss/__init__.py +22 -0
- replay_rec-0.21.0rc0/replay/nn/loss/base.py +197 -0
- replay_rec-0.21.0rc0/replay/nn/loss/bce.py +216 -0
- replay_rec-0.21.0rc0/replay/nn/loss/ce.py +317 -0
- replay_rec-0.21.0rc0/replay/nn/loss/login_ce.py +373 -0
- replay_rec-0.21.0rc0/replay/nn/loss/logout_ce.py +230 -0
- replay_rec-0.21.0rc0/replay/nn/mask.py +87 -0
- replay_rec-0.21.0rc0/replay/nn/normalization.py +9 -0
- replay_rec-0.21.0rc0/replay/nn/output.py +37 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/__init__.py +9 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/sasrec/__init__.py +7 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/sasrec/agg.py +53 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/sasrec/diff_transformer.py +125 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/sasrec/model.py +377 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/sasrec/transformer.py +107 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/twotower/__init__.py +2 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/twotower/model.py +674 -0
- replay_rec-0.21.0rc0/replay/nn/sequential/twotower/reader.py +89 -0
- replay_rec-0.21.0rc0/replay/nn/transform/__init__.py +22 -0
- replay_rec-0.21.0rc0/replay/nn/transform/copy.py +38 -0
- replay_rec-0.21.0rc0/replay/nn/transform/grouping.py +39 -0
- replay_rec-0.21.0rc0/replay/nn/transform/negative_sampling.py +182 -0
- replay_rec-0.21.0rc0/replay/nn/transform/next_token.py +100 -0
- replay_rec-0.21.0rc0/replay/nn/transform/rename.py +33 -0
- replay_rec-0.21.0rc0/replay/nn/transform/reshape.py +41 -0
- replay_rec-0.21.0rc0/replay/nn/transform/sequence_roll.py +48 -0
- replay_rec-0.21.0rc0/replay/nn/transform/template/__init__.py +2 -0
- replay_rec-0.21.0rc0/replay/nn/transform/template/sasrec.py +53 -0
- replay_rec-0.21.0rc0/replay/nn/transform/template/twotower.py +22 -0
- replay_rec-0.21.0rc0/replay/nn/transform/token_mask.py +69 -0
- replay_rec-0.21.0rc0/replay/nn/transform/trim.py +51 -0
- replay_rec-0.21.0rc0/replay/nn/utils.py +28 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/filters.py +128 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/label_encoder.py +36 -33
- replay_rec-0.21.0rc0/replay/preprocessing/utils.py +209 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/__init__.py +1 -0
- replay_rec-0.21.0rc0/replay/splitters/random_next_n_splitter.py +224 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/common.py +10 -4
- replay_rec-0.20.3rc0/replay/models/nn/optimizer_utils/__init__.py +0 -4
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/LICENSE +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/NOTICE +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/dataset_utils/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/dataset_utils/dataset_label_encoder.py +0 -0
- {replay_rec-0.20.3rc0/replay/experimental → replay_rec-0.21.0rc0/replay/data/nn/parquet/constants}/__init__.py +0 -0
- {replay_rec-0.20.3rc0/replay/experimental/models/dt4rec → replay_rec-0.21.0rc0/replay/data/nn/parquet/impl}/__init__.py +0 -0
- {replay_rec-0.20.3rc0/replay/experimental/models/extensions/spark_custom_models → replay_rec-0.21.0rc0/replay/data/nn/parquet/info}/__init__.py +0 -0
- {replay_rec-0.20.3rc0/replay/experimental/scenarios/two_stages → replay_rec-0.21.0rc0/replay/data/nn/parquet/utils}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/nn/utils.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/schema.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/data/spark_schema.py +0 -0
- {replay_rec-0.20.3rc0/replay/experimental → replay_rec-0.21.0rc0/replay/data}/utils/__init__.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions → replay_rec-0.21.0rc0/replay/data/utils/typing}/__init__.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions/ann → replay_rec-0.21.0rc0/replay/experimental}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/base_metric.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/coverage.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/experiment.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/hitrate.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/map.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/mrr.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/ncis_precision.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/ndcg.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/precision.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/recall.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/rocauc.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/surprisal.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/metrics/unexpectedness.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/admm_slim.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/base_neighbour_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/base_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/base_torch_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/cql.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/ddpg.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions/ann/entities → replay_rec-0.21.0rc0/replay/experimental/models/dt4rec}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/dt4rec/dt4rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/dt4rec/gpt1.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/dt4rec/trainer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/dt4rec/utils.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions/ann/index_builders → replay_rec-0.21.0rc0/replay/experimental/models/extensions/spark_custom_models}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/hierarchical_recommender.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/implicit_wrap.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/lightfm_wrap.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/mult_vae.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/neural_ts.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/neuromf.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/scala_als.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/models/u_lin_ucb.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/nn/data/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/nn/data/schema_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/preprocessing/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/preprocessing/data_preparator.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/preprocessing/padder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/preprocessing/sequence_generator.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/obp_wrapper/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/obp_wrapper/utils.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions/ann/index_inferers → replay_rec-0.21.0rc0/replay/experimental/scenarios/two_stages}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/two_stages/reranker.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -0
- {replay_rec-0.20.3rc0/replay/models/extensions/ann/index_stores → replay_rec-0.21.0rc0/replay/experimental/utils}/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/utils/logger.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/utils/model_handler.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/experimental/utils/session_handler.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/base_metric.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/categorical_diversity.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/coverage.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/descriptors.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/experiment.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/hitrate.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/map.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/mrr.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/ndcg.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/novelty.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/offline_metrics.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/precision.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/recall.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/rocauc.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/surprisal.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/metrics/unexpectedness.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/als.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/association_rules.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/base_neighbour_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/base_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/cat_pop_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/cluster.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/common.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/ann_mixin.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/entities/base_hnsw_param.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/entities/hnswlib_param.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/entities/nmslib_hnsw_param.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/base_index_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/driver_hnswlib_index_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/driver_nmslib_index_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/executor_hnswlib_index_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/executor_nmslib_index_builder.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_builders/nmslib_index_builder_mixin.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/base_inferer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/hnswlib_filter_index_inferer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/hnswlib_index_inferer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/nmslib_filter_index_inferer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/nmslib_index_inferer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_inferers/utils.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_stores/base_index_store.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_stores/hdfs_index_store.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_stores/shared_disk_index_store.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_stores/spark_files_index_store.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/index_stores/utils.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/extensions/ann/utils.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/kl_ucb.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/knn.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/lin_ucb.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/loss/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/bert4rec/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/callbacks/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/compiled/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/postprocessors/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/nn/sequential/sasrec/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/optimization/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/optimization/optuna_mixin.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/optimization/optuna_objective.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/pop_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/query_pop_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/random_rec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/slim.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/thompson_sampling.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/ucb.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/wilson.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/models/word2vec.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/converter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/discretizer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/history_based_fp.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/preprocessing/sessionizer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/scenarios/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/scenarios/fallback.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/base_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/cold_user_random_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/k_folds.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/last_n_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/new_users_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/random_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/ratio_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/time_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/splitters/two_stage_splitter.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/__init__.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/dataframe_bucketizer.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/distributions.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/model_handler.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/session_handler.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/spark_utils.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/time.py +0 -0
- {replay_rec-0.20.3rc0 → replay_rec-0.21.0rc0}/replay/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: replay-rec
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.21.0rc0
|
|
4
4
|
Summary: RecSys Library
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
License-File: LICENSE
|
|
@@ -30,7 +30,7 @@ Requires-Dist: sb-obp (>=0.5.10,<0.6)
|
|
|
30
30
|
Requires-Dist: scikit-learn (>=1.6.1,<1.7.0)
|
|
31
31
|
Requires-Dist: scipy (>=1.8.1,<2.0.0)
|
|
32
32
|
Requires-Dist: setuptools
|
|
33
|
-
Requires-Dist: torch (>=1.8,<
|
|
33
|
+
Requires-Dist: torch (>=1.8,<3.0.0)
|
|
34
34
|
Requires-Dist: tqdm (>=4.67,<5)
|
|
35
35
|
Project-URL: Homepage, https://sb-ai-lab.github.io/RePlay/
|
|
36
36
|
Project-URL: Repository, https://github.com/sb-ai-lab/RePlay
|
|
@@ -231,7 +231,7 @@ pip install optuna
|
|
|
231
231
|
|
|
232
232
|
2) Model compilation via OpenVINO:
|
|
233
233
|
```bash
|
|
234
|
-
pip install openvino onnx
|
|
234
|
+
pip install openvino onnx onnxscript
|
|
235
235
|
```
|
|
236
236
|
|
|
237
237
|
3) Vector database and hierarchical search support:
|
|
@@ -1,11 +1,36 @@
|
|
|
1
1
|
[build-system]
|
|
2
2
|
requires = [
|
|
3
|
-
"poetry-core>=2.
|
|
3
|
+
"poetry-core>=2.2.1",
|
|
4
4
|
"poetry-dynamic-versioning>=1.0.0,<2.0.0",
|
|
5
5
|
"setuptools",
|
|
6
6
|
]
|
|
7
7
|
build-backend = "poetry_dynamic_versioning.backend"
|
|
8
8
|
|
|
9
|
+
[dependency-groups]
|
|
10
|
+
dev = [
|
|
11
|
+
"coverage-conditional-plugin (>=0.9, <1)",
|
|
12
|
+
"jupyter (>=1.0, <1.1)",
|
|
13
|
+
"jupyterlab (>=3.6, <4)",
|
|
14
|
+
"pyarrow-stubs",
|
|
15
|
+
"pytest (>=7.1.0)",
|
|
16
|
+
"pytest-mock (>3.15, <4.0)",
|
|
17
|
+
"pytest-cov (>=3.0)",
|
|
18
|
+
"statsmodels (>=0.14, <0.15)",
|
|
19
|
+
"black (>=23.3.0)",
|
|
20
|
+
"ruff (>=0.0.261)",
|
|
21
|
+
"hypothesis",
|
|
22
|
+
"toml-sort (>=0.23, <0.24)",
|
|
23
|
+
"sphinx (==5.3.0)",
|
|
24
|
+
"sphinx-rtd-theme (==1.2.2)",
|
|
25
|
+
"sphinx-autodoc-typehints (==1.23.0)",
|
|
26
|
+
"sphinx-enum-extend (==0.1.3)",
|
|
27
|
+
"myst-parser (==1.0.0)",
|
|
28
|
+
"ghp-import (==2.1.0)",
|
|
29
|
+
"docutils (==0.16)",
|
|
30
|
+
"data-science-types (==0.2.23)",
|
|
31
|
+
"filelock (>=3.14, <3.15)",
|
|
32
|
+
]
|
|
33
|
+
|
|
9
34
|
[project]
|
|
10
35
|
name = "replay-rec"
|
|
11
36
|
license = "Apache-2.0"
|
|
@@ -40,7 +65,7 @@ dependencies = [
|
|
|
40
65
|
"scikit-learn (>=1.6.1,<1.7.0)",
|
|
41
66
|
"pyarrow (<22.0)",
|
|
42
67
|
"tqdm (>=4.67,<5)",
|
|
43
|
-
"torch (>=1.8,<
|
|
68
|
+
"torch (>=1.8,<3.0.0)",
|
|
44
69
|
"lightning (>=2.0.2,<=2.4.0)",
|
|
45
70
|
"pytorch-optimizer (>=3.8.0,<4)",
|
|
46
71
|
"lightautoml (>=0.4.1,<0.5)",
|
|
@@ -52,7 +77,7 @@ dependencies = [
|
|
|
52
77
|
"psutil (<=7.0.0)",
|
|
53
78
|
]
|
|
54
79
|
dynamic = ["dependencies"]
|
|
55
|
-
version = "0.
|
|
80
|
+
version = "0.21.0.preview"
|
|
56
81
|
|
|
57
82
|
[project.urls]
|
|
58
83
|
homepage = "https://sb-ai-lab.github.io/RePlay/"
|
|
@@ -68,29 +93,14 @@ exclude = [
|
|
|
68
93
|
"replay/conftest.py",
|
|
69
94
|
]
|
|
70
95
|
|
|
71
|
-
[tool.poetry.
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
pytest = ">=7.1.0"
|
|
76
|
-
pytest-cov = ">=3.0.0"
|
|
77
|
-
statsmodels = "~0.14.0"
|
|
78
|
-
black = ">=23.3.0"
|
|
79
|
-
ruff = ">=0.0.261"
|
|
80
|
-
toml-sort = "^0.23.0"
|
|
81
|
-
sphinx = "5.3.0"
|
|
82
|
-
sphinx-rtd-theme = "1.2.2"
|
|
83
|
-
sphinx-autodoc-typehints = "1.23.0"
|
|
84
|
-
sphinx-enum-extend = "0.1.3"
|
|
85
|
-
myst-parser = "1.0.0"
|
|
86
|
-
ghp-import = "2.1.0"
|
|
87
|
-
docutils = "0.16"
|
|
88
|
-
data-science-types = "0.2.23"
|
|
89
|
-
filelock = "~3.14.0"
|
|
96
|
+
[[tool.poetry.source]]
|
|
97
|
+
name = "torch-cpu-mirror"
|
|
98
|
+
url = "https://download.pytorch.org/whl/cpu"
|
|
99
|
+
priority = "explicit"
|
|
90
100
|
|
|
91
101
|
[tool.poetry-dynamic-versioning]
|
|
92
102
|
enable = false
|
|
93
|
-
format-jinja = """0.
|
|
103
|
+
format-jinja = """0.21.0{{ env['PACKAGE_SUFFIX'] }}"""
|
|
94
104
|
vcs = "git"
|
|
95
105
|
|
|
96
106
|
[tool.ruff]
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
import json
|
|
8
|
+
import warnings
|
|
8
9
|
from collections.abc import Iterable, Sequence
|
|
9
10
|
from pathlib import Path
|
|
10
11
|
from typing import Callable, Optional, Union
|
|
@@ -45,6 +46,7 @@ class Dataset:
|
|
|
45
46
|
):
|
|
46
47
|
"""
|
|
47
48
|
:param feature_schema: mapping of columns names and feature infos.
|
|
49
|
+
All features not specified in the schema will be assumed numerical by default.
|
|
48
50
|
:param interactions: dataframe with interactions.
|
|
49
51
|
:param query_features: dataframe with query features,
|
|
50
52
|
defaults: ```None```.
|
|
@@ -498,6 +500,15 @@ class Dataset:
|
|
|
498
500
|
source=FeatureSource.QUERY_FEATURES,
|
|
499
501
|
feature_schema=updated_feature_schema,
|
|
500
502
|
)
|
|
503
|
+
|
|
504
|
+
if filled_features:
|
|
505
|
+
msg = (
|
|
506
|
+
"The following features are present in the dataset but have not been specified "
|
|
507
|
+
f"by the feature schema: {[(info.column, info.feature_source.value) for info in filled_features]}. "
|
|
508
|
+
"These features will be interpreted as NUMERICAL."
|
|
509
|
+
)
|
|
510
|
+
warnings.warn(msg, stacklevel=2)
|
|
511
|
+
|
|
501
512
|
return FeatureSchema(features_list=features_list + filled_features)
|
|
502
513
|
|
|
503
514
|
def _fill_unlabeled_features_sources(self, feature_schema: FeatureSchema) -> list[FeatureInfo]:
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from replay.utils import TORCH_AVAILABLE
|
|
2
2
|
|
|
3
3
|
if TORCH_AVAILABLE:
|
|
4
|
+
from .parquet import ParquetDataset, ParquetModule
|
|
4
5
|
from .schema import MutableTensorMap, TensorFeatureInfo, TensorFeatureSource, TensorMap, TensorSchema
|
|
5
6
|
from .sequence_tokenizer import SequenceTokenizer
|
|
6
7
|
from .sequential_dataset import PandasSequentialDataset, PolarsSequentialDataset, SequentialDataset
|
|
@@ -18,6 +19,8 @@ if TORCH_AVAILABLE:
|
|
|
18
19
|
"DEFAULT_TRAIN_PADDING_VALUE",
|
|
19
20
|
"MutableTensorMap",
|
|
20
21
|
"PandasSequentialDataset",
|
|
22
|
+
"ParquetDataset",
|
|
23
|
+
"ParquetModule",
|
|
21
24
|
"PolarsSequentialDataset",
|
|
22
25
|
"SequenceTokenizer",
|
|
23
26
|
"SequentialDataset",
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Implementation of the ``ParquetDataset`` and its internals.
|
|
3
|
+
|
|
4
|
+
``ParquetDataset`` is combination of PyTorch-compatible dataset and sampler which enables
|
|
5
|
+
training and inference of models on datasets of any arbitrary size by leveraging PyArrow
|
|
6
|
+
Datasets to perform batch-wise reading and processing of data from disk.
|
|
7
|
+
|
|
8
|
+
``ParquetDataset`` includes support for Pytorch's distributed training framework as well as
|
|
9
|
+
access to remotely stored data via PyArrow's filesystem configs.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .info.replicas import DEFAULT_REPLICAS_INFO, ReplicasInfo, ReplicasInfoProtocol
|
|
13
|
+
from .parquet_dataset import ParquetDataset
|
|
14
|
+
from .parquet_module import ParquetModule
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"DEFAULT_REPLICAS_INFO",
|
|
18
|
+
"ParquetDataset",
|
|
19
|
+
"ParquetModule",
|
|
20
|
+
"ReplicasInfo",
|
|
21
|
+
"ReplicasInfoProtocol",
|
|
22
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
from replay.data.nn.parquet.constants.batches import GeneralBatch, GeneralValue
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def dict_collate(batch: Sequence[dict[str, torch.Tensor]]) -> dict[str, torch.Tensor]:
    """Simple collate function that converts a dict of values into a tensor dict."""
    merged: dict[str, torch.Tensor] = {}
    # Keys are taken from the first sample; all samples are expected to share them.
    for key in batch[0]:
        chunks = [sample[key] for sample in batch]
        merged[key] = torch.cat(chunks, dim=0)
    return merged
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def general_collate(batch: Sequence[GeneralBatch]) -> GeneralBatch:
    """General collate function that converts a nested dict of values into a tensor dict."""
    first = batch[0]

    # A single sample needs no merging at all.
    if len(batch) == 1:
        return first

    merged: GeneralBatch = {}
    for key, probe in first.items():
        per_key: Sequence[GeneralValue] = [sample[key] for sample in batch]
        if torch.is_tensor(probe):
            merged[key] = torch.cat(per_key, dim=0)
        else:
            # Non-tensor entries must be nested sub-batches; recurse into them.
            assert isinstance(probe, dict)
            merged[key] = general_collate(per_key)

    return merged
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
from collections.abc import Sequence
from typing import Callable, Union

import torch
from typing_extensions import TypeAlias
|
|
5
|
+
|
|
6
|
+
GeneralValue: TypeAlias = Union[torch.Tensor, "GeneralBatch"]
|
|
7
|
+
GeneralBatch: TypeAlias = dict[str, GeneralValue]
|
|
8
|
+
GeneralCollateFn: TypeAlias = Callable[[GeneralBatch], GeneralBatch]
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from collections.abc import Iterator
|
|
3
|
+
from typing import Callable, Optional, Protocol, cast
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from torch.utils.data import IterableDataset
|
|
7
|
+
|
|
8
|
+
from replay.data.nn.parquet.constants.batches import GeneralBatch, GeneralCollateFn
|
|
9
|
+
from replay.data.nn.parquet.impl.masking import DEFAULT_COLLATE_FN
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_batch_size(batch: GeneralBatch, strict: bool = False) -> int:
    """
    Retrieves the size of the ``batch`` object.

    :param batch: Batch object (a possibly nested dict of tensors).
    :param strict: If ``True``, validates that every (nested) entry agrees on
        the batch size instead of trusting the first entry. Default: ``False``.

    :raises ValueError: If ``batch`` is empty, or if a size mismatch is found
        in the batch during a strict check.

    :return: Batch size (size of dimension 0 of the contained tensors).
    """
    batch_size: Optional[int] = None

    for key, value in batch.items():
        if torch.is_tensor(value):
            new_batch_size = value.size(0)
        else:
            # Non-tensor entries must be nested sub-batches; recurse.
            assert isinstance(value, dict)
            new_batch_size = get_batch_size(value, strict)

        if batch_size is None:
            batch_size = new_batch_size
            if not strict:
                # First entry is trusted in non-strict mode; skip the rest.
                break
        elif batch_size != new_batch_size:
            msg = f"Batch size mismatch {key}: {batch_size} != {new_batch_size}"
            raise ValueError(msg)

    # Raise an explicit error for empty batches instead of a bare assert,
    # which is stripped under ``python -O`` and only gave an AssertionError.
    if batch_size is None:
        msg = "Cannot determine the batch size of an empty batch"
        raise ValueError(msg)
    return batch_size
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def split_batches(batch: GeneralBatch, split: int) -> tuple[GeneralBatch, GeneralBatch]:
    """Split ``batch`` along dim 0 into its first ``split`` rows and the remainder."""
    head: GeneralBatch = {}
    tail: GeneralBatch = {}

    for name, value in batch.items():
        if torch.is_tensor(value):
            head[name], tail[name] = value[:split, ...], value[split:, ...]
        else:
            # Nested sub-batches are split recursively with the same cut point.
            head[name], tail[name] = split_batches(value, split)

    return (head, tail)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DatasetProtocol(Protocol):
    """Structural interface for datasets accepted by ``FixedBatchSizeDataset``:
    an iterable of ``GeneralBatch`` objects that also exposes its ``batch_size``.
    """

    # Yields batches one by one.
    def __iter__(self) -> Iterator[GeneralBatch]: ...

    # Batch size of the yielded batches; read by ``FixedBatchSizeDataset``
    # when no explicit batch size is given.
    @property
    def batch_size(self) -> int: ...
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class FixedBatchSizeDataset(IterableDataset):
    """
    Wrapper for arbitrary datasets that fetches batches of fixed size.
    Concatenates batches from the wrapped dataset until it reaches the specified size.
    The last batch may be smaller than the specified size.
    """

    def __init__(
        self,
        dataset: DatasetProtocol,
        batch_size: Optional[int] = None,
        collate_fn: GeneralCollateFn = DEFAULT_COLLATE_FN,
        strict_checks: bool = False,
    ) -> None:
        """
        :param dataset: An iterable object that returns batches.
            Generally a subclass of ``torch.utils.data.IterableDataset``.
        :param batch_size: Desired batch size. If ``None``, will search for batch size in ``dataset.batch_size``.
            Default: ``None``.
        :param collate_fn: Collate function for merging batches. Default: value of ``DEFAULT_COLLATE_FN``.
        :param strict_checks: If ``True``, additional batch size checks will be performed.
            May affect performance. Default: ``False``.

        :raises ValueError: If an invalid batch size was provided or none could be determined.
        """
        super().__init__()

        self.dataset: DatasetProtocol = dataset

        # Fall back to the wrapped dataset's own batch size when none is given.
        # Validate with explicit raises (not ``assert``) so the documented
        # ValueError contract also holds under ``python -O``.
        if batch_size is None:
            batch_size = getattr(dataset, "batch_size", None)
        if not isinstance(batch_size, int):
            msg = f"Unable to determine a valid integer batch size. Got {batch_size=}"
            raise ValueError(msg)

        int_batch_size: int = batch_size

        if int_batch_size < 1:
            msg = f"Insufficient batch size. Got {int_batch_size=}"
            raise ValueError(msg)

        if int_batch_size < 2:
            warnings.warn(f"Low batch size. Got {int_batch_size=}. This may cause performance issues.", stacklevel=2)

        self.collate_fn: Callable = collate_fn
        self.batch_size: int = int_batch_size
        self.strict_checks: bool = strict_checks

    def get_batch_size(self, batch: GeneralBatch) -> int:
        """Size of ``batch``, cross-validated when ``strict_checks`` is enabled."""
        return get_batch_size(batch, strict=self.strict_checks)

    def __iter__(self) -> Iterator[GeneralBatch]:
        """Yield batches of exactly ``batch_size`` rows; the final one may be smaller."""
        iterator: Iterator[GeneralBatch] = iter(self.dataset)

        buffer: list[GeneralBatch] = []
        buffer_size: int = 0

        while True:
            # Accumulate upstream batches until a full batch can be emitted
            # or the upstream iterator is exhausted.
            while buffer_size < self.batch_size:
                try:
                    batch: GeneralBatch = next(iterator)
                    size: int = self.get_batch_size(batch)

                    buffer.append(batch)
                    buffer_size += size
                except StopIteration:
                    break

            if buffer_size == 0:
                break

            joined: GeneralBatch = self.collate_fn(buffer)
            assert buffer_size == self.get_batch_size(joined)

            if self.batch_size < buffer_size:
                # Emit a full batch; keep the overflow for the next round.
                left, right = split_batches(joined, self.batch_size)
                residue: int = buffer_size - self.batch_size
                assert residue == self.get_batch_size(right)

                buffer_size = residue
                buffer = [right]

                yield left
            else:
                # Exactly full, or the trailing partial batch at end of data.
                buffer_size = 0
                buffer = []

                yield joined

        assert buffer_size == 0
        assert len(buffer) == 0
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from typing import Any, Union
|
|
2
|
+
|
|
3
|
+
import pyarrow as pa
|
|
4
|
+
import pyarrow.compute as pc
|
|
5
|
+
import torch
|
|
6
|
+
|
|
7
|
+
from replay.data.nn.parquet.constants.device import DEFAULT_DEVICE
|
|
8
|
+
from replay.data.nn.parquet.constants.metadata import DEFAULT_PADDING
|
|
9
|
+
from replay.data.nn.parquet.metadata import (
|
|
10
|
+
Metadata,
|
|
11
|
+
get_1d_array_columns,
|
|
12
|
+
get_padding,
|
|
13
|
+
get_shape,
|
|
14
|
+
)
|
|
15
|
+
from replay.data.utils.typing.dtype import pyarrow_to_torch
|
|
16
|
+
|
|
17
|
+
from .column_protocol import OutputType
|
|
18
|
+
from .indexing import get_mask, get_offsets
|
|
19
|
+
from .utils import ensure_mutable
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Array1DColumn:
    """
    Representation of a 1D array column, containing a
    list of numbers of varying length in each of its rows.
    """

    def __init__(
        self,
        data: torch.Tensor,
        lengths: torch.LongTensor,
        shape: Union[int, list[int]],
        padding: Any = DEFAULT_PADDING,
    ) -> None:
        """
        :param data: A tensor containing column data (flattened row values).
        :param lengths: A tensor containing lengths of each individual row array.
        :param shape: An integer or list of integers representing the target array shapes.
        :param padding: Padding value to use to fill null values and match target shape.
            Default: value of ``DEFAULT_PADDING``

        :raises ValueError: If the shape provided is not one-dimensional.
        """
        if isinstance(shape, list) and len(shape) > 1:
            msg = f"Array1DColumn accepts a shape of size (1,) only. Got {shape=}"
            raise ValueError(msg)

        self.padding = padding
        self.data = data
        # Row boundaries inside the flattened ``data`` tensor.
        self.offsets = get_offsets(lengths)
        self.shape = shape[0] if isinstance(shape, list) else shape
        assert self.length == torch.numel(lengths)

    @property
    def length(self) -> int:
        # ``offsets`` carries one extra boundary element.
        return torch.numel(self.offsets) - 1

    def __len__(self) -> int:
        return self.length

    @property
    def device(self) -> torch.device:
        assert self.data.device == self.offsets.device
        return self.offsets.device

    @property
    def dtype(self) -> torch.dtype:
        return self.data.dtype

    def __getitem__(self, indices: torch.LongTensor) -> OutputType:
        """
        Gather the padded rows selected by ``indices``.

        :return: ``(mask, values)`` where ``mask`` flags real (non-padding) positions.
        """
        indices = indices.to(device=self.device)
        mask, output = get_mask(indices, self.offsets, self.shape)

        # Completely empty column: nothing to gather, every position is padding.
        # Bug fix: build the values tensor with the *column* dtype via
        # ``torch.full``. The previous ``torch.ones(..., dtype=torch.bool) * padding``
        # produced a promotion-dependent dtype instead of ``self.dtype``
        # (the non-empty path asserts ``dtype == self.dtype``).
        if self.data.numel() == 0:
            empty_mask = torch.zeros((indices.size(0), self.shape), dtype=torch.bool, device=self.device)
            padded = torch.full((indices.size(0), self.shape), self.padding, dtype=self.dtype, device=self.device)
            return empty_mask, padded

        unmasked_values = torch.take(self.data, output)
        masked_values = torch.where(mask, unmasked_values, self.padding)
        assert masked_values.device == self.device
        assert masked_values.dtype == self.dtype
        return (mask, masked_values)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def to_torch(array: pa.Array, device: torch.device = DEFAULT_DEVICE) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Converts a PyArrow array into a PyTorch tensor.

    :param array: Original PyArrow array.
    :param device: Target device to send the resulting tensor to. Default: value of ``DEFAULT_DEVICE``.

    :return: A PyTorch tensor obtained from original array.
    """
    values = pc.list_flatten(array)
    row_lengths = pc.list_value_length(array).cast(pa.int64())

    # ``ensure_mutable`` copies the NumPy buffers so the tensors own
    # writable memory instead of aliasing immutable Arrow data.
    values_tensor = torch.asarray(
        ensure_mutable(values.to_numpy()),
        device=device,
        dtype=pyarrow_to_torch(values.type),
    )

    lengths_tensor = torch.asarray(
        ensure_mutable(row_lengths.to_numpy()),
        device=device,
        dtype=torch.int64,
    )
    return (lengths_tensor, values_tensor)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def to_array_1d_columns(
    data: pa.RecordBatch,
    metadata: Metadata,
    device: torch.device = DEFAULT_DEVICE,
) -> dict[str, Array1DColumn]:
    """
    Converts a PyArrow batch of data to a set of ``Array1DColumn``s.
    This function filters only those columns matching its format from the full batch.

    :param data: A PyArrow batch of column data.
    :param metadata: Metadata containing information about columns' formats.
    :param device: Target device to send column tensors to. Default: value of ``DEFAULT_DEVICE``

    :return: A dict of tensors containing dataset's numeric columns.
    """
    columns: dict[str, Array1DColumn] = {}

    # Only the columns that metadata marks as 1D arrays are converted.
    for name in get_1d_array_columns(metadata):
        lengths, values = to_torch(data.column(name), device=device)
        columns[name] = Array1DColumn(
            data=values,
            lengths=lengths,
            padding=get_padding(metadata, name),
            shape=get_shape(metadata, name),
        )
    return columns
|