replay-rec 0.18.0-py3-none-any.whl → 0.18.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/dataset.py +27 -1
- replay/data/dataset_utils/dataset_label_encoder.py +6 -3
- replay/data/nn/schema.py +37 -16
- replay/data/nn/sequence_tokenizer.py +313 -165
- replay/data/nn/torch_sequential_dataset.py +17 -8
- replay/data/nn/utils.py +14 -7
- replay/data/schema.py +10 -6
- replay/metrics/offline_metrics.py +2 -2
- replay/models/__init__.py +1 -0
- replay/models/base_rec.py +18 -21
- replay/models/lin_ucb.py +407 -0
- replay/models/nn/sequential/bert4rec/dataset.py +17 -4
- replay/models/nn/sequential/bert4rec/lightning.py +121 -54
- replay/models/nn/sequential/bert4rec/model.py +21 -0
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
- replay/models/nn/sequential/compiled/__init__.py +5 -0
- replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
- replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
- replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
- replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
- replay/models/nn/sequential/sasrec/dataset.py +17 -1
- replay/models/nn/sequential/sasrec/lightning.py +126 -50
- replay/models/nn/sequential/sasrec/model.py +3 -4
- replay/preprocessing/__init__.py +7 -1
- replay/preprocessing/discretizer.py +719 -0
- replay/preprocessing/label_encoder.py +384 -52
- replay/splitters/cold_user_random_splitter.py +1 -1
- replay/utils/__init__.py +1 -0
- replay/utils/common.py +7 -8
- replay/utils/session_handler.py +3 -4
- replay/utils/spark_utils.py +15 -1
- replay/utils/types.py +8 -0
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +73 -60
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -31
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +0 -0
replay/data/nn/torch_sequential_dataset.py
CHANGED

@@ -4,6 +4,8 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset as TorchDataset
 
+from replay.utils.model_handler import deprecation_warning
+
 from .schema import TensorFeatureInfo, TensorMap, TensorSchema
 from .sequential_dataset import SequentialDataset
 
@@ -25,6 +27,10 @@ class TorchSequentialDataset(TorchDataset):
     Torch dataset for sequential recommender models
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
@@ -90,15 +96,14 @@
         sequence = self._sequential.get_sequence(sequence_index, feature.name)
         if feature.is_seq:
             sequence = sequence[sequence_offset : sequence_offset + self._max_sequence_length]
-
-        tensor_dtype = self._get_tensor_dtype(feature)
+        tensor_dtype = self._get_tensor_dtype(sequence)
         tensor_sequence = torch.tensor(sequence, dtype=tensor_dtype)
         if feature.is_seq:
-            tensor_sequence = self._pad_sequence(tensor_sequence)
+            tensor_sequence = self._pad_sequence(tensor_sequence, feature.padding_value)
 
         return tensor_sequence
 
-    def _pad_sequence(self, sequence: torch.Tensor) -> torch.Tensor:
+    def _pad_sequence(self, sequence: torch.Tensor, padding_value: int) -> torch.Tensor:
         assert len(sequence) <= self._max_sequence_length
         if len(sequence) == self._max_sequence_length:
             return sequence
@@ -115,16 +120,16 @@
 
         padded_sequence = torch.full(
             padded_sequence_shape,
-            self._padding_value,
+            padding_value,
             dtype=sequence.dtype,
         )
         padded_sequence[-len(sequence) :].copy_(sequence)
         return padded_sequence
 
-    def _get_tensor_dtype(self, feature: TensorFeatureInfo) -> torch.dtype:
-        if feature.is_cat:
+    def _get_tensor_dtype(self, array: np.array) -> torch.dtype:
+        if np.issubdtype(array.dtype, np.integer):
             return torch.long
-        if feature.is_num:
+        if np.issubdtype(array.dtype, np.floating):
             return torch.float32
         assert False, "Unknown tensor feature type"
 
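Note: the replacement dtype logic keys off the NumPy array itself instead of the feature's declared type, and padding now uses a per-feature value. A minimal standalone sketch of the same pattern (the helper names and sample data are illustrative, not part of the package):

import numpy as np
import torch

def infer_tensor_dtype(array: np.ndarray) -> torch.dtype:
    # Integer arrays (e.g. encoded ids) map to torch.long,
    # floating arrays (e.g. ratings) map to torch.float32.
    if np.issubdtype(array.dtype, np.integer):
        return torch.long
    if np.issubdtype(array.dtype, np.floating):
        return torch.float32
    raise ValueError("Unknown tensor feature type")

def pad_left(sequence: torch.Tensor, max_len: int, padding_value: int) -> torch.Tensor:
    # Keep the most recent events on the right; fill the front with padding_value.
    if len(sequence) >= max_len:
        return sequence[-max_len:]
    padded = torch.full((max_len,), padding_value, dtype=sequence.dtype)
    padded[-len(sequence):].copy_(sequence)
    return padded

items = np.array([10, 11, 12])
seq = torch.tensor(items, dtype=infer_tensor_dtype(items))
print(pad_left(seq, max_len=5, padding_value=0))  # tensor([ 0,  0, 10, 11, 12])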
@@ -170,6 +175,10 @@ class TorchSequentialValidationDataset(TorchDataset):
     Torch dataset for sequential recommender models that additionally stores ground truth
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
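Taken together, `padding_value` moves from the dataset constructors to the schema: each feature now carries its own padding value, read via `feature.padding_value` above. A hedged sketch of the intended configuration (passing `padding_value` to `TensorFeatureInfo` is an assumption based on the deprecation message; the exact constructor signature may differ):

from replay.data import FeatureType
from replay.data.nn import TensorFeatureInfo, TensorSchema

# Assumed API: per-column padding replaces the deprecated
# dataset-level `padding_value` argument.
schema = TensorSchema(
    [
        TensorFeatureInfo(
            "item_id",
            feature_type=FeatureType.CATEGORICAL,
            is_seq=True,
            cardinality=100_000,
            padding_value=0,  # assumption: now set per feature
        ),
    ]
)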
replay/data/nn/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Iterable, Optional
 
 import polars as pl
 
@@ -22,9 +22,12 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optional[str] = None)
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort_values(event_cols_without_groupby)
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(events.iloc[0][x], Iterable), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort_values(event_cols_without_iterable)
 
         grouped_sequences = (
             events.groupby(groupby_col).agg({col: list for col in event_cols_without_groupby}).reset_index()
@@ -34,9 +37,13 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optional[str] = None)
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort(event_cols_without_groupby)
+            map_name2type = dict(zip(events.columns, events.dtypes))
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(map_name2type[x], pl.List), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort(event_cols_without_iterable)
 
         grouped_sequences = events.group_by(groupby_col).agg(*[pl.col(x) for x in event_cols_without_groupby])
     else:
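In both branches the new code excludes list-typed columns from the sort keys, since rows cannot be ordered by a list-valued column. A small pandas illustration of the same filter (dataframe and column names invented):

from typing import Iterable

import pandas as pd

events = pd.DataFrame(
    {
        "user_id": [1, 1, 2],
        "timestamp": [3, 1, 2],
        "item_ids": [[7, 8], [9], [5]],  # list column: cannot be a sort key
    }
)

cols = [c for c in events.columns if c != "user_id"]
# Same check as the diff: drop columns whose values are iterable.
sortable = [c for c in cols if not isinstance(events.iloc[0][c], Iterable)]
sortable.remove("timestamp")
sortable.insert(0, "timestamp")
print(events.sort_values(sortable))  # sorted by timestamp only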
replay/data/schema.py
CHANGED
@@ -20,7 +20,9 @@ class FeatureType(Enum):
     """Type of Feature."""
 
     CATEGORICAL = "categorical"
+    CATEGORICAL_LIST = "categorical_list"
     NUMERICAL = "numerical"
+    NUMERICAL_LIST = "numerical_list"
 
 
 class FeatureSource(Enum):
@@ -70,7 +72,7 @@ class FeatureInfo:
         self._feature_source = feature_source
         self._feature_hint = feature_hint
 
-        if feature_type == FeatureType.NUMERICAL and cardinality:
+        if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and cardinality:
             msg = "Cardinality is needed only with categorical feature_type."
             raise ValueError(msg)
         self._cardinality = cardinality
@@ -111,7 +113,7 @@ class FeatureInfo:
         """
         :returns: cardinality of the feature.
         """
-        if self.feature_type != FeatureType.CATEGORICAL:
+        if self.feature_type not in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
             msg = f"Can not get cardinality because feature_type of {self.column} column is not categorical."
             raise RuntimeError(msg)
         if hasattr(self, "_cardinality_callback") and self._cardinality is None:
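With the new list types, the cardinality rules cover both scalar and list variants: numerical features reject a cardinality, and only categorical ones expose it. A short sketch (keyword arguments follow the checks visible above; the exact `FeatureInfo` signature is an assumption):

from replay.data import FeatureInfo, FeatureType

# Categorical list feature: cardinality is allowed, as for CATEGORICAL.
genres = FeatureInfo(
    column="genres",
    feature_type=FeatureType.CATEGORICAL_LIST,
    cardinality=20,
)
print(genres.cardinality)  # 20

# Numerical list feature: cardinality is rejected, as for NUMERICAL.
try:
    FeatureInfo(
        column="embedding",
        feature_type=FeatureType.NUMERICAL_LIST,
        cardinality=10,
    )
except ValueError as err:
    print(err)  # Cardinality is needed only with categorical feature_type.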
@@ -143,7 +145,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
     def copy(self) -> "FeatureSchema":
         """
-        Creates a copy of all features.
+        Creates a copy of all features. For the returned copy, all cardinality values will be undefined.
 
         :returns: copy of the initial feature schema.
         """
@@ -227,14 +229,16 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
         """
         :returns: sequence of categorical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.CATEGORICAL)
+        return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
+            feature_type=FeatureType.CATEGORICAL_LIST
+        )
 
     @property
     def numerical_features(self) -> "FeatureSchema":
         """
         :returns: sequence of numerical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.NUMERICAL)
+        return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
 
     @property
     def interaction_features(self) -> "FeatureSchema":
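As a result, the two convenience properties now return list-typed columns as well. A hedged usage sketch continuing the example above (assuming `FeatureSchema` accepts a list of `FeatureInfo`; iteration via `.items()` is grounded in the `Mapping[str, FeatureInfo]` base class):

from replay.data import FeatureSchema

schema = FeatureSchema([genres])
for name, info in schema.categorical_features.items():
    print(name, info.feature_type)  # genres FeatureType.CATEGORICAL_LIST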
@@ -449,7 +453,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
         if len(duplicates) > 0:
             msg = (
-                "Features column names should be unique, …"
+                "Features column names should be unique, except ITEM_ID and QUERY_ID columns. "
                 f"{duplicates} columns are not unique."
             )
             raise ValueError(msg)
replay/metrics/offline_metrics.py
CHANGED

@@ -156,13 +156,13 @@ class OfflineMetrics:
     ):
         """
         :param metrics: (list of metrics): List of metrics to be calculated.
-        :param …
+        :param query_column: (str): The name of the query column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param item_column: (str): The name of the item column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
-        :param …
+        :param rating_column: (str): The name of the rating column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param category_column: (str): The name of the category column.
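The corrected docstring makes the contract explicit: column names are configured once on `OfflineMetrics` and shared by every metric. A hedged sketch (the metric class and call convention are assumed from the public replay.metrics API):

import pandas as pd
from replay.metrics import OfflineMetrics, Recall

recommendations = pd.DataFrame(
    {"user_id": [1, 1], "item_id": [3, 4], "rating": [0.9, 0.8]}
)
ground_truth = pd.DataFrame({"user_id": [1], "item_id": [4], "rating": [1.0]})

# Column names are passed here once instead of per metric.
metrics = OfflineMetrics(
    [Recall(2)],
    query_column="user_id",
    item_column="item_id",
    rating_column="rating",
)
print(metrics(recommendations, ground_truth))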
replay/models/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .cat_pop_rec import CatPopRec
 from .cluster import ClusterRec
 from .kl_ucb import KLUCB
 from .knn import ItemKNN
+from .lin_ucb import LinUCB
 from .pop_rec import PopRec
 from .query_pop_rec import QueryPopRec
 from .random_rec import RandomRec
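`LinUCB` (new module `lin_ucb.py`, +407 lines) becomes part of the public models namespace. A deliberately thin sketch, since the constructor arguments are not part of this diff and are treated as unknown (`dataset` is a replay `Dataset` prepared elsewhere):

from replay.models import LinUCB

model = LinUCB()  # assumption: arguments omitted; see replay/models/lin_ucb.py
model.fit(dataset)  # standard replay recommender interface
recs = model.predict(dataset, k=10)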
replay/models/base_rec.py
CHANGED
@@ -625,23 +625,21 @@ class BaseRecommender(RecommenderCommons, IsSavable, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param dataset: …
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param queries: …
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all …
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from …
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).
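For the return contract documented above, a small NumPy illustration of the `(n_users, n_items, k)` shape (values are synthetic):

import numpy as np

n_users, n_items, k = 2, 4, 3
rng = np.random.default_rng(0)
probs = rng.random((n_users, n_items, k))
probs /= probs.sum(axis=1, keepdims=True)  # each slot is a distribution over items

assert np.allclose(probs.sum(axis=1), 1.0)
top_item_per_slot = probs.argmax(axis=1)  # shape (n_users, k)
print(top_item_per_slot)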
@@ -1164,10 +1162,11 @@ class HybridRecommender(BaseRecommender, ABC):
     ) -> Optional[Tuple[SparkDataFrame, int]]:
         """
         Returns query or item feature vectors as a Column with type ArrayType
+        If a model does not have a vector for some ids they are not present in the final result.
+
         :param ids: Spark DataFrame with unique ids
         :param features: Spark DataFrame with features for provided ids
         :return: feature vectors
-        If a model does not have a vector for some ids they are not present in the final result.
         """
         return self._get_features_wrap(ids, features)
 
@@ -1644,23 +1643,21 @@ class NonPersonalizedRecommender(Recommender, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param dataset: …
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param queries: …
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all …
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from …
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).