replay-rec 0.18.1rc0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. replay/__init__.py +1 -1
  2. replay/data/nn/schema.py +3 -1
  3. replay/metrics/surprisal.py +4 -2
  4. replay/models/lin_ucb.py +2 -3
  5. replay/models/nn/loss/__init__.py +1 -0
  6. replay/models/nn/loss/sce.py +131 -0
  7. replay/models/nn/sequential/bert4rec/lightning.py +36 -4
  8. replay/models/nn/sequential/bert4rec/model.py +5 -46
  9. replay/models/nn/sequential/sasrec/lightning.py +27 -3
  10. replay/models/nn/sequential/sasrec/model.py +1 -1
  11. replay/preprocessing/filters.py +102 -1
  12. replay/preprocessing/label_encoder.py +8 -4
  13. {replay_rec-0.18.1rc0.dist-info → replay_rec-0.19.0.dist-info}/METADATA +5 -12
  14. {replay_rec-0.18.1rc0.dist-info → replay_rec-0.19.0.dist-info}/RECORD +16 -70
  15. {replay_rec-0.18.1rc0.dist-info → replay_rec-0.19.0.dist-info}/WHEEL +1 -1
  16. replay/experimental/__init__.py +0 -0
  17. replay/experimental/metrics/__init__.py +0 -62
  18. replay/experimental/metrics/base_metric.py +0 -602
  19. replay/experimental/metrics/coverage.py +0 -97
  20. replay/experimental/metrics/experiment.py +0 -175
  21. replay/experimental/metrics/hitrate.py +0 -26
  22. replay/experimental/metrics/map.py +0 -30
  23. replay/experimental/metrics/mrr.py +0 -18
  24. replay/experimental/metrics/ncis_precision.py +0 -31
  25. replay/experimental/metrics/ndcg.py +0 -49
  26. replay/experimental/metrics/precision.py +0 -22
  27. replay/experimental/metrics/recall.py +0 -25
  28. replay/experimental/metrics/rocauc.py +0 -49
  29. replay/experimental/metrics/surprisal.py +0 -90
  30. replay/experimental/metrics/unexpectedness.py +0 -76
  31. replay/experimental/models/__init__.py +0 -13
  32. replay/experimental/models/admm_slim.py +0 -205
  33. replay/experimental/models/base_neighbour_rec.py +0 -204
  34. replay/experimental/models/base_rec.py +0 -1340
  35. replay/experimental/models/base_torch_rec.py +0 -234
  36. replay/experimental/models/cql.py +0 -454
  37. replay/experimental/models/ddpg.py +0 -923
  38. replay/experimental/models/dt4rec/__init__.py +0 -0
  39. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  40. replay/experimental/models/dt4rec/gpt1.py +0 -401
  41. replay/experimental/models/dt4rec/trainer.py +0 -127
  42. replay/experimental/models/dt4rec/utils.py +0 -265
  43. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  44. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  45. replay/experimental/models/hierarchical_recommender.py +0 -331
  46. replay/experimental/models/implicit_wrap.py +0 -131
  47. replay/experimental/models/lightfm_wrap.py +0 -302
  48. replay/experimental/models/mult_vae.py +0 -332
  49. replay/experimental/models/neural_ts.py +0 -986
  50. replay/experimental/models/neuromf.py +0 -406
  51. replay/experimental/models/scala_als.py +0 -296
  52. replay/experimental/models/u_lin_ucb.py +0 -115
  53. replay/experimental/nn/data/__init__.py +0 -1
  54. replay/experimental/nn/data/schema_builder.py +0 -102
  55. replay/experimental/preprocessing/__init__.py +0 -3
  56. replay/experimental/preprocessing/data_preparator.py +0 -839
  57. replay/experimental/preprocessing/padder.py +0 -229
  58. replay/experimental/preprocessing/sequence_generator.py +0 -208
  59. replay/experimental/scenarios/__init__.py +0 -1
  60. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  61. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  62. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -261
  63. replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
  64. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  65. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  66. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  67. replay/experimental/utils/__init__.py +0 -0
  68. replay/experimental/utils/logger.py +0 -24
  69. replay/experimental/utils/model_handler.py +0 -186
  70. replay/experimental/utils/session_handler.py +0 -44
  71. replay_rec-0.18.1rc0.dist-info/NOTICE +0 -41
  72. {replay_rec-0.18.1rc0.dist-info → replay_rec-0.19.0.dist-info}/LICENSE +0 -0
replay/__init__.py CHANGED
@@ -1,3 +1,3 @@
  """ RecSys library """

- __version__ = "0.18.1.preview"
+ __version__ = "0.19.0"
replay/data/nn/schema.py CHANGED
@@ -7,6 +7,7 @@ from typing import (
      List,
      Mapping,
      Optional,
+     OrderedDict,
      Sequence,
      Set,
      Union,
@@ -262,6 +263,8 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
          """
          :param features_list: list of tensor feature infos.
          """
+         if isinstance(features_list, OrderedDict):
+             features_list = list(features_list.values())
          features_list = [features_list] if not isinstance(features_list, Sequence) else features_list
          self._tensor_schema = {feature.name: feature for feature in features_list}

@@ -501,7 +504,6 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
                  filtered_features,
              )
          )
-
          return TensorSchema(filtered_features)

      @staticmethod
replay/metrics/surprisal.py CHANGED
@@ -129,7 +129,9 @@ class Surprisal(Metric):
          item_weights = train.group_by(self.item_column).agg(
              (np.log2(n_users / pl.col(self.query_column).n_unique()) / np.log2(n_users)).alias("weight")
          )
-         recommendations = recommendations.join(item_weights, on=self.item_column, how="left").fill_nan(1.0)
+         recommendations = recommendations.join(item_weights, on=self.item_column, how="left").with_columns(
+             pl.col("weight").fill_null(1.0)
+         )

          sorted_by_score_recommendations = self._get_items_list_per_user(recommendations, "weight")
          return self._rearrange_columns(sorted_by_score_recommendations)
@@ -175,7 +177,7 @@ class Surprisal(Metric):

          weights = self._get_recommendation_weights(recommendations, train)
          return self._dict_call(
-             list(train),
+             list(recommendations),
              pred_item_id=recommendations,
              pred_weight=weights,
          )
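Note on the change above: in Polars, a left join fills unmatched rows with nulls, not NaNs, so the previous fill_nan(1.0) never assigned the default weight to items absent from the train set; fill_null(1.0) does. A tiny, self-contained illustration (toy data, not from the package):

import polars as pl

recs = pl.DataFrame({"item_id": [1, 2]})
weights = pl.DataFrame({"item_id": [1], "weight": [0.5]})
joined = recs.join(weights, on="item_id", how="left")
print(joined["weight"].to_list())  # [0.5, None] - the unmatched item is null, not NaN
print(joined.with_columns(pl.col("weight").fill_null(1.0))["weight"].to_list())  # [0.5, 1.0]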
replay/models/lin_ucb.py CHANGED
@@ -98,9 +98,8 @@ class LinUCB(HybridRecommender):
      The model assumes a linear relationship between user context, item features and action rewards,
      making it efficient for high-dimensional contexts.

-     Note:
-         It's recommended to scale features to a similar range (e.g., using StandardScaler or MinMaxScaler)
-         to ensure proper convergence and prevent numerical instability (since relationships to learn are linear).
+     Note: It's recommended to scale features to a similar range (e.g., using StandardScaler or MinMaxScaler)
+     to ensure proper convergence and prevent numerical instability (since relationships to learn are linear).

      >>> import pandas as pd
      >>> from replay.data.dataset import (
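The reworded note above recommends scaling features before fitting LinUCB. A minimal, hedged sketch of what that can look like with scikit-learn (column names are hypothetical; scaling happens outside the model):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

user_features = pd.DataFrame({"user_id": [0, 1, 2], "age": [18, 35, 60], "income": [20_000, 55_000, 90_000]})
user_features[["age", "income"]] = MinMaxScaler().fit_transform(user_features[["age", "income"]])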
replay/models/nn/loss/__init__.py ADDED
@@ -0,0 +1 @@
+ from .sce import ScalableCrossEntropyLoss, SCEParams
replay/models/nn/loss/sce.py ADDED
@@ -0,0 +1,131 @@
+ from dataclasses import dataclass
+ from typing import Optional
+
+ import torch
+
+
+ @dataclass(frozen=True)
+ class SCEParams:
+     """Set of parameters for ScalableCrossEntropyLoss.
+
+     Constructor arguments:
+     :param n_buckets: Number of buckets into which samples will be distributed.
+     :param bucket_size_x: Number of item hidden representations that will be in each bucket.
+     :param bucket_size_y: Number of item embeddings that will be in each bucket.
+     :param mix_x: Whether a randomly generated matrix will be multiplied by the model output matrix or not.
+         Default: ``False``.
+     """
+
+     n_buckets: int
+     bucket_size_x: int
+     bucket_size_y: int
+     mix_x: bool = False
+
+     def _get_not_none_params(self):
+         return [self.n_buckets, self.bucket_size_x, self.bucket_size_y]
+
+
+ class ScalableCrossEntropyLoss:
+     def __init__(self, sce_params: SCEParams):
+         """
+         ScalableCrossEntropyLoss for Sequential Recommendations with Large Item Catalogs.
+         Reference article may be found at https://arxiv.org/pdf/2409.18721.
+
+         :param SCEParams: Dataclass with ScalableCrossEntropyLoss parameters.
+             Dataclass contains following values:
+             :param n_buckets: Number of buckets into which samples will be distributed.
+             :param bucket_size_x: Number of item hidden representations that will be in each bucket.
+             :param bucket_size_y: Number of item embeddings that will be in each bucket.
+             :param mix_x: Whether a randomly generated matrix will be multiplied by the model output matrix or not.
+                 Default: ``False``.
+         """
+         assert all(
+             param is not None for param in sce_params._get_not_none_params()
+         ), "You should define ``n_buckets``, ``bucket_size_x``, ``bucket_size_y`` when using SCE loss function."
+         self._n_buckets = sce_params.n_buckets
+         self._bucket_size_x = sce_params.bucket_size_x
+         self._bucket_size_y = sce_params.bucket_size_y
+         self._mix_x = sce_params.mix_x
+
+     def __call__(
+         self,
+         embeddings: torch.Tensor,
+         positive_labels: torch.LongTensor,
+         all_embeddings: torch.Tensor,
+         padding_mask: torch.BoolTensor,
+         tokens_mask: Optional[torch.BoolTensor] = None,
+     ) -> torch.Tensor:
+         """
+         ScalableCrossEntropyLoss computation.
+
+         :param embeddings: Matrix of the last transformer block outputs.
+         :param positive_labels: Positive labels.
+         :param all_embeddings: Matrix of all item embeddings.
+         :param padding_mask: Padding mask.
+         :param tokens_mask: Tokens mask (need only for Bert4Rec).
+             Default: ``None``.
+         """
+         masked_tokens = padding_mask if tokens_mask is None else ~(~padding_mask + tokens_mask)
+
+         hd = torch.tensor(embeddings.shape[-1])
+         x = embeddings.view(-1, hd)
+         y = positive_labels.view(-1)
+         w = all_embeddings
+
+         correct_class_logits_ = (x * torch.index_select(w, dim=0, index=y)).sum(dim=1)  # (bs,)
+
+         with torch.no_grad():
+             if self._mix_x:
+                 omega = 1 / torch.sqrt(torch.sqrt(hd)) * torch.randn(x.shape[0], self._n_buckets, device=x.device)
+                 buckets = omega.T @ x
+                 del omega
+             else:
+                 buckets = (
+                     1 / torch.sqrt(torch.sqrt(hd)) * torch.randn(self._n_buckets, hd, device=x.device)
+                 )  # (n_b, hd)
+
+         with torch.no_grad():
+             x_bucket = buckets @ x.T  # (n_b, hd) x (hd, b) -> (n_b, b)
+             x_bucket[:, ~padding_mask.view(-1)] = float("-inf")
+             _, top_x_bucket = torch.topk(x_bucket, dim=1, k=self._bucket_size_x)  # (n_b, bs_x)
+             del x_bucket
+
+             y_bucket = buckets @ w.T  # (n_b, hd) x (hd, n_cl) -> (n_b, n_cl)
+
+             _, top_y_bucket = torch.topk(y_bucket, dim=1, k=self._bucket_size_y)  # (n_b, bs_y)
+             del y_bucket
+
+         x_bucket = torch.gather(x, 0, top_x_bucket.view(-1, 1).expand(-1, hd)).view(
+             self._n_buckets, self._bucket_size_x, hd
+         )  # (n_b, bs_x, hd)
+         y_bucket = torch.gather(w, 0, top_y_bucket.view(-1, 1).expand(-1, hd)).view(
+             self._n_buckets, self._bucket_size_y, hd
+         )  # (n_b, bs_y, hd)
+
+         wrong_class_logits = x_bucket @ y_bucket.transpose(-1, -2)  # (n_b, bs_x, bs_y)
+         mask = (
+             torch.index_select(y, dim=0, index=top_x_bucket.view(-1)).view(self._n_buckets, self._bucket_size_x)[
+                 :, :, None
+             ]
+             == top_y_bucket[:, None, :]
+         )  # (n_b, bs_x, bs_y)
+         wrong_class_logits = wrong_class_logits.masked_fill(mask, float("-inf"))  # (n_b, bs_x, bs_y)
+         correct_class_logits = torch.index_select(correct_class_logits_, dim=0, index=top_x_bucket.view(-1)).view(
+             self._n_buckets, self._bucket_size_x
+         )[
+             :, :, None
+         ]  # (n_b, bs_x, 1)
+         logits = torch.cat((wrong_class_logits, correct_class_logits), dim=2)  # (n_b, bs_x, bs_y + 1)
+
+         loss_ = torch.nn.functional.cross_entropy(
+             logits.view(-1, logits.shape[-1]),
+             (logits.shape[-1] - 1)
+             * torch.ones(logits.shape[0] * logits.shape[1], dtype=torch.int64, device=logits.device),
+             reduction="none",
+         )  # (n_b * bs_x,)
+         loss = torch.zeros(x.shape[0], device=x.device, dtype=x.dtype)
+         loss.scatter_reduce_(0, top_x_bucket.view(-1), loss_, reduce="amax", include_self=False)
+         loss = loss[(loss != 0) & (masked_tokens).view(-1)]
+         loss = torch.mean(loss)
+
+         return loss
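A minimal sketch of calling the new loss directly; shapes and parameter values below are illustrative only, and in normal use SasRec wires the loss up internally when loss_type="SCE" (see the sasrec/lightning.py changes further down):

import torch
from replay.models.nn.loss import ScalableCrossEntropyLoss, SCEParams

batch_size, seq_len, hidden_dim, n_items = 4, 8, 64, 1_000
loss_fn = ScalableCrossEntropyLoss(SCEParams(n_buckets=16, bucket_size_x=8, bucket_size_y=32))

embeddings = torch.randn(batch_size, seq_len, hidden_dim)           # last transformer block outputs
positive_labels = torch.randint(0, n_items, (batch_size, seq_len))  # target item ids
all_embeddings = torch.randn(n_items, hidden_dim)                   # full item embedding table
padding_mask = torch.ones(batch_size, seq_len, dtype=torch.bool)    # no padding in this toy batch

loss = loss_fn(embeddings, positive_labels, all_embeddings, padding_mask)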
replay/models/nn/sequential/bert4rec/lightning.py CHANGED
@@ -1,5 +1,5 @@
  import math
- from typing import Any, Dict, Optional, Tuple, Union, cast
+ from typing import Any, Dict, Literal, Optional, Tuple, Union, cast

  import lightning
  import torch
@@ -27,7 +27,7 @@ class Bert4Rec(lightning.LightningModule):
          pass_per_transformer_block_count: int = 1,
          enable_positional_embedding: bool = True,
          enable_embedding_tying: bool = False,
-         loss_type: str = "CE",
+         loss_type: Literal["BCE", "CE", "CE_restricted"] = "CE",
          loss_sample_count: Optional[int] = None,
          negative_sampling_strategy: str = "global_uniform",
          negatives_sharing: bool = False,
@@ -54,7 +54,7 @@ class Bert4Rec(lightning.LightningModule):
              If `True` - result scores are calculated by dot product of input and output embeddings,
              if `False` - default linear layer is applied to calculate logits for each item.
              Default: ``False``.
-         :param loss_type: Loss type. Possible values: ``"CE"``, ``"BCE"``.
+         :param loss_type: Loss type. Possible values: ``"CE"``, ``"BCE"``, ``"CE_restricted"``.
              Default: ``CE``.
          :param loss_sample_count (Optional[int]): Sample count to calculate loss.
              Default: ``None``.
@@ -197,6 +197,8 @@
              loss_func = self._compute_loss_bce if self._loss_sample_count is None else self._compute_loss_bce_sampled
          elif self._loss_type == "CE":
              loss_func = self._compute_loss_ce if self._loss_sample_count is None else self._compute_loss_ce_sampled
+         elif self._loss_type == "CE_restricted":
+             loss_func = self._compute_loss_ce_restricted
          else:
              msg = f"Not supported loss type: {self._loss_type}"
              raise ValueError(msg)
@@ -316,6 +318,20 @@
          loss = self._loss(logits, labels_flat)
          return loss

+     def _compute_loss_ce_restricted(
+         self,
+         feature_tensors: TensorMap,
+         positive_labels: torch.LongTensor,
+         padding_mask: torch.BoolTensor,
+         tokens_mask: torch.BoolTensor,
+     ) -> torch.Tensor:
+         (logits, labels) = self._get_restricted_logits_for_ce_loss(
+             feature_tensors, positive_labels, padding_mask, tokens_mask
+         )
+
+         loss = self._loss(logits, labels)
+         return loss
+
      def _get_sampled_logits(
          self,
          feature_tensors: TensorMap,
@@ -398,11 +414,27 @@
              vocab_size,
          )

+     def _get_restricted_logits_for_ce_loss(
+         self,
+         feature_tensors: TensorMap,
+         positive_labels: torch.LongTensor,
+         padding_mask: torch.BoolTensor,
+         tokens_mask: torch.BoolTensor,
+     ):
+         labels_mask = (~padding_mask) + tokens_mask
+         masked_tokens = ~labels_mask
+         positive_labels = cast(
+             torch.LongTensor, torch.masked_select(positive_labels, masked_tokens)
+         )  # (masked_batch_seq_size,)
+         output_emb = self._model.forward_step(feature_tensors, padding_mask, tokens_mask)[masked_tokens]
+         logits = self._model.get_logits(output_emb)
+         return (logits, positive_labels)
+
      def _create_loss(self) -> Union[torch.nn.BCEWithLogitsLoss, torch.nn.CrossEntropyLoss]:
          if self._loss_type == "BCE":
              return torch.nn.BCEWithLogitsLoss(reduction="sum")

-         if self._loss_type == "CE":
+         if self._loss_type == "CE" or self._loss_type == "CE_restricted":
              return torch.nn.CrossEntropyLoss()

          msg = "Not supported loss_type"
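What the new "CE_restricted" option restricts: logits are computed only at positions where padding_mask is True and tokens_mask is False, and plain cross-entropy is applied to that subset. A toy illustration of the mask arithmetic used by _get_restricted_logits_for_ce_loss (values made up):

import torch

padding_mask = torch.tensor([[True, True, True, False]])
tokens_mask = torch.tensor([[True, False, True, False]])
labels_mask = (~padding_mask) + tokens_mask   # boolean "+" acts as logical OR
masked_tokens = ~labels_mask
print(masked_tokens)  # tensor([[False,  True, False, False]]) -> positions where the loss is taken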
replay/models/nn/sequential/bert4rec/model.py CHANGED
@@ -1,9 +1,9 @@
  import contextlib
- import math
  from abc import ABC, abstractmethod
  from typing import Dict, Optional, Union

  import torch
+ import torch.nn as nn

  from replay.data.nn import TensorFeatureInfo, TensorMap, TensorSchema

@@ -379,7 +379,7 @@ class BaseHead(ABC, torch.nn.Module):
              item_embeddings = item_embeddings[item_ids]
              bias = bias[item_ids]

-         logits = item_embeddings.matmul(out_embeddings.unsqueeze(-1)).squeeze(-1) + bias
+         logits = torch.nn.functional.linear(out_embeddings, item_embeddings, bias)
          return logits

      @abstractmethod
@@ -471,11 +471,11 @@ class TransformerBlock(torch.nn.Module):
          super().__init__()
          self.attention = torch.nn.MultiheadAttention(hidden_size, attn_heads, dropout=dropout, batch_first=True)
          self.attention_dropout = torch.nn.Dropout(dropout)
-         self.attention_norm = LayerNorm(hidden_size)
+         self.attention_norm = torch.nn.LayerNorm(hidden_size)

          self.pff = PositionwiseFeedForward(d_model=hidden_size, d_ff=feed_forward_hidden, dropout=dropout)
          self.pff_dropout = torch.nn.Dropout(dropout)
-         self.pff_norm = LayerNorm(hidden_size)
+         self.pff_norm = torch.nn.LayerNorm(hidden_size)

          self.dropout = torch.nn.Dropout(p=dropout)

@@ -501,33 +501,6 @@
          return self.dropout(z)


- class LayerNorm(torch.nn.Module):
-     """
-     Construct a layernorm module (See citation for details).
-     """
-
-     def __init__(self, features: int, eps: float = 1e-6):
-         """
-         :param features: Number of features.
-         :param eps: A value added to the denominator for numerical stability.
-             Default: ``1e-6``.
-         """
-         super().__init__()
-         self.a_2 = torch.nn.Parameter(torch.ones(features))
-         self.b_2 = torch.nn.Parameter(torch.zeros(features))
-         self.eps = eps
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         :param x: Input tensor.
-
-         :returns: Normalized input tensor.
-         """
-         mean = x.mean(-1, keepdim=True)
-         std = x.std(-1, keepdim=True)
-         return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
-
-
  class PositionwiseFeedForward(torch.nn.Module):
      """
      Implements FFN equation.
@@ -544,7 +517,7 @@ class PositionwiseFeedForward(torch.nn.Module):
          self.w_1 = torch.nn.Linear(d_model, d_ff)
          self.w_2 = torch.nn.Linear(d_ff, d_model)
          self.dropout = torch.nn.Dropout(dropout)
-         self.activation = GELU()
+         self.activation = nn.GELU()

      def forward(self, x: torch.Tensor) -> torch.Tensor:
          """
@@ -553,17 +526,3 @@
          :returns: Position wised output.
          """
          return self.w_2(self.dropout(self.activation(self.w_1(x))))
-
-
- class GELU(torch.nn.Module):
-     """
-     Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU
-     """
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         :param x: Input tensor.
-
-         :returns: Activated input tensor.
-         """
-         return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
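The hand-rolled LayerNorm and GELU above are dropped in favour of the torch.nn built-ins. Note the swap is not bit-identical: the removed GELU used the tanh approximation (torch.nn.GELU defaults to the exact erf form), and the removed LayerNorm added eps to the standard deviation rather than to the variance. A quick standalone check of the GELU point (not part of the package):

import math
import torch

x = torch.randn(2, 8)
old_gelu = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print(torch.allclose(torch.nn.GELU(approximate="tanh")(x), old_gelu, atol=1e-6))  # True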
replay/models/nn/sequential/sasrec/lightning.py CHANGED
@@ -1,10 +1,11 @@
  import math
- from typing import Any, Dict, Optional, Tuple, Union, cast
+ from typing import Any, Dict, Literal, Optional, Tuple, Union, cast

  import lightning
  import torch

  from replay.data.nn import TensorMap, TensorSchema
+ from replay.models.nn.loss import ScalableCrossEntropyLoss, SCEParams
  from replay.models.nn.optimizer_utils import FatOptimizerFactory, LRSchedulerFactory, OptimizerFactory

  from .dataset import SasRecPredictionBatch, SasRecTrainingBatch, SasRecValidationBatch
@@ -29,12 +30,13 @@ class SasRec(lightning.LightningModule):
          dropout_rate: float = 0.2,
          ti_modification: bool = False,
          time_span: int = 256,
-         loss_type: str = "CE",
+         loss_type: Literal["BCE", "CE", "SCE"] = "CE",
          loss_sample_count: Optional[int] = None,
          negative_sampling_strategy: str = "global_uniform",
          negatives_sharing: bool = False,
          optimizer_factory: OptimizerFactory = FatOptimizerFactory(),
          lr_scheduler_factory: Optional[LRSchedulerFactory] = None,
+         sce_params: Optional[SCEParams] = None,
      ):
          """
          :param tensor_schema: Tensor schema of features.
@@ -52,9 +54,10 @@
              Default: ``False``.
          :param time_span: Time span value.
              Default: ``256``.
-         :param loss_type: Loss type. Possible values: ``"CE"``, ``"BCE"``.
+         :param loss_type: Loss type. Possible values: ``"CE"``, ``"BCE"``, ``"SCE"``.
              Default: ``CE``.
          :param loss_sample_count (Optional[int]): Sample count to calculate loss.
+             Suitable for ``"CE"`` and ``"BCE"`` loss functions.
              Default: ``None``.
          :param negative_sampling_strategy: Negative sampling strategy to calculate loss on sampled negatives.
              Is used when large count of items in dataset.
@@ -66,6 +69,8 @@
              Default: ``FatOptimizerFactory``.
          :param lr_scheduler_factory: Learning rate schedule factory.
              Default: ``None``.
+         :param sce_params: Dataclass with SCE parameters. Need to be defined if ``loss_type`` is ``SCE``.
+             Default: ``None``.
          """
          super().__init__()
          self.save_hyperparameters()
@@ -85,9 +90,12 @@
          self._negatives_sharing = negatives_sharing
          self._optimizer_factory = optimizer_factory
          self._lr_scheduler_factory = lr_scheduler_factory
+         self._sce_params = sce_params
          self._loss = self._create_loss()
          self._schema = tensor_schema
          assert negative_sampling_strategy in {"global_uniform", "inbatch"}
+         if self._loss_type == "SCE":
+             assert sce_params is not None, "You should define ``sce_params`` when using SCE loss function."

          item_count = tensor_schema.item_id_features.item().cardinality
          assert item_count
@@ -197,6 +205,8 @@
              loss_func = self._compute_loss_bce if self._loss_sample_count is None else self._compute_loss_bce_sampled
          elif self._loss_type == "CE":
              loss_func = self._compute_loss_ce if self._loss_sample_count is None else self._compute_loss_ce_sampled
+         elif self._loss_type == "SCE":
+             loss_func = self._compute_loss_scalable_ce
          else:
              msg = f"Not supported loss type: {self._loss_type}"
              raise ValueError(msg)
@@ -314,6 +324,17 @@
          loss = self._loss(logits, labels_flat)
          return loss

+     def _compute_loss_scalable_ce(
+         self,
+         feature_tensors: TensorMap,
+         positive_labels: torch.LongTensor,
+         padding_mask: torch.BoolTensor,
+         tokens_mask: torch.BoolTensor,  # noqa: ARG002
+     ) -> torch.Tensor:
+         emb = self._model.forward_step(feature_tensors, padding_mask)
+         all_embeddings = self.get_all_embeddings()["item_embedding"]
+         return self._loss(emb, positive_labels, all_embeddings, padding_mask)
+
      def _get_sampled_logits(
          self,
          feature_tensors: TensorMap,
@@ -401,6 +422,9 @@
          if self._loss_type == "CE":
              return torch.nn.CrossEntropyLoss()

+         if self._loss_type == "SCE":
+             return ScalableCrossEntropyLoss(self._sce_params)
+
          msg = "Not supported loss_type"
          raise NotImplementedError(msg)

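A hedged end-to-end sketch of enabling SCE in SasRec 0.19.0. The one-feature schema and the import paths below follow the library's public data API as I understand it and may need adjusting to your setup; bucket sizes are illustrative:

from replay.data import FeatureHint, FeatureType
from replay.data.nn import TensorFeatureInfo, TensorSchema
from replay.models.nn.loss import SCEParams
from replay.models.nn.sequential import SasRec

tensor_schema = TensorSchema(
    TensorFeatureInfo(
        "item_id",
        feature_type=FeatureType.CATEGORICAL,
        is_seq=True,
        feature_hint=FeatureHint.ITEM_ID,
        cardinality=10_000,
    )
)
model = SasRec(
    tensor_schema,
    loss_type="SCE",
    sce_params=SCEParams(n_buckets=64, bucket_size_x=128, bucket_size_y=128),  # required when loss_type="SCE"
)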
replay/models/nn/sequential/sasrec/model.py CHANGED
@@ -298,7 +298,7 @@ class EmbeddingTyingHead(torch.nn.Module):
          if len(item_embeddings.shape) > 2:  # global_uniform, negative sharing=False, train only
              logits = (item_embeddings * out_embeddings.unsqueeze(-2)).sum(dim=-1)
          else:
-             logits = item_embeddings.matmul(out_embeddings.unsqueeze(-1)).squeeze(-1)
+             logits = torch.matmul(out_embeddings, item_embeddings.t())
          return logits


replay/preprocessing/filters.py CHANGED
@@ -4,7 +4,8 @@ Select or remove data by some criteria

  from abc import ABC, abstractmethod
  from datetime import datetime, timedelta
- from typing import Callable, Optional, Tuple, Union
+ from typing import Callable, Literal, Optional, Tuple, Union
+ from uuid import uuid4

  import numpy as np
  import pandas as pd
@@ -989,3 +990,103 @@ class QuantileItemsFilter(_BaseFilter):
          )
          short_tail = short_tail.filter(sf.col("index") > sf.col("num_items_to_delete"))
          return long_tail.select(df.columns).union(short_tail.select(df.columns))
+
+
+ class ConsecutiveDuplicatesFilter(_BaseFilter):
+     """Removes consecutive duplicate items from sequential dataset.
+
+     >>> import datetime as dt
+     >>> import pandas as pd
+     >>> from replay.utils.spark_utils import convert2spark
+     >>> interactions = pd.DataFrame({
+     ...     "user_id": ["u0", "u1", "u1", "u0", "u0", "u0", "u1", "u0"],
+     ...     "item_id": ["i0", "i1", "i1", "i2", "i0", "i1", "i2", "i1"],
+     ...     "timestamp": [dt.datetime(2024, 1, 1) + dt.timedelta(days=i) for i in range(8)]
+     ... })
+     >>> interactions = convert2spark(interactions)
+     >>> interactions.show()
+     +-------+-------+-------------------+
+     |user_id|item_id|          timestamp|
+     +-------+-------+-------------------+
+     |     u0|     i0|2024-01-01 00:00:00|
+     |     u1|     i1|2024-01-02 00:00:00|
+     |     u1|     i1|2024-01-03 00:00:00|
+     |     u0|     i2|2024-01-04 00:00:00|
+     |     u0|     i0|2024-01-05 00:00:00|
+     |     u0|     i1|2024-01-06 00:00:00|
+     |     u1|     i2|2024-01-07 00:00:00|
+     |     u0|     i1|2024-01-08 00:00:00|
+     +-------+-------+-------------------+
+     <BLANKLINE>
+
+     >>> ConsecutiveDuplicatesFilter(query_column="user_id").transform(interactions).show()
+     +-------+-------+-------------------+
+     |user_id|item_id|          timestamp|
+     +-------+-------+-------------------+
+     |     u0|     i0|2024-01-01 00:00:00|
+     |     u0|     i2|2024-01-04 00:00:00|
+     |     u0|     i0|2024-01-05 00:00:00|
+     |     u0|     i1|2024-01-06 00:00:00|
+     |     u1|     i1|2024-01-02 00:00:00|
+     |     u1|     i2|2024-01-07 00:00:00|
+     +-------+-------+-------------------+
+     <BLANKLINE>
+     """
+
+     def __init__(
+         self,
+         keep: Literal["first", "last"] = "first",
+         query_column: str = "query_id",
+         item_column: str = "item_id",
+         timestamp_column: str = "timestamp",
+     ) -> None:
+         """
+         :param keep: whether to keep first or last occurrence,
+             Default: ``first``.
+         :param query_column: query column,
+             Default: ``query_id``.
+         :param item_column: item column,
+             Default: ``item_id``.
+         :param timestamp_column: timestamp column,
+             Default: ``timestamp``.
+         """
+         super().__init__()
+         self.query_column = query_column
+         self.item_column = item_column
+         self.timestamp_column = timestamp_column
+
+         if keep not in ("first", "last"):
+             msg = "`keep` must be either 'first' or 'last'"
+             raise ValueError(msg)
+
+         self.bias = 1 if keep == "first" else -1
+         self.temporary_column = f"__shifted_{uuid4().hex[:8]}"
+
+     def _filter_pandas(self, interactions: PandasDataFrame) -> PandasDataFrame:
+         interactions = interactions.sort_values(self.timestamp_column)
+         interactions[self.temporary_column] = interactions.groupby(self.query_column)[self.item_column].shift(
+             periods=self.bias
+         )
+         return (
+             interactions[interactions[self.item_column] != interactions[self.temporary_column]]
+             .drop(self.temporary_column, axis=1)
+             .reset_index(drop=True)
+         )
+
+     def _filter_polars(self, interactions: PolarsDataFrame) -> PolarsDataFrame:
+         return (
+             interactions.sort(self.timestamp_column)
+             .with_columns(
+                 pl.col(self.item_column).shift(n=self.bias).over(self.query_column).alias(self.temporary_column)
+             )
+             .filter((pl.col(self.item_column) != pl.col(self.temporary_column)).fill_null(True))
+             .drop(self.temporary_column)
+         )
+
+     def _filter_spark(self, interactions: SparkDataFrame) -> SparkDataFrame:
+         window = Window.partitionBy(self.query_column).orderBy(self.timestamp_column)
+         return (
+             interactions.withColumn(self.temporary_column, sf.lag(self.item_column, offset=self.bias).over(window))
+             .where((sf.col(self.item_column) != sf.col(self.temporary_column)) | sf.col(self.temporary_column).isNull())
+             .drop(self.temporary_column)
+         )
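The docstring above demonstrates the Spark path; the filter also ships pandas and Polars implementations. A hedged pandas example with keep="last" (same column names as the doctest; behaviour assumed symmetric to the Spark example):

import datetime as dt
import pandas as pd
from replay.preprocessing.filters import ConsecutiveDuplicatesFilter

interactions = pd.DataFrame({
    "user_id": ["u0", "u0", "u0"],
    "item_id": ["i0", "i0", "i1"],
    "timestamp": [dt.datetime(2024, 1, 1) + dt.timedelta(days=i) for i in range(3)],
})
filtered = ConsecutiveDuplicatesFilter(keep="last", query_column="user_id").transform(interactions)
print(filtered["item_id"].tolist())  # expected ["i0", "i1"]: the first element of the duplicate run is dropped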
replay/preprocessing/label_encoder.py CHANGED
@@ -10,7 +10,6 @@ import abc
  import json
  import os
  import warnings
- from itertools import chain
  from pathlib import Path
  from typing import Dict, List, Literal, Mapping, Optional, Sequence, Union

@@ -27,7 +26,7 @@ from replay.utils import (

  if PYSPARK_AVAILABLE:
      from pyspark.sql import Window, functions as sf  # noqa: I001
-     from pyspark.sql.types import LongType
+     from pyspark.sql.types import LongType, IntegerType, ArrayType

  HandleUnknownStrategies = Literal["error", "use_default_value", "drop"]

@@ -336,6 +335,7 @@ class LabelEncodingRule(BaseLabelEncodingRule):
                  "with `handle_unknown_strategy=drop` leads to empty dataframe",
                  LabelEncoderTransformWarning,
              )
+             joined_df[self._target_col] = joined_df[self._target_col].astype("int")
          elif self._handle_unknown == "error":
              unknown_unique_labels = joined_df[self._col][unknown_mask].unique().tolist()
              msg = f"Found unknown labels {unknown_unique_labels} in column {self._col} during transform"
@@ -629,8 +629,12 @@ class SequenceEncodingRule(LabelEncodingRule):
          return self

      def _transform_spark(self, df: SparkDataFrame, default_value: Optional[int]) -> SparkDataFrame:
-         map_expr = sf.create_map([sf.lit(x) for x in chain(*self.get_mapping().items())])
-         encoded_df = df.withColumn(self._target_col, sf.transform(self.column, lambda x: map_expr.getItem(x)))
+         def mapper_udf(x):
+             return [mapping.get(value) for value in x]  # pragma: no cover
+
+         mapping = self.get_mapping()
+         call_mapper_udf = sf.udf(mapper_udf, ArrayType(IntegerType()))
+         encoded_df = df.withColumn(self._target_col, call_mapper_udf(sf.col(self.column)))

          if self._handle_unknown == "drop":
              encoded_df = encoded_df.withColumn(self._target_col, sf.filter(self._target_col, lambda x: x.isNotNull()))