replay-rec 0.17.0rc0__py3-none-any.whl → 0.17.1rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/dataset.py +246 -20
- replay/data/nn/schema.py +42 -0
- replay/data/nn/sequence_tokenizer.py +17 -47
- replay/data/nn/sequential_dataset.py +76 -2
- replay/preprocessing/filters.py +169 -4
- replay/splitters/base_splitter.py +1 -1
- replay/utils/common.py +107 -5
- replay/utils/spark_utils.py +13 -6
- {replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/METADATA +3 -3
- {replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/RECORD +14 -14
- {replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/LICENSE +0 -0
- {replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/NOTICE +0 -0
- {replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/WHEEL +0 -0
replay/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 """ RecSys library """
-__version__ = "0.17.
+__version__ = "0.17.1.preview"
replay/data/dataset.py
CHANGED
@@ -3,11 +3,22 @@
 """
 from __future__ import annotations

-
+import json
+from pathlib import Path
+from typing import Callable, Dict, Iterable, List, Optional, Sequence, Union

 import numpy as np
-
-from
+from pandas import read_parquet as pd_read_parquet
+from polars import read_parquet as pl_read_parquet
+
+from replay.utils import (
+    PYSPARK_AVAILABLE,
+    DataFrameLike,
+    PandasDataFrame,
+    PolarsDataFrame,
+    SparkDataFrame,
+)
+from replay.utils.session_handler import get_spark_session

 from .schema import FeatureHint, FeatureInfo, FeatureSchema, FeatureSource, FeatureType

@@ -47,9 +58,7 @@ class Dataset:
         self._query_features = query_features
         self._item_features = item_features

-        self.
-        self.is_spark = isinstance(interactions, SparkDataFrame)
-        self.is_polars = isinstance(interactions, PolarsDataFrame)
+        self._assign_df_type()

         self._categorical_encoded = categorical_encoded

@@ -74,16 +83,8 @@ class Dataset:
             msg = "Interactions and query features should have the same type."
             raise TypeError(msg)

-        self.
-
-            FeatureSource.QUERY_FEATURES: self.query_features,
-            FeatureSource.ITEM_FEATURES: self.item_features,
-        }
-
-        self._ids_feature_map: Dict[FeatureHint, DataFrameLike] = {
-            FeatureHint.QUERY_ID: self.query_features if self.query_features is not None else self.interactions,
-            FeatureHint.ITEM_ID: self.item_features if self.item_features is not None else self.interactions,
-        }
+        self._get_feature_source_map()
+        self._get_ids_source_map()

         self._feature_schema = self._fill_feature_schema(feature_schema)

@@ -92,7 +93,6 @@
                 self._check_ids_consistency(hint=FeatureHint.QUERY_ID)
             if self.item_features is not None:
                 self._check_ids_consistency(hint=FeatureHint.ITEM_ID)
-
             if self._categorical_encoded:
                 self._check_encoded()

@@ -189,6 +189,157 @@ class Dataset:
         """
         return self._feature_schema

+    def _get_df_type(self) -> str:
+        """
+        :returns: Stored dataframe type.
+        """
+        if self.is_spark:
+            return "spark"
+        if self.is_pandas:
+            return "pandas"
+        if self.is_polars:
+            return "polars"
+        msg = "No known dataframe types are provided"
+        raise ValueError(msg)
+
+    def _to_parquet(self, df: DataFrameLike, path: Path) -> None:
+        """
+        Save the content of the dataframe in parquet format to the provided path.
+
+        :param df: Dataframe to save.
+        :param path: Path to save the dataframe to.
+        """
+        if self.is_spark:
+            path = str(path)
+            df = df.withColumn("idx", sf.monotonically_increasing_id())
+            df.write.mode("overwrite").parquet(path)
+        elif self.is_pandas:
+            df.to_parquet(path)
+        elif self.is_polars:
+            df.write_parquet(path)
+        else:
+            msg = """
+            _to_parquet() can only be used to save polars|pandas|spark dataframes;
+            No known dataframe types are provided
+            """
+            raise TypeError(msg)
+
+    @staticmethod
+    def _read_parquet(path: Path, mode: str) -> Union[SparkDataFrame, PandasDataFrame, PolarsDataFrame]:
+        """
+        Read the parquet file as dataframe.
+
+        :param path: The parquet file path.
+        :param mode: Dataframe type. Can be spark|pandas|polars.
+        :returns: The dataframe read from the file.
+        """
+        if mode == "spark":
+            path = str(path)
+            spark_session = get_spark_session()
+            df = spark_session.read.parquet(path)
+            if "idx" in df.columns:
+                df = df.orderBy("idx").drop("idx")
+            return df
+        if mode == "pandas":
+            df = pd_read_parquet(path)
+            if "idx" in df.columns:
+                df = df.set_index("idx").reset_index(drop=True)
+            return df
+        if mode == "polars":
+            df = pl_read_parquet(path, use_pyarrow=True)
+            if "idx" in df.columns:
+                df = df.sort("idx").drop("idx")
+            return df
+        msg = f"_read_parquet() can only be used to read polars|pandas|spark dataframes, not {mode}"
+        raise TypeError(msg)
+
+    def save(self, path: str) -> None:
+        """
+        Save the Dataset to the provided path.
+
+        :param path: Path to save the Dataset to.
+        """
+        dataset_dict = {}
+        dataset_dict["_class_name"] = self.__class__.__name__
+
+        interactions_type = self._get_df_type()
+        dataset_dict["init_args"] = {
+            "feature_schema": [],
+            "interactions": interactions_type,
+            "item_features": (interactions_type if self.item_features is not None else None),
+            "query_features": (interactions_type if self.query_features is not None else None),
+            "check_consistency": False,
+            "categorical_encoded": self._categorical_encoded,
+        }
+
+        for feature in self.feature_schema.all_features:
+            dataset_dict["init_args"]["feature_schema"].append(
+                {
+                    "column": feature.column,
+                    "feature_type": feature.feature_type.name,
+                    "feature_hint": (feature.feature_hint.name if feature.feature_hint else None),
+                }
+            )
+
+        base_path = Path(path).with_suffix(".replay").resolve()
+        base_path.mkdir(parents=True, exist_ok=True)
+
+        with open(base_path / "init_args.json", "w+") as file:
+            json.dump(dataset_dict, file)
+
+        df_data = {
+            "interactions": self.interactions,
+            "item_features": self.item_features,
+            "query_features": self.query_features,
+        }
+
+        for df_name, df in df_data.items():
+            if df is not None:
+                df_path = base_path / f"{df_name}.parquet"
+                self._to_parquet(df, df_path)
+
+    @classmethod
+    def load(
+        cls,
+        path: str,
+        dataframe_type: Optional[str] = None,
+    ) -> Dataset:
+        """
+        Load the Dataset from the provided path.
+
+        :param path: The file path
+        :dataframe_type: Dataframe type to use to store internal data.
+            Can be spark|pandas|polars|None.
+            If not provided automatically sets to the one used when the Dataset was saved.
+        :returns: Loaded Dataset.
+        """
+        base_path = Path(path).with_suffix(".replay").resolve()
+        with open(base_path / "init_args.json", "r") as file:
+            dataset_dict = json.loads(file.read())
+
+        if dataframe_type not in ["pandas", "spark", "polars", None]:
+            msg = f"Argument dataframe_type can be spark|pandas|polars|None, not {dataframe_type}"
+            raise ValueError(msg)
+
+        feature_schema_data = dataset_dict["init_args"]["feature_schema"]
+        features_list = []
+        for feature_data in feature_schema_data:
+            f_type = feature_data["feature_type"]
+            f_hint = feature_data["feature_hint"]
+            feature_data["feature_type"] = FeatureType[f_type] if f_type else None
+            feature_data["feature_hint"] = FeatureHint[f_hint] if f_hint else None
+            features_list.append(FeatureInfo(**feature_data))
+        dataset_dict["init_args"]["feature_schema"] = FeatureSchema(features_list)
+
+        for df_name in ["interactions", "query_features", "item_features"]:
+            df_type = dataset_dict["init_args"][df_name]
+            if df_type:
+                df_type = dataframe_type or df_type
+                load_path = base_path / f"{df_name}.parquet"
+                dataset_dict["init_args"][df_name] = cls._read_parquet(load_path, df_type)
+        dataset = cls(**dataset_dict["init_args"])
+        return dataset
+
     if PYSPARK_AVAILABLE:

         def persist(self, storage_level: StorageLevel = StorageLevel(True, True, False, True, 1)) -> None:
@@ -283,6 +434,24 @@ class Dataset:
             categorical_encoded=self._categorical_encoded,
         )

+    def _get_feature_source_map(self):
+        self._feature_source_map: Dict[FeatureSource, DataFrameLike] = {
+            FeatureSource.INTERACTIONS: self.interactions,
+            FeatureSource.QUERY_FEATURES: self.query_features,
+            FeatureSource.ITEM_FEATURES: self.item_features,
+        }
+
+    def _get_ids_source_map(self):
+        self._ids_feature_map: Dict[FeatureHint, DataFrameLike] = {
+            FeatureHint.QUERY_ID: self.query_features if self.query_features is not None else self.interactions,
+            FeatureHint.ITEM_ID: self.item_features if self.item_features is not None else self.interactions,
+        }
+
+    def _assign_df_type(self):
+        self.is_pandas = isinstance(self.interactions, PandasDataFrame)
+        self.is_spark = isinstance(self.interactions, SparkDataFrame)
+        self.is_polars = isinstance(self.interactions, PolarsDataFrame)
+
     def _get_cardinality(self, feature: FeatureInfo) -> Callable:
         def callback(column: str) -> int:
             if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
@@ -381,7 +550,11 @@ class Dataset:
             is_consistent = (
                 self.interactions.select(ids_column)
                 .distinct()
-                .join(
+                .join(
+                    features_df.select(ids_column).distinct(),
+                    on=[ids_column],
+                    how="leftanti",
+                )
                 .count()
             ) == 0
         else:
@@ -389,7 +562,11 @@ class Dataset:
                 len(
                     self.interactions.select(ids_column)
                     .unique()
-                    .join(
+                    .join(
+                        features_df.select(ids_column).unique(),
+                        on=ids_column,
+                        how="anti",
+                    )
                 )
                 == 0
             )
@@ -399,7 +576,11 @@ class Dataset:
             raise ValueError(msg)

     def _check_column_encoded(
-        self,
+        self,
+        data: DataFrameLike,
+        column: str,
+        source: FeatureSource,
+        cardinality: Optional[int],
     ) -> None:
         """
         Checks that IDs are encoded:
@@ -482,6 +663,51 @@ class Dataset:
                 feature.cardinality,
             )

+    def to_pandas(self) -> None:
+        """
+        Convert internally stored dataframes to pandas.DataFrame.
+        """
+        from replay.utils.common import convert2pandas
+
+        self._interactions = convert2pandas(self._interactions)
+        if self._query_features is not None:
+            self._query_features = convert2pandas(self._query_features)
+        if self._item_features is not None:
+            self._item_features = convert2pandas(self.item_features)
+        self._get_feature_source_map()
+        self._get_ids_source_map()
+        self._assign_df_type()
+
+    def to_spark(self):
+        """
+        Convert internally stored dataframes to pyspark.sql.DataFrame.
+        """
+        from replay.utils.common import convert2spark
+
+        self._interactions = convert2spark(self._interactions)
+        if self._query_features is not None:
+            self._query_features = convert2spark(self._query_features)
+        if self._item_features is not None:
+            self._item_features = convert2spark(self._item_features)
+        self._get_feature_source_map()
+        self._get_ids_source_map()
+        self._assign_df_type()
+
+    def to_polars(self):
+        """
+        Convert internally stored dataframes to polars.DataFrame.
+        """
+        from replay.utils.common import convert2polars
+
+        self._interactions = convert2polars(self._interactions)
+        if self._query_features is not None:
+            self._query_features = convert2polars(self._query_features)
+        if self._item_features is not None:
+            self._item_features = convert2polars(self._item_features)
+        self._get_feature_source_map()
+        self._get_ids_source_map()
+        self._assign_df_type()
+

 def nunique(data: DataFrameLike, column: str) -> int:
     """
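Usage sketch (illustrative, not part of the diff): the new `save`/`load` pair writes a `<path>.replay` directory containing `init_args.json` plus one parquet file per stored dataframe, and `to_pandas`/`to_spark`/`to_polars` convert the stored dataframes in place. The toy data, column names and import paths below are assumptions based on the package layout shown in this diff.

    import pandas as pd

    from replay.data import Dataset, FeatureHint, FeatureInfo, FeatureSchema, FeatureType

    # Toy interaction log; the column names are illustrative.
    interactions = pd.DataFrame({"query_id": [0, 0, 1], "item_id": [10, 20, 20]})

    feature_schema = FeatureSchema(
        [
            FeatureInfo(column="query_id", feature_type=FeatureType.CATEGORICAL, feature_hint=FeatureHint.QUERY_ID),
            FeatureInfo(column="item_id", feature_type=FeatureType.CATEGORICAL, feature_hint=FeatureHint.ITEM_ID),
        ]
    )
    dataset = Dataset(feature_schema=feature_schema, interactions=interactions)

    # Persist to "my_dataset.replay/" (init_args.json + interactions.parquet).
    dataset.save("my_dataset")

    # Reload; dataframe_type overrides the backend recorded at save time.
    restored = Dataset.load("my_dataset", dataframe_type="polars")

    # New in-place conversion of the internally stored dataframes.
    restored.to_pandas()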
replay/data/nn/schema.py
CHANGED
@@ -408,6 +408,48 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
             return None
         return rating_features.item().name

+    def _get_object_args(self) -> Dict:
+        """
+        Returns list of features represented as dictionaries.
+        """
+        features = [
+            {
+                "name": feature.name,
+                "feature_type": feature.feature_type.name,
+                "is_seq": feature.is_seq,
+                "feature_hint": feature.feature_hint.name if feature.feature_hint else None,
+                "feature_sources": [
+                    {"source": x.source.name, "column": x.column, "index": x.index} for x in feature.feature_sources
+                ]
+                if feature.feature_sources
+                else None,
+                "cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
+                "embedding_dim": feature.embedding_dim if feature.feature_type == FeatureType.CATEGORICAL else None,
+                "tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
+            }
+            for feature in self.all_features
+        ]
+        return features
+
+    @classmethod
+    def _create_object_by_args(cls, args: Dict) -> "TensorSchema":
+        features_list = []
+        for feature_data in args:
+            feature_data["feature_sources"] = (
+                [
+                    TensorFeatureSource(source=FeatureSource[x["source"]], column=x["column"], index=x["index"])
+                    for x in feature_data["feature_sources"]
+                ]
+                if feature_data["feature_sources"]
+                else None
+            )
+            f_type = feature_data["feature_type"]
+            f_hint = feature_data["feature_hint"]
+            feature_data["feature_type"] = FeatureType[f_type] if f_type else None
+            feature_data["feature_hint"] = FeatureHint[f_hint] if f_hint else None
+            features_list.append(TensorFeatureInfo(**feature_data))
+        return TensorSchema(features_list)
+
     def filter(
         self,
         name: Optional[str] = None,
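The two private helpers above factor TensorSchema (de)serialization out of SequenceTokenizer.save/load so it can be reused (see the sequence_tokenizer.py and sequential_dataset.py sections below). A minimal round-trip sketch; the feature definition and import paths are illustrative assumptions:

    from replay.data import FeatureHint, FeatureType
    from replay.data.nn import TensorFeatureInfo, TensorSchema

    schema = TensorSchema(
        [
            TensorFeatureInfo(
                name="item_id",
                feature_type=FeatureType.CATEGORICAL,
                is_seq=True,
                feature_hint=FeatureHint.ITEM_ID,
                cardinality=1000,
            )
        ]
    )

    # Dump to a JSON-friendly list of dicts and rebuild an equivalent schema.
    args = schema._get_object_args()
    restored = TensorSchema._create_object_by_args(args)
    assert set(restored) == set(schema)  # same feature names (TensorSchema is a Mapping)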
replay/data/nn/sequence_tokenizer.py
CHANGED
@@ -24,7 +24,10 @@ SequenceDataFrameLike = Union[PandasDataFrame, PolarsDataFrame]

 class SequenceTokenizer:
     """
-    Data tokenizer for transformers
+    Data tokenizer for transformers;
+    Encodes all categorical features (the ones marked as FeatureType.CATEGORICAL in
+    the FeatureSchema) and stores all data as items sequences (sorted by time if a
+    feature of type FeatureHint.TIMESTAMP is provided, unsorted otherwise).
     """

     def __init__(
@@ -278,17 +281,17 @@
         ]

         for tensor_feature in tensor_schema.values():
-            source
-
+            for source in tensor_feature.feature_sources:
+                assert source is not None

-
-
-
+                # Some columns already added to encoder, skip them
+                if source.column in features_subset:
+                    continue

-
-
-
-
+                if isinstance(source.source, FeatureSource):
+                    features_subset.append(source.column)
+                else:
+                    assert False, "Unknown tensor feature source"

         return set(features_subset)

@@ -404,7 +407,7 @@

     @classmethod
     @deprecation_warning("with `use_pickle` equals to `True` will be deprecated in future versions")
-    def load(cls, path: str, use_pickle: bool = False) -> "SequenceTokenizer":
+    def load(cls, path: str, use_pickle: bool = False, **kwargs) -> "SequenceTokenizer":
         """
         Load tokenizer object from the given path.

@@ -422,18 +425,7 @@

         # load tensor_schema, tensor_features
         tensor_schema_data = tokenizer_dict["init_args"]["tensor_schema"]
-
-        for feature_data in tensor_schema_data:
-            feature_data["feature_sources"] = [
-                TensorFeatureSource(source=FeatureSource[x["source"]], column=x["column"], index=x["index"])
-                for x in feature_data["feature_sources"]
-            ]
-            f_type = feature_data["feature_type"]
-            f_hint = feature_data["feature_hint"]
-            feature_data["feature_type"] = FeatureType[f_type] if f_type else None
-            feature_data["feature_hint"] = FeatureHint[f_hint] if f_hint else None
-            features_list.append(TensorFeatureInfo(**feature_data))
-        tokenizer_dict["init_args"]["tensor_schema"] = TensorSchema(features_list)
+        tokenizer_dict["init_args"]["tensor_schema"] = TensorSchema._create_object_by_args(tensor_schema_data)

         # Load encoder columns and rules
         types = list(FeatureHint) + list(FeatureSource)
@@ -447,7 +439,7 @@
             rule_data = rules_dict[rule]
             if rule_data["mapping"] and rule_data["is_int"]:
                 rule_data["mapping"] = {int(key): value for key, value in rule_data["mapping"].items()}
-
+            del rule_data["is_int"]

             tokenizer_dict["encoder"]["encoding_rules"][rule] = LabelEncodingRule(**rule_data)

@@ -478,31 +470,9 @@
             "allow_collect_to_master": self._allow_collect_to_master,
             "handle_unknown_rule": self._encoder._handle_unknown_rule,
             "default_value_rule": self._encoder._default_value_rule,
-            "tensor_schema":
+            "tensor_schema": self._tensor_schema._get_object_args(),
         }

-        # save tensor schema
-        for feature in list(self._tensor_schema.values()):
-            tokenizer_dict["init_args"]["tensor_schema"].append(
-                {
-                    "name": feature.name,
-                    "feature_type": feature.feature_type.name,
-                    "is_seq": feature.is_seq,
-                    "feature_hint": feature.feature_hint.name if feature.feature_hint else None,
-                    "feature_sources": [
-                        {"source": x.source.name, "column": x.column, "index": x.index}
-                        for x in feature.feature_sources
-                    ]
-                    if feature.feature_sources
-                    else None,
-                    "cardinality": feature.cardinality if feature.feature_type == FeatureType.CATEGORICAL else None,
-                    "embedding_dim": feature.embedding_dim
-                    if feature.feature_type == FeatureType.CATEGORICAL
-                    else None,
-                    "tensor_dim": feature.tensor_dim if feature.feature_type == FeatureType.NUMERICAL else None,
-                }
-            )
-
         # save DatasetLabelEncoder
         tokenizer_dict["encoder"] = {
             "features_columns": {key.name: value for key, value in self._encoder._features_columns.items()},
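How these pieces fit together (illustrative sketch): `save` now serializes the tensor schema through `TensorSchema._get_object_args()`, the JSON branch of `load` rebuilds it with `TensorSchema._create_object_by_args()`, and the added `**kwargs` lets the generic loader forward extra arguments. The fitted `tokenizer` below is assumed to exist, and the save-path behaviour mirrors the other `.replay` savers in this diff.

    from replay.data.nn import SequenceTokenizer
    from replay.utils.common import load_from_replay

    # `tokenizer` is assumed to be an already fitted SequenceTokenizer.
    tokenizer.save("tokenizer")

    # Direct load and generic load end up in the same classmethod; keyword
    # arguments such as use_pickle are passed through **kwargs.
    restored = SequenceTokenizer.load("tokenizer")
    restored_too = load_from_replay("tokenizer", use_pickle=False)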
replay/data/nn/sequential_dataset.py
CHANGED
@@ -1,7 +1,10 @@
 import abc
+import json
+from pathlib import Path
 from typing import Tuple, Union

 import numpy as np
+import pandas as pd
 import polars as pl
 from pandas import DataFrame as PandasDataFrame
 from polars import DataFrame as PolarsDataFrame
@@ -100,6 +103,23 @@ class SequentialDataset(abc.ABC):
         rhs_filtered = rhs.filter_by_query_id(common_queries)
         return lhs_filtered, rhs_filtered

+    def save(self, path: str) -> None:
+        base_path = Path(path).with_suffix(".replay").resolve()
+        base_path.mkdir(parents=True, exist_ok=True)
+
+        sequential_dict = {}
+        sequential_dict["_class_name"] = self.__class__.__name__
+        self._sequences.reset_index().to_json(base_path / "sequences.json")
+        sequential_dict["init_args"] = {
+            "tensor_schema": self._tensor_schema._get_object_args(),
+            "query_id_column": self._query_id_column,
+            "item_id_column": self._item_id_column,
+            "sequences_path": "sequences.json",
+        }
+
+        with open(base_path / "init_args.json", "w+") as file:
+            json.dump(sequential_dict, file)
+

 class PandasSequentialDataset(SequentialDataset):
     """
@@ -174,6 +194,25 @@ class PandasSequentialDataset(SequentialDataset):
             msg = "Tensor schema does not match with provided data frame"
             raise ValueError(msg)

+    @classmethod
+    def load(cls, path: str, **kwargs) -> "PandasSequentialDataset":
+        """
+        Method for loading PandasSequentialDataset object from `.replay` directory.
+        """
+        base_path = Path(path).with_suffix(".replay").resolve()
+        with open(base_path / "init_args.json", "r") as file:
+            sequential_dict = json.loads(file.read())
+
+        sequences = pd.read_json(base_path / sequential_dict["init_args"]["sequences_path"])
+        dataset = cls(
+            tensor_schema=TensorSchema._create_object_by_args(sequential_dict["init_args"]["tensor_schema"]),
+            query_id_column=sequential_dict["init_args"]["query_id_column"],
+            item_id_column=sequential_dict["init_args"]["item_id_column"],
+            sequences=sequences,
+        )
+
+        return dataset
+

 class PolarsSequentialDataset(PandasSequentialDataset):
     """
@@ -199,7 +238,7 @@ class PolarsSequentialDataset(PandasSequentialDataset):
         self._query_id_column = query_id_column
         self._item_id_column = item_id_column

-        self._sequences =
+        self._sequences = self._convert_polars_to_pandas(sequences)
         if self._sequences.index.name != query_id_column:
             self._sequences = self._sequences.set_index(query_id_column)

@@ -211,12 +250,47 @@
             tensor_schema=self._tensor_schema,
             query_id_column=self._query_id_column,
             item_id_column=self._item_id_column,
-            sequences=
+            sequences=self._convert_pandas_to_polars(filtered_sequences),
         )

+    def _convert_polars_to_pandas(self, df: PolarsDataFrame) -> PandasDataFrame:
+        pandas_df = PandasDataFrame(df.to_dict(as_series=False))
+
+        for column in pandas_df.select_dtypes(include="object").columns:
+            if isinstance(pandas_df[column].iloc[0], list):
+                pandas_df[column] = pandas_df[column].apply(lambda x: np.array(x))
+
+        return pandas_df
+
+    def _convert_pandas_to_polars(self, df: PandasDataFrame) -> PolarsDataFrame:
+        for column in df.select_dtypes(include="object").columns:
+            if isinstance(df[column].iloc[0], np.ndarray):
+                df[column] = df[column].apply(lambda x: x.tolist())
+
+        return pl.from_dict(df.to_dict("list"))
+
     @classmethod
     def _check_if_schema_matches_data(cls, tensor_schema: TensorSchema, data: PolarsDataFrame) -> None:
         for tensor_feature_name in tensor_schema:
             if tensor_feature_name not in data:
                 msg = "Tensor schema does not match with provided data frame"
                 raise ValueError(msg)
+
+    @classmethod
+    def load(cls, path: str, **kwargs) -> "PandasSequentialDataset":
+        """
+        Method for loading PandasSequentialDataset object from `.replay` directory.
+        """
+        base_path = Path(path).with_suffix(".replay").resolve()
+        with open(base_path / "init_args.json", "r") as file:
+            sequential_dict = json.loads(file.read())
+
+        sequences = pl.DataFrame(pd.read_json(base_path / sequential_dict["init_args"]["sequences_path"]))
+        dataset = cls(
+            tensor_schema=TensorSchema._create_object_by_args(sequential_dict["init_args"]["tensor_schema"]),
+            query_id_column=sequential_dict["init_args"]["query_id_column"],
+            item_id_column=sequential_dict["init_args"]["item_id_column"],
+            sequences=sequences,
+        )
+
+        return dataset
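With the `save`/`load` pair above, sequential datasets plug into the generic persistence helpers from `replay.utils.common` (also extended in this diff). A small sketch, assuming `seq_dataset` is an existing `PandasSequentialDataset`, for example the output of `SequenceTokenizer.transform`:

    from replay.utils.common import load_from_replay, save_to_replay

    # Writes "train_sequences.replay/" with init_args.json and sequences.json.
    save_to_replay(seq_dataset, "train_sequences")

    # init_args.json records the class name, so the generic loader dispatches
    # to PandasSequentialDataset.load (or PolarsSequentialDataset.load).
    restored = load_from_replay("train_sequences")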
replay/preprocessing/filters.py
CHANGED
@@ -5,6 +5,8 @@ from abc import ABC, abstractmethod
 from datetime import datetime, timedelta
 from typing import Callable, Optional, Tuple, Union

+import numpy as np
+import pandas as pd
 import polars as pl

 from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, PandasDataFrame, PolarsDataFrame, SparkDataFrame
@@ -357,7 +359,7 @@ class NumInteractionsFilter(_BaseFilter):
    ...     "2020-02-01", "2020-01-01 00:04:15",
    ...     "2020-01-02 00:04:14", "2020-01-05 23:59:59"]},
    ... )
-    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"])
+    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"], format="ISO8601")
    >>> log_sp = convert2spark(log_pd)
    >>> log_sp.show()
    +-------+-------+------+-------------------+
@@ -499,7 +501,7 @@ class EntityDaysFilter(_BaseFilter):
    ...     "2020-02-01", "2020-01-01 00:04:15",
    ...     "2020-01-02 00:04:14", "2020-01-05 23:59:59"]},
    ... )
-    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"])
+    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"], format="ISO8601")
    >>> log_sp = convert2spark(log_pd)
    >>> log_sp.orderBy('user_id', 'item_id').show()
    +-------+-------+------+-------------------+
@@ -638,7 +640,7 @@ class GlobalDaysFilter(_BaseFilter):
    ...     "2020-02-01", "2020-01-01 00:04:15",
    ...     "2020-01-02 00:04:14", "2020-01-05 23:59:59"]},
    ... )
-    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"])
+    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"], format="ISO8601")
    >>> log_sp = convert2spark(log_pd)
    >>> log_sp.show()
    +-------+-------+------+-------------------+
@@ -740,7 +742,7 @@ class TimePeriodFilter(_BaseFilter):
    ...     "2020-02-01", "2020-01-01 00:04:15",
    ...     "2020-01-02 00:04:14", "2020-01-05 23:59:59"]},
    ... )
-    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"])
+    >>> log_pd["timestamp"] = pd.to_datetime(log_pd["timestamp"], format="ISO8601")
    >>> log_sp = convert2spark(log_pd)
    >>> log_sp.show()
    +-------+-------+------+-------------------+
@@ -823,3 +825,166 @@ class TimePeriodFilter(_BaseFilter):
         return interactions.filter(
             pl.col(self.timestamp_column).is_between(self.start_date, self.end_date, closed="left")
         )
+
+
+class QuantileItemsFilter(_BaseFilter):
+    """
+    Filter is aimed on undersampling the interactions dataset.
+
+    Filter algorithm performs undersampling by removing `items_proportion` of interactions
+    for each items counts that exceeds the `alpha_quantile` value in distribution. Filter firstly
+    removes popular items (items that have most interactions). Filter also keeps the original
+    relation of items popularity among each other by removing interactions only in range of
+    current item count and quantile count (specified by `alpha_quantile`).
+
+    >>> import pandas as pd
+    >>> from replay.utils.spark_utils import convert2spark
+    >>> log_pd = pd.DataFrame({
+    ...     "user_id": [0, 0, 1, 2, 2, 2, 2],
+    ...     "item_id": [0, 2, 1, 1, 2, 2, 2]
+    ... })
+    >>> log_spark = convert2spark(log_pd)
+    >>> log_spark.show()
+    +-------+-------+
+    |user_id|item_id|
+    +-------+-------+
+    |      0|      0|
+    |      0|      2|
+    |      1|      1|
+    |      2|      1|
+    |      2|      2|
+    |      2|      2|
+    |      2|      2|
+    +-------+-------+
+    <BLANKLINE>
+
+    >>> QuantileItemsFilter(query_column="user_id").transform(log_spark).show()
+    +-------+-------+
+    |user_id|item_id|
+    +-------+-------+
+    |      0|      0|
+    |      1|      1|
+    |      2|      1|
+    |      2|      2|
+    |      2|      2|
+    |      0|      2|
+    +-------+-------+
+    <BLANKLINE>
+    """
+
+    def __init__(
+        self,
+        alpha_quantile: float = 0.99,
+        items_proportion: float = 0.5,
+        query_column: str = "query_id",
+        item_column: str = "item_id",
+    ) -> None:
+        """
+        :param alpha_quantile: Quantile value of items counts distribution to keep unchanged.
+            Every items count that exceeds this value will be undersampled.
+            Default: ``0.99``.
+        :param items_proportion: proportion of items counts to remove for items that
+            exceeds `alpha_quantile` value in range of current item count and quantile count
+            to make sure we keep original relation between items unchanged.
+            Default: ``0.5``.
+        :param query_column: query column name.
+            Default: ``query_id``.
+        :param item_column: item column name.
+            Default: ``item_id``.
+        """
+        if not 0 < alpha_quantile < 1:
+            msg = "`alpha_quantile` value must be in (0, 1)"
+            raise ValueError(msg)
+        if not 0 < items_proportion < 1:
+            msg = "`items_proportion` value must be in (0, 1)"
+            raise ValueError(msg)
+
+        self.alpha_quantile = alpha_quantile
+        self.items_proportion = items_proportion
+        self.query_column = query_column
+        self.item_column = item_column
+
+    def _filter_pandas(self, df: pd.DataFrame):
+        items_distribution = df.groupby(self.item_column).size().reset_index().rename(columns={0: "counts"})
+        users_distribution = df.groupby(self.query_column).size().reset_index().rename(columns={0: "counts"})
+        count_threshold = items_distribution.loc[:, "counts"].quantile(self.alpha_quantile, interpolation="midpoint")
+        df_with_counts = df.merge(items_distribution, how="left", on=self.item_column).merge(
+            users_distribution, how="left", on=self.query_column, suffixes=["_items", "_users"]
+        )
+        long_tail = df_with_counts.loc[df_with_counts["counts_items"] <= count_threshold]
+        short_tail = df_with_counts.loc[df_with_counts["counts_items"] > count_threshold]
+        short_tail["num_items_to_delete"] = self.items_proportion * (
+            short_tail["counts_items"] - long_tail["counts_items"].max()
+        )
+        short_tail["num_items_to_delete"] = short_tail["num_items_to_delete"].astype("int")
+        short_tail = short_tail.sort_values("counts_users", ascending=False)
+
+        def get_mask(x):
+            mask = np.ones_like(x)
+            threshold = x.iloc[0]
+            mask[:threshold] = 0
+            return mask
+
+        mask = short_tail.groupby(self.item_column)["num_items_to_delete"].transform(get_mask).astype(bool)
+        return pd.concat([long_tail[df.columns], short_tail.loc[mask][df.columns]])
+
+    def _filter_polars(self, df: pl.DataFrame):
+        items_distribution = df.group_by(self.item_column).len()
+        users_distribution = df.group_by(self.query_column).len()
+        count_threshold = items_distribution.select("len").quantile(self.alpha_quantile, "midpoint")["len"][0]
+        df_with_counts = (
+            df.join(items_distribution, how="left", on=self.item_column).join(
+                users_distribution, how="left", on=self.query_column
+            )
+        ).rename({"len": "counts_items", "len_right": "counts_users"})
+        long_tail = df_with_counts.filter(pl.col("counts_items") <= count_threshold)
+        short_tail = df_with_counts.filter(pl.col("counts_items") > count_threshold)
+        max_long_tail_count = long_tail["counts_items"].max()
+        items_to_delete = (
+            short_tail.select(
+                self.query_column,
+                self.item_column,
+                self.items_proportion * (pl.col("counts_items") - max_long_tail_count),
+            )
+            .with_columns(pl.col("literal").cast(pl.Int64).alias("num_items_to_delete"))
+            .select(self.item_column, "num_items_to_delete")
+            .unique(maintain_order=True)
+        )
+        short_tail = short_tail.join(items_to_delete, how="left", on=self.item_column).sort(
+            "counts_users", descending=True
+        )
+        short_tail = short_tail.with_columns(index=pl.int_range(short_tail.shape[0]))
+        grouped = short_tail.group_by(self.item_column, maintain_order=True).agg(
+            pl.col("index"), pl.col("num_items_to_delete")
+        )
+        grouped = grouped.with_columns(
+            pl.col("num_items_to_delete").list.get(0),
+            (pl.col("index").list.len() - pl.col("num_items_to_delete").list.get(0)).alias("tail"),
+        )
+        grouped = grouped.with_columns(pl.col("index").list.tail(pl.col("tail")))
+        grouped = grouped.explode("index").select("index")
+        short_tail = grouped.join(short_tail, how="left", on="index")
+        return pl.concat([long_tail.select(df.columns), short_tail.select(df.columns)])
+
+    def _filter_spark(self, df: SparkDataFrame):
+        items_distribution = df.groupBy(self.item_column).agg(sf.count(self.query_column).alias("counts_items"))
+        users_distribution = df.groupBy(self.query_column).agg(sf.count(self.item_column).alias("counts_users"))
+        count_threshold = items_distribution.toPandas().loc[:, "counts_items"].quantile(self.alpha_quantile, "midpoint")
+        df_with_counts = df.join(items_distribution, on=self.item_column).join(users_distribution, on=self.query_column)
+        long_tail = df_with_counts.filter(sf.col("counts_items") <= count_threshold)
+        short_tail = df_with_counts.filter(sf.col("counts_items") > count_threshold)
+        max_long_tail_count = long_tail.agg({"counts_items": "max"}).collect()[0][0]
+        items_to_delete = (
+            short_tail.withColumn(
+                "num_items_to_delete",
+                (self.items_proportion * (sf.col("counts_items") - max_long_tail_count)).cast("int"),
+            )
+            .select(self.item_column, "num_items_to_delete")
+            .distinct()
+        )
+        short_tail = short_tail.join(items_to_delete, on=self.item_column, how="left")
+        short_tail = short_tail.withColumn(
+            "index", sf.row_number().over(Window.partitionBy(self.item_column).orderBy(sf.col("counts_users").desc()))
+        )
+        short_tail = short_tail.filter(sf.col("index") > sf.col("num_items_to_delete"))
+        return long_tail.select(df.columns).union(short_tail.select(df.columns))
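The new `QuantileItemsFilter` ships pandas, polars and Spark implementations; the docstring example above is Spark-based, so here is the same toy log filtered as a plain pandas DataFrame (a sketch, assuming the usual `_BaseFilter.transform` dispatch; result row order is not guaranteed):

    import pandas as pd

    from replay.preprocessing.filters import QuantileItemsFilter

    log = pd.DataFrame(
        {
            "user_id": [0, 0, 1, 2, 2, 2, 2],
            "item_id": [0, 2, 1, 1, 2, 2, 2],
        }
    )

    # Only item 2 exceeds the 0.99 quantile of item counts, so
    # items_proportion * (its count - max long-tail count) = 0.5 * (4 - 2) = 1
    # of its interactions is dropped, leaving 6 rows.
    filtered = QuantileItemsFilter(query_column="user_id").transform(log)
    print(filtered.sort_values(["user_id", "item_id"]))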
replay/utils/common.py
CHANGED
@@ -1,7 +1,12 @@
+import functools
+import inspect
 import json
 from pathlib import Path
-from typing import Union
+from typing import Any, Callable, Union

+from polars import from_pandas as pl_from_pandas
+
+from replay.data.dataset import Dataset
 from replay.splitters import (
     ColdUserRandomSplitter,
     KFolds,
@@ -12,7 +17,16 @@ from replay.splitters import (
     TimeSplitter,
     TwoStageSplitter,
 )
-from replay.utils import
+from replay.utils import (
+    TORCH_AVAILABLE,
+    PandasDataFrame,
+    PolarsDataFrame,
+    SparkDataFrame,
+)
+from replay.utils.spark_utils import (
+    convert2spark as pandas_to_spark,
+    spark_to_pandas,
+)

 SavableObject = Union[
     ColdUserRandomSplitter,
@@ -23,10 +37,11 @@ SavableObject = Union[
     RatioSplitter,
     TimeSplitter,
     TwoStageSplitter,
+    Dataset,
 ]

 if TORCH_AVAILABLE:
-    from replay.data.nn import SequenceTokenizer
+    from replay.data.nn import PandasSequentialDataset, PolarsSequentialDataset, SequenceTokenizer

     SavableObject = Union[
         ColdUserRandomSplitter,
@@ -38,6 +53,8 @@ if TORCH_AVAILABLE:
         TimeSplitter,
         TwoStageSplitter,
         SequenceTokenizer,
+        PandasSequentialDataset,
+        PolarsSequentialDataset,
     ]


@@ -50,7 +67,7 @@ def save_to_replay(obj: SavableObject, path: Union[str, Path]) -> None:
     obj.save(path)


-def load_from_replay(path: Union[str, Path]) -> SavableObject:
+def load_from_replay(path: Union[str, Path], **kwargs) -> SavableObject:
     """
     General function to load RePlay models, splitters and tokenizer.

@@ -60,6 +77,91 @@ def load_from_replay(path: Union[str, Path]) -> SavableObject:
     with open(path / "init_args.json", "r") as file:
         class_name = json.loads(file.read())["_class_name"]
     obj_type = globals()[class_name]
-    obj = obj_type.load(path)
+    obj = obj_type.load(path, **kwargs)

     return obj
+
+
+def _check_if_dataframe(var: Any):
+    if not isinstance(var, (SparkDataFrame, PolarsDataFrame, PandasDataFrame)):
+        msg = f"Object of type {type(var)} is not a dataframe of known type (can be pandas|spark|polars)"
+        raise ValueError(msg)
+
+
+def check_if_dataframe(*args_to_check: str) -> Callable[..., Any]:
+    def decorator_func(func: Callable[..., Any]) -> Callable[..., Any]:
+        @functools.wraps(func)
+        def wrap_func(*args: Any, **kwargs: Any) -> Any:
+            extended_kwargs = {}
+            extended_kwargs.update(kwargs)
+            extended_kwargs.update(dict(zip(inspect.signature(func).parameters.keys(), args)))
+            # add default param values to dict with arguments
+            extended_kwargs.update(
+                {
+                    x.name: x.default
+                    for x in inspect.signature(func).parameters.values()
+                    if x.name not in extended_kwargs and x.default is not x.empty
+                }
+            )
+            vals_to_check = [extended_kwargs[_arg] for _arg in args_to_check]
+            for val in vals_to_check:
+                _check_if_dataframe(val)
+            return func(*args, **kwargs)
+
+        return wrap_func
+
+    return decorator_func
+
+
+@check_if_dataframe("data")
+def convert2pandas(
+    data: Union[SparkDataFrame, PolarsDataFrame, PandasDataFrame], allow_collect_to_master: bool = False
+) -> PandasDataFrame:
+    """
+    Convert the spark|polars DataFrame to a pandas.DataFrame.
+    Returns unchanged dataframe if the input is already of type pandas.DataFrame.
+
+    :param data: The dataframe to convert. Can be polars|spark|pandas DataFrame.
+    :param allow_collect_to_master: If set to False (default) raises a warning
+        about collecting parallelized data to the master node.
+    """
+    if isinstance(data, PandasDataFrame):
+        return data
+    if isinstance(data, PolarsDataFrame):
+        return data.to_pandas()
+    if isinstance(data, SparkDataFrame):
+        return spark_to_pandas(data, allow_collect_to_master, from_constructor=False)
+
+
+@check_if_dataframe("data")
+def convert2polars(
+    data: Union[SparkDataFrame, PolarsDataFrame, PandasDataFrame], allow_collect_to_master: bool = False
+) -> PolarsDataFrame:
+    """
+    Convert the spark|pandas DataFrame to a polars.DataFrame.
+    Returns unchanged dataframe if the input is already of type polars.DataFrame.
+
+    :param data: The dataframe to convert. Can be spark|pandas|polars DataFrame.
+    :param allow_collect_to_master: If set to False (default) raises a warning
+        about collecting parallelized data to the master node.
+    """
+    if isinstance(data, PandasDataFrame):
+        return pl_from_pandas(data)
+    if isinstance(data, PolarsDataFrame):
+        return data
+    if isinstance(data, SparkDataFrame):
+        return pl_from_pandas(spark_to_pandas(data, allow_collect_to_master, from_constructor=False))
+
+
+@check_if_dataframe("data")
+def convert2spark(data: Union[SparkDataFrame, PolarsDataFrame, PandasDataFrame]) -> SparkDataFrame:
+    """
+    Convert the pandas|polars DataFrame to a pysaprk.sql.DataFrame.
+    Returns unchanged dataframe if the input is already of type pysaprk.sql.DataFrame.
+
+    :param data: The dataframe to convert. Can be pandas|polars|spark Datarame.
+    """
+    if isinstance(data, (PandasDataFrame, SparkDataFrame)):
+        return pandas_to_spark(data)
+    if isinstance(data, PolarsDataFrame):
+        return pandas_to_spark(data.to_pandas())
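A short sketch of the new conversion helpers and the argument-checking decorator (illustrative; `score_items` is a made-up function, not part of the library):

    import pandas as pd

    from replay.utils.common import check_if_dataframe, convert2pandas, convert2polars

    interactions = pd.DataFrame({"query_id": [0, 1], "item_id": [1, 2]})

    # pandas -> polars -> pandas round-trip; inputs already in the target type
    # are returned unchanged, Spark inputs are collected via spark_to_pandas.
    polars_df = convert2polars(interactions)
    pandas_df = convert2pandas(polars_df)

    # The decorator checks, by parameter name, that the selected arguments are
    # pandas, polars or Spark dataframes before the wrapped function runs.
    @check_if_dataframe("data")
    def score_items(data, top_k: int = 10):
        return data.head(top_k)

    score_items(interactions)       # passes the check
    # score_items(data=[1, 2, 3])   # would raise ValueError: unknown dataframe type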
replay/utils/spark_utils.py
CHANGED
@@ -33,7 +33,9 @@ class SparkCollectToMasterWarning(Warning):  # pragma: no cover
     """


-def spark_to_pandas(
+def spark_to_pandas(
+    data: SparkDataFrame, allow_collect_to_master: bool = False, from_constructor: bool = True
+) -> pd.DataFrame:  # pragma: no cover
     """
     Convert Spark DataFrame to Pandas DataFrame.

@@ -42,10 +44,15 @@ def spark_to_pandas(data: SparkDataFrame, allow_collect_to_master: bool = False)

     :returns: Converted Pandas DataFrame.
     """
+    warn_msg = "Spark Data Frame is collected to master node, this may lead to OOM exception for larger dataset. "
+    if from_constructor:
+        _msg = "To remove this warning set allow_collect_to_master=True in the recommender constructor."
+    else:
+        _msg = "To remove this warning set allow_collect_to_master=True."
+    warn_msg += _msg
     if not allow_collect_to_master:
         warnings.warn(
-
-            "To remove this warning set allow_collect_to_master=True in the recommender constructor.",
+            warn_msg,
             SparkCollectToMasterWarning,
         )
     return data.toPandas()
@@ -169,7 +176,7 @@ if PYSPARK_AVAILABLE:
        <BLANKLINE>
        >>> output_data = input_data.select(vector_dot("one", "two").alias("dot"))
        >>> output_data.schema
-        StructType(
+        StructType([StructField('dot', DoubleType(), True)])
        >>> output_data.show()
        +----+
        | dot|
@@ -207,7 +214,7 @@ if PYSPARK_AVAILABLE:
        <BLANKLINE>
        >>> output_data = input_data.select(vector_mult("one", "two").alias("mult"))
        >>> output_data.schema
-        StructType(
+        StructType([StructField('mult', VectorUDT(), True)])
        >>> output_data.show()
        +---------+
        |     mult|
@@ -244,7 +251,7 @@ if PYSPARK_AVAILABLE:
        <BLANKLINE>
        >>> output_data = input_data.select(array_mult("one", "two").alias("mult"))
        >>> output_data.schema
-        StructType(
+        StructType([StructField('mult', ArrayType(DoubleType(), True), True)])
        >>> output_data.show()
        +----------+
        |      mult|
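The `from_constructor` flag only changes the wording of the collect-to-master warning, which is how the new `convert2pandas`/`convert2polars` helpers reuse `spark_to_pandas` without referring to a recommender constructor. A sketch assuming a running Spark session:

    from replay.utils.session_handler import get_spark_session
    from replay.utils.spark_utils import spark_to_pandas

    spark = get_spark_session()
    sdf = spark.createDataFrame([(0, 1), (1, 2)], schema=["query_id", "item_id"])

    # Default wording: "... set allow_collect_to_master=True in the recommender constructor."
    pdf = spark_to_pandas(sdf)

    # Wording used by the dataframe conversion helpers: "... set allow_collect_to_master=True."
    pdf = spark_to_pandas(sdf, allow_collect_to_master=False, from_constructor=False)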
{replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: replay-rec
-Version: 0.17.
+Version: 0.17.1rc0
 Summary: RecSys Library
 Home-page: https://sb-ai-lab.github.io/RePlay/
 License: Apache-2.0
@@ -32,11 +32,11 @@ Requires-Dist: nmslib (==2.1.1)
 Requires-Dist: numba (>=0.50)
 Requires-Dist: numpy (>=1.20.0)
 Requires-Dist: optuna (>=3.2.0,<3.3.0)
-Requires-Dist: pandas (>=1.3.5
+Requires-Dist: pandas (>=1.3.5,<=2.2.2)
 Requires-Dist: polars (>=0.20.7,<0.21.0)
 Requires-Dist: psutil (>=5.9.5,<5.10.0)
 Requires-Dist: pyarrow (>=12.0.1)
-Requires-Dist: pyspark (>=3.0,<3.
+Requires-Dist: pyspark (>=3.0,<3.5) ; extra == "spark" or extra == "all"
 Requires-Dist: pytorch-ranger (>=0.1.1,<0.2.0) ; extra == "torch" or extra == "all"
 Requires-Dist: sb-obp (>=0.5.7,<0.6.0)
 Requires-Dist: scikit-learn (>=1.0.2,<2.0.0)
{replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-replay/__init__.py,sha256=
+replay/__init__.py,sha256=_PQ2zFERSGjgeThzFv3t6MPODgutry1eR82biGhB98o,54
 replay/data/__init__.py,sha256=g5bKRyF76QL_BqlED-31RnS8pBdcyj9loMsx5vAG_0E,301
-replay/data/dataset.py,sha256=
+replay/data/dataset.py,sha256=cSStvCqIc6WAJNtbmsxncSpcQZ1KfULMsrmf_V0UdPw,29490
 replay/data/dataset_utils/__init__.py,sha256=9wUvG8ZwGUvuzLU4zQI5FDcH0WVVo5YLN2ey3DterP0,55
 replay/data/dataset_utils/dataset_label_encoder.py,sha256=TEx2zLw5rJdIz1SRBEznyVv5x_Cs7o6QQbzMk-M1LU0,9598
 replay/data/nn/__init__.py,sha256=WxLsi4rgOuuvGYHN49xBPxP2Srhqf3NYgfBDVH-ZvBo,1122
-replay/data/nn/schema.py,sha256=
-replay/data/nn/sequence_tokenizer.py,sha256=
-replay/data/nn/sequential_dataset.py,sha256=
+replay/data/nn/schema.py,sha256=pO4N7RgmgrqfD1-2d95OTeihKHTZ-5y2BG7CX_wBFi4,16198
+replay/data/nn/sequence_tokenizer.py,sha256=Ambrp3CMOp3JP68PiwmVh0m-_zNXiWzxxVreHkEwOyY,32592
+replay/data/nn/sequential_dataset.py,sha256=jCWxC0Pm1eQ5p8Y6_Bmg4fSEvPaecLrqz1iaWzaICdI,11014
 replay/data/nn/torch_sequential_dataset.py,sha256=BqrK_PtkhpsaY1zRIWGk4EgwPL31a7IWCc0hLDuwDQc,10984
 replay/data/nn/utils.py,sha256=YKE9gkIHZDDiwv4THqOWL4PzsdOujnPuM97v79Mwq0E,2769
 replay/data/schema.py,sha256=F_cv6sYb6l23yuX5xWnbqoJ9oSeUT2NpIM19u8Lf2jA,15606
@@ -148,14 +148,14 @@ replay/optimization/__init__.py,sha256=az6U10rF7X6rPRUUPwLyiM1WFNJ_6kl0imA5xLVWF
 replay/optimization/optuna_objective.py,sha256=Z-8X0_FT3BicVWj0UhxoLrvZAck3Dhn7jHDGo0i0hxA,7653
 replay/preprocessing/__init__.py,sha256=TtBysFqYeDy4kZAEnWEaNSwPvbffYdfMkEs71YG51fM,411
 replay/preprocessing/converter.py,sha256=DczqsVLrwFi6EFhK2HR8rGiIxGCwXeY7QNgWorjA41g,4390
-replay/preprocessing/filters.py,sha256=
+replay/preprocessing/filters.py,sha256=wsXWQoZ-2aAecunLkaTxeLWi5ow4e3FAGcElx0iNx0w,41669
 replay/preprocessing/history_based_fp.py,sha256=tfgKJPKm53LSNqM6VmMXYsVrRDc-rP1Tbzn8s3mbziQ,18751
 replay/preprocessing/label_encoder.py,sha256=MLBavPD-dB644as0E9ZJSE9-8QxGCB_IHek1w3xtqDI,27040
 replay/preprocessing/sessionizer.py,sha256=G6i0K3FwqtweRxvcSYraJ-tBWAT2HnV-bWHHlIZJF-s,12217
 replay/scenarios/__init__.py,sha256=kw2wRkPPinw0IBA20D83XQ3xeSudk3KuYAAA1Wdr8xY,93
 replay/scenarios/fallback.py,sha256=EeBmIR-5igzKR2m55bQRFyhxTkpJez6ZkCW449n8hWs,7130
 replay/splitters/__init__.py,sha256=DnqVMelrzLwR8fGQgcWN_8FipGs8T4XGSPOMW-L_x2g,454
-replay/splitters/base_splitter.py,sha256=
+replay/splitters/base_splitter.py,sha256=hj9_GYDWllzv3XnxN6WHu1JKRRVjXo77vZEOLbF9v-s,7761
 replay/splitters/cold_user_random_splitter.py,sha256=gVwBVdn_0IOaLGT_UzJoS9AMaPhelZy-FpC5JQS1PhA,4136
 replay/splitters/k_folds.py,sha256=WH02_DP18A2ae893ysonmfLPB56_i1ETllTAwaCYekg,6218
 replay/splitters/last_n_splitter.py,sha256=r9kdq2JPi508C9ywjwc68an-iq27KsigMfHWLz0YohE,15346
@@ -165,16 +165,16 @@ replay/splitters/ratio_splitter.py,sha256=8zvuCn16Icc4ntQPKXJ5ArAWuJzCZ9NHZtgWct
 replay/splitters/time_splitter.py,sha256=iXhuafjBx7dWyJSy-TEVy1IUQBwMpA1gAiF4-GtRe2g,9031
 replay/splitters/two_stage_splitter.py,sha256=PWozxjjgjrVzdz6Sm9dcDTeH0bOA24reFzkk_N_TgbQ,17734
 replay/utils/__init__.py,sha256=vDJgOWq81fbBs-QO4ZDpdqR4KDyO1kMOOxBRi-5Gp7E,253
-replay/utils/common.py,sha256=
+replay/utils/common.py,sha256=s4Pro3QCkPeVBsj-s0vrbhd_pkJD-_-2M_sIguxGzQQ,5411
 replay/utils/dataframe_bucketizer.py,sha256=LipmBBQkdkLGroZpbP9i7qvTombLdMxo2dUUys1m5OY,3748
 replay/utils/distributions.py,sha256=kGGq2KzQZ-yhTuw_vtOsKFXVpXUOQ2l4aIFBcaDufZ8,1202
 replay/utils/model_handler.py,sha256=V-mHDh8_UexjVSsMBBRA9yrjS_5MPHwYOwv_UrI-Zfs,6466
 replay/utils/session_handler.py,sha256=ijTvDSNAe1D9R1e-dhtd-r80tFNiIBsFdWZLgw-gLEo,5153
-replay/utils/spark_utils.py,sha256=
+replay/utils/spark_utils.py,sha256=k5lUFM2C9QZKQON3dqhgfswyUF4tsgJOn0U2wCKimqM,26901
 replay/utils/time.py,sha256=J8asoQBytPcNw-BLGADYIsKeWhIoN1H5hKiX9t2AMqo,9376
 replay/utils/types.py,sha256=5sw0A7NG4ZgQKdWORnBy0wBZ5F98sP_Ju8SKQ6zbDS4,651
-replay_rec-0.17.
-replay_rec-0.17.
-replay_rec-0.17.
-replay_rec-0.17.
-replay_rec-0.17.
+replay_rec-0.17.1rc0.dist-info/LICENSE,sha256=rPmcA7UrHxBChEAAlJyE24qUWKKl9yLQXxFsKeg_LX4,11344
+replay_rec-0.17.1rc0.dist-info/METADATA,sha256=FgZduBS6AVq1qSNahVyNFCJILLPdVLVosbxjUxN7WkQ,10890
+replay_rec-0.17.1rc0.dist-info/NOTICE,sha256=k0bo4KHiHLRax5K3XKTTrf2Fi8V91mJ-R3FMdh6Reg0,2002
+replay_rec-0.17.1rc0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+replay_rec-0.17.1rc0.dist-info/RECORD,,
{replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/LICENSE
File without changes
{replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/NOTICE
File without changes
{replay_rec-0.17.0rc0.dist-info → replay_rec-0.17.1rc0.dist-info}/WHEEL
File without changes