replay-rec 0.19.0-py3-none-any.whl → 0.20.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +6 -2
- replay/data/dataset.py +9 -9
- replay/data/nn/__init__.py +6 -6
- replay/data/nn/sequence_tokenizer.py +44 -38
- replay/data/nn/sequential_dataset.py +13 -8
- replay/data/nn/torch_sequential_dataset.py +14 -13
- replay/data/nn/utils.py +1 -1
- replay/metrics/base_metric.py +1 -1
- replay/metrics/coverage.py +7 -11
- replay/metrics/experiment.py +3 -3
- replay/metrics/offline_metrics.py +2 -2
- replay/models/__init__.py +19 -0
- replay/models/association_rules.py +1 -4
- replay/models/base_neighbour_rec.py +6 -9
- replay/models/base_rec.py +44 -293
- replay/models/cat_pop_rec.py +2 -1
- replay/models/common.py +69 -0
- replay/models/extensions/ann/ann_mixin.py +30 -25
- replay/models/extensions/ann/index_builders/driver_hnswlib_index_builder.py +1 -1
- replay/models/extensions/ann/utils.py +4 -3
- replay/models/knn.py +18 -17
- replay/models/nn/sequential/bert4rec/dataset.py +1 -1
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +2 -2
- replay/models/nn/sequential/compiled/__init__.py +10 -0
- replay/models/nn/sequential/compiled/base_compiled_model.py +3 -1
- replay/models/nn/sequential/compiled/bert4rec_compiled.py +11 -2
- replay/models/nn/sequential/compiled/sasrec_compiled.py +5 -1
- replay/models/nn/sequential/sasrec/dataset.py +1 -1
- replay/models/nn/sequential/sasrec/model.py +1 -1
- replay/models/optimization/__init__.py +14 -0
- replay/models/optimization/optuna_mixin.py +279 -0
- replay/{optimization → models/optimization}/optuna_objective.py +13 -15
- replay/models/slim.py +2 -4
- replay/models/word2vec.py +7 -12
- replay/preprocessing/discretizer.py +1 -2
- replay/preprocessing/history_based_fp.py +1 -1
- replay/preprocessing/label_encoder.py +1 -1
- replay/splitters/cold_user_random_splitter.py +13 -7
- replay/splitters/last_n_splitter.py +17 -10
- replay/utils/__init__.py +6 -2
- replay/utils/common.py +4 -2
- replay/utils/model_handler.py +11 -31
- replay/utils/session_handler.py +2 -2
- replay/utils/spark_utils.py +2 -2
- replay/utils/types.py +28 -18
- replay/utils/warnings.py +26 -0
- {replay_rec-0.19.0.dist-info → replay_rec-0.20.0.dist-info}/METADATA +56 -32
- {replay_rec-0.19.0.dist-info → replay_rec-0.20.0.dist-info}/RECORD +51 -47
- {replay_rec-0.19.0.dist-info → replay_rec-0.20.0.dist-info}/WHEEL +1 -1
- replay_rec-0.20.0.dist-info/licenses/NOTICE +41 -0
- replay/optimization/__init__.py +0 -5
- {replay_rec-0.19.0.dist-info → replay_rec-0.20.0.dist-info/licenses}/LICENSE +0 -0
@@ -21,12 +21,12 @@ from replay.utils import (
     PandasDataFrame,
     PolarsDataFrame,
     SparkDataFrame,
-    get_spark_session,
 )
 
 if PYSPARK_AVAILABLE:
     from pyspark.sql import Window, functions as sf  # noqa: I001
     from pyspark.sql.types import LongType, IntegerType, ArrayType
+    from replay.utils.session_handler import get_spark_session
 
 HandleUnknownStrategies = Literal["error", "use_default_value", "drop"]
 
replay/splitters/cold_user_random_splitter.py
CHANGED

@@ -38,12 +38,16 @@ class ColdUserRandomSplitter(Splitter):
         item_column: Optional[str] = "item_id",
     ):
         """
-        :param test_size:
-
-        :param
-
-        :param
-
+        :param test_size: The proportion of users to allocate to the test set.
+            Must be a float between 0.0 and 1.0.
+        :param drop_cold_items: Drop items from test DataFrame
+            which are not in train DataFrame, default: False.
+        :param seed: Seed for the random number generator to ensure
+            reproducibility of the split, default: None.
+        :param query_column: Name of query interaction column.
+            default: ``query_id``.
+        :param item_column: Name of item interaction column.
+            default: ``item_id``.
         """
         super().__init__(
             drop_cold_items=drop_cold_items,
@@ -81,7 +85,9 @@ class ColdUserRandomSplitter(Splitter):
             seed=self.seed,
         )
         interactions = interactions.join(
-            train_users.withColumn("is_test", sf.lit(False)),
+            train_users.withColumn("is_test", sf.lit(False)),
+            on=self.query_column,
+            how="left",
         ).na.fill({"is_test": True})
 
         train = interactions.filter(~sf.col("is_test")).drop("is_test")
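For context, a minimal usage sketch of the updated splitter. It is illustrative only: it assumes the standard `Splitter.split(interactions)` interface and the default column names documented in the new docstring, and the toy DataFrame is invented for the example.

```python
import pandas as pd

from replay.splitters import ColdUserRandomSplitter

# Toy interaction log using the default column names from the docstring above.
interactions = pd.DataFrame(
    {
        "query_id": [1, 1, 2, 2, 3, 3],
        "item_id": [10, 11, 10, 12, 11, 13],
        "timestamp": pd.date_range("2024-01-01", periods=6, freq="D"),
    }
)

# Roughly 25% of users (with all of their interactions) go to the test split.
splitter = ColdUserRandomSplitter(
    test_size=0.25,
    drop_cold_items=False,
    seed=42,
    query_column="query_id",
    item_column="item_id",
)
train, test = splitter.split(interactions)
```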
replay/splitters/last_n_splitter.py
CHANGED

@@ -4,7 +4,13 @@ import numpy as np
 import pandas as pd
 import polars as pl
 
-from replay.utils import
+from replay.utils import (
+    PYSPARK_AVAILABLE,
+    DataFrameLike,
+    PandasDataFrame,
+    PolarsDataFrame,
+    SparkDataFrame,
+)
 
 from .base_splitter import Splitter
 
@@ -118,14 +124,12 @@ class LastNSplitter(Splitter):
         session_id_processing_strategy: str = "test",
     ):
         """
-        :param N:
+        :param N: Number of last interactions or size of the time window in seconds
         :param divide_column: Name of column for dividing
             in dataframe, default: ``query_id``.
-        :param time_column_format: Format of
-
-            If
-            If time_column has already transformed into unix_timestamp type,
-            then you can omit this parameter.
+        :param time_column_format: Format of the timestamp column,
+            used for converting string dates to a numerical timestamp when strategy is 'timedelta'.
+            If the column is already a datetime object or a numerical timestamp, this parameter is ignored.
             default: ``yyyy-MM-dd HH:mm:ss``
         :param strategy: Defines the type of data splitting.
             Must be ``interactions`` or ``timedelta``.
@@ -223,7 +227,8 @@ class LastNSplitter(Splitter):
         time_column_type = dict(interactions.dtypes)[self.timestamp_column]
         if time_column_type == "date":
             interactions = interactions.withColumn(
-                self.timestamp_column,
+                self.timestamp_column,
+                sf.unix_timestamp(self.timestamp_column, self.time_column_format),
             )
 
         return interactions
@@ -260,7 +265,8 @@ class LastNSplitter(Splitter):
         self, interactions: SparkDataFrame, n: int
     ) -> Tuple[SparkDataFrame, SparkDataFrame]:
         interactions = interactions.withColumn(
-            "count",
+            "count",
+            sf.count(self.timestamp_column).over(Window.partitionBy(self.divide_column)),
         )
         # float(n) - because DataFrame.filter is changing order
         # of sorted DataFrame to descending
@@ -317,7 +323,8 @@ class LastNSplitter(Splitter):
         self, interactions: SparkDataFrame, timedelta: int
     ) -> Tuple[SparkDataFrame, SparkDataFrame]:
         inter_with_max_time = interactions.withColumn(
-            "max_timestamp",
+            "max_timestamp",
+            sf.max(self.timestamp_column).over(Window.partitionBy(self.divide_column)),
         )
         inter_with_diff = inter_with_max_time.withColumn(
             "diff_timestamp", sf.col("max_timestamp") - sf.col(self.timestamp_column)
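A small sketch of the `interactions` strategy described in the reworked docstring. It assumes the constructor accepts `N`, `divide_column`, and `strategy` as keyword arguments (per the docstring), the default `timestamp` column name, and the usual `split(interactions)` method; the sample data is invented.

```python
import pandas as pd

from replay.splitters import LastNSplitter

interactions = pd.DataFrame(
    {
        "query_id": [1, 1, 1, 2, 2, 2],
        "item_id": [10, 11, 12, 10, 13, 14],
        "timestamp": pd.to_datetime(
            ["2024-01-01", "2024-01-02", "2024-01-03"] * 2
        ),
    }
)

# Hold out the last 2 interactions of every query as the test split.
splitter = LastNSplitter(N=2, divide_column="query_id", strategy="interactions")
train, test = splitter.split(interactions)
```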
replay/utils/__init__.py
CHANGED

@@ -1,13 +1,17 @@
-from .session_handler import State, get_spark_session
 from .types import (
+    ANN_AVAILABLE,
     OPENVINO_AVAILABLE,
+    OPTUNA_AVAILABLE,
     PYSPARK_AVAILABLE,
     TORCH_AVAILABLE,
     DataFrameLike,
+    FeatureUnavailableError,
+    FeatureUnavailableWarning,
     IntOrList,
-
+    MissingImport,
     NumType,
     PandasDataFrame,
     PolarsDataFrame,
     SparkDataFrame,
 )
+from .warnings import deprecation_warning
replay/utils/common.py
CHANGED

@@ -126,6 +126,7 @@ def convert2pandas(
     """
     if isinstance(data, PandasDataFrame):
         return data
+
     if isinstance(data, PolarsDataFrame):
         return data.to_pandas()
     if isinstance(data, SparkDataFrame):
@@ -144,10 +145,11 @@ def convert2polars(
     :param allow_collect_to_master: If set to False (default) raises a warning
         about collecting parallelized data to the master node.
     """
-    if isinstance(data, PandasDataFrame):
-        return pl_from_pandas(data)
     if isinstance(data, PolarsDataFrame):
         return data
+
+    if isinstance(data, PandasDataFrame):
+        return pl_from_pandas(data)
     if isinstance(data, SparkDataFrame):
         return pl_from_pandas(spark_to_pandas(data, allow_collect_to_master, from_constructor=False))
 
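The two hunks above only reorder the `isinstance` dispatch and add blank lines, so behaviour is unchanged. A quick round-trip sketch, assuming both converters are importable from `replay.utils.common` as shown in the diff:

```python
import pandas as pd
import polars as pl

from replay.utils.common import convert2pandas, convert2polars

pdf = pd.DataFrame({"query_id": [1, 2], "item_id": [10, 20]})

# Each converter returns its input unchanged when it is already in the target
# format, otherwise it converts (pandas <-> polars here; Spark frames are
# collected to the driver first).
pldf = convert2polars(pdf)
assert isinstance(pldf, pl.DataFrame)

roundtrip = convert2pandas(pldf)
assert isinstance(roundtrip, pd.DataFrame)
```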
replay/utils/model_handler.py
CHANGED

@@ -1,16 +1,13 @@
-import functools
 import json
 import os
 import pickle
-import warnings
 from os.path import join
 from pathlib import Path
-from typing import
+from typing import Union
 
 from replay.data.dataset_utils import DatasetLabelEncoder
-from replay.models import *
 from replay.models.base_rec import BaseRecommender
-from replay.splitters import
+from replay.splitters import Splitter
 
 from .session_handler import State
 from .types import PYSPARK_AVAILABLE
@@ -43,7 +40,7 @@ if PYSPARK_AVAILABLE:
         return [str(f.getPath()) for f in statuses]
 
 
-def save(model: BaseRecommender, path: Union[str, Path], overwrite: bool = False):
+def save(model: "BaseRecommender", path: Union[str, Path], overwrite: bool = False):
     """
     Save fitted model to disk as a folder
 
@@ -86,19 +83,22 @@ def save(model: BaseRecommender, path: Union[str, Path], overwrite: bool = False):
         save_picklable_to_parquet(model.study, join(path, "study"))
 
 
-def load(path: str, model_type=None) -> BaseRecommender:
+def load(path: str, model_type=None) -> "BaseRecommender":
     """
     Load saved model from disk
 
     :param path: path to model folder
     :return: Restored trained model
     """
+    # FIXME: Surely there's a better way to handle this? Not having this method at all perhaps?
+    import replay.models as models
+
     spark = State().session
     args = spark.read.json(join(path, "init_args.json")).first().asDict(recursive=True)
     name = args["_model_name"]
     del args["_model_name"]
 
-    model_class = model_type if model_type is not None else
+    model_class = model_type if model_type is not None else getattr(models, name)
 
     model = model_class(**args)
 
@@ -175,31 +175,11 @@ def load_splitter(path: str) -> Splitter:
     :param path: path to folder
     :return: restored Splitter
     """
+    import replay.splitters as splitters
+
     spark = State().session
     args = spark.read.json(join(path, "init_args.json")).first().asDict()
     name = args["_splitter_name"]
     del args["_splitter_name"]
-    splitter =
+    splitter = getattr(splitters, name)
     return splitter(**args)
-
-
-def deprecation_warning(message: Optional[str] = None) -> Callable[..., Any]:
-    """
-    Decorator that throws deprecation warnings.
-
-    :param message: message to deprecation warning without func name.
-    """
-    base_msg = "will be deprecated in future versions."
-
-    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
-        @functools.wraps(func)
-        def wrapper(*args: Any, **kwargs: Any) -> Any:
-            msg = f"{func.__qualname__} {message if message else base_msg}"
-            warnings.simplefilter("always", DeprecationWarning)  # turn off filter
-            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
-            warnings.simplefilter("default", DeprecationWarning)  # reset filter
-            return func(*args, **kwargs)
-
-        return wrapper
-
-    return decorator
replay/utils/session_handler.py
CHANGED

@@ -10,13 +10,13 @@ from typing import Any, Dict, Optional
 
 import psutil
 
-from .types import PYSPARK_AVAILABLE,
+from .types import PYSPARK_AVAILABLE, MissingImport
 
 if PYSPARK_AVAILABLE:
     from pyspark import __version__ as pyspark_version
     from pyspark.sql import SparkSession
 else:
-    SparkSession =
+    SparkSession = MissingImport
 
 
 def get_spark_session(
replay/utils/spark_utils.py
CHANGED

@@ -10,7 +10,7 @@ import pandas as pd
 from numpy.random import default_rng
 
 from .session_handler import State
-from .types import PYSPARK_AVAILABLE, DataFrameLike,
+from .types import PYSPARK_AVAILABLE, DataFrameLike, MissingImport, NumType, PolarsDataFrame, SparkDataFrame
 
 if PYSPARK_AVAILABLE:
     import pyspark.sql.types as st
@@ -24,7 +24,7 @@ if PYSPARK_AVAILABLE:
     from pyspark.sql.column import _to_java_column, _to_seq
     from pyspark.sql.types import DoubleType, IntegerType, StructField, StructType
 else:
-    Column =
+    Column = MissingImport
 
 
 class PolarsConvertToSparkWarning(Warning):
replay/utils/types.py
CHANGED

@@ -1,38 +1,48 @@
+from importlib.util import find_spec
 from typing import Iterable, Union
 
 from pandas import DataFrame as PandasDataFrame
 from polars import DataFrame as PolarsDataFrame
+from typing_extensions import TypeAlias
 
 
-class
+class MissingImport:
     """
     Replacement class with missing import
     """
 
 
-
-
+class FeatureUnavailableError(Exception):
+    """Exception class for failing a conditional import check."""
 
-    PYSPARK_AVAILABLE = True
-except ImportError:
-    PYSPARK_AVAILABLE = False
-    SparkDataFrame = MissingImportType
 
-
-
+class FeatureUnavailableWarning(Warning):
+    """Warning class for failing a conditional import check."""
 
-    TORCH_AVAILABLE = True
-except ImportError:
-    TORCH_AVAILABLE = False
 
-
-
-
+PYSPARK_AVAILABLE = find_spec("pyspark")
+if not PYSPARK_AVAILABLE:
+    SparkDataFrame: TypeAlias = MissingImport
+else:
+    from pyspark.sql import DataFrame
 
-
-
-
+    SparkDataFrame: TypeAlias = DataFrame
+
+
+TORCH_AVAILABLE = find_spec("torch") and find_spec("lightning")
 
 DataFrameLike = Union[PandasDataFrame, SparkDataFrame, PolarsDataFrame]
 IntOrList = Union[Iterable[int], int]
 NumType = Union[int, float]
+
+
+# Conditional import flags
+ANN_AVAILABLE = all(
+    [
+        find_spec("nmslib"),
+        find_spec("hnswlib"),
+        find_spec("pyarrow"),
+    ]
+)
+OPENVINO_AVAILABLE = TORCH_AVAILABLE and find_spec("onnx") and find_spec("openvino")
+OPTUNA_AVAILABLE = find_spec("optuna")
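The rewritten module exposes truthy/falsy availability flags (`find_spec` returns a `ModuleSpec` or `None`, not a `bool`) plus dedicated error and warning classes. A sketch of how downstream code might guard an optional dependency with them; the `require_optuna` helper is illustrative and not part of the package:

```python
from replay.utils import OPTUNA_AVAILABLE, FeatureUnavailableError


def require_optuna() -> None:
    """Fail fast when the optional Optuna dependency is missing."""
    # OPTUNA_AVAILABLE is truthy when importlib.util.find_spec("optuna") finds the package.
    if not OPTUNA_AVAILABLE:
        raise FeatureUnavailableError(
            "Hyperparameter search needs Optuna; install it with `pip install optuna`."
        )
```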
replay/utils/warnings.py
ADDED

@@ -0,0 +1,26 @@
+import functools
+import warnings
+from collections.abc import Callable
+from typing import Any, Optional
+
+
+def deprecation_warning(message: Optional[str] = None) -> Callable[..., Any]:
+    """
+    Decorator that throws deprecation warnings.
+
+    :param message: message to deprecation warning without func name.
+    """
+    base_msg = "will be deprecated in future versions."
+
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        @functools.wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            msg = f"{func.__qualname__} {message if message else base_msg}"
+            warnings.simplefilter("always", DeprecationWarning)  # turn off filter
+            warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
+            warnings.simplefilter("default", DeprecationWarning)  # reset filter
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
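The decorator above is a verbatim move of the helper previously defined in `replay/utils/model_handler.py`. A short usage sketch; the decorated function is hypothetical and only shows how the warning message is composed:

```python
from replay.utils.warnings import deprecation_warning


@deprecation_warning("is deprecated, use `new_score` instead.")
def old_score(x: float) -> float:
    """Hypothetical function, present only to show the decorator in action."""
    return x * 2.0


# Emits: DeprecationWarning: old_score is deprecated, use `new_score` instead.
old_score(1.5)
```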
{replay_rec-0.19.0.dist-info → replay_rec-0.20.0.dist-info}/METADATA
CHANGED

@@ -1,45 +1,44 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: replay-rec
-Version: 0.
+Version: 0.20.0
 Summary: RecSys Library
-
-License:
+License-Expression: Apache-2.0
+License-File: LICENSE
+License-File: NOTICE
 Author: AI Lab
-Requires-Python: >=3.
+Requires-Python: >=3.9, <3.13
+Classifier: Operating System :: Unix
 Classifier: Development Status :: 4 - Beta
 Classifier: Environment :: Console
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Natural Language :: English
-Classifier: Operating System :: Unix
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Provides-Extra: all
 Provides-Extra: spark
 Provides-Extra: torch
-Provides-Extra: torch-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: lightning
-Requires-Dist: numpy (>=1.20.0)
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: pytorch-
-Requires-Dist: scikit-learn (>=1.
-Requires-Dist: scipy (>=1.
-Requires-Dist:
-Requires-Dist: torch (>=1.8
+Provides-Extra: torch-cpu
+Requires-Dist: lightning (<2.6.0) ; extra == "torch" or extra == "torch-cpu"
+Requires-Dist: lightning ; extra == "torch"
+Requires-Dist: lightning ; extra == "torch-cpu"
+Requires-Dist: numpy (>=1.20.0,<2)
+Requires-Dist: pandas (>=1.3.5,<2.4.0)
+Requires-Dist: polars (<2.0)
+Requires-Dist: psutil (<=7.0.0) ; extra == "spark"
+Requires-Dist: psutil ; extra == "spark"
+Requires-Dist: pyarrow (<22.0)
+Requires-Dist: pyspark (>=3.0,<3.5) ; extra == "spark"
+Requires-Dist: pyspark ; extra == "spark"
+Requires-Dist: pytorch-optimizer (>=3.8.0,<3.9.0) ; extra == "torch" or extra == "torch-cpu"
+Requires-Dist: pytorch-optimizer ; extra == "torch"
+Requires-Dist: pytorch-optimizer ; extra == "torch-cpu"
+Requires-Dist: scikit-learn (>=1.6.1,<1.7.0)
+Requires-Dist: scipy (>=1.13.1,<1.14)
+Requires-Dist: setuptools
+Requires-Dist: torch (>=1.8,<3.0.0) ; extra == "torch" or extra == "torch-cpu"
+Requires-Dist: torch ; extra == "torch"
+Requires-Dist: torch ; extra == "torch-cpu"
+Requires-Dist: tqdm (>=4.67,<5)
+Project-URL: Homepage, https://sb-ai-lab.github.io/RePlay/
 Project-URL: Repository, https://github.com/sb-ai-lab/RePlay
 Description-Content-Type: text/markdown
 
@@ -208,7 +207,6 @@ pip install replay-rec==XX.YY.ZZrc0
 In addition to the core package, several extras are also provided, including:
 - `[spark]`: Install PySpark functionality
 - `[torch]`: Install PyTorch and Lightning functionality
-- `[all]`: `[spark]` `[torch]`
 
 Example:
 ```bash
@@ -219,9 +217,35 @@ pip install replay-rec[spark]
 pip install replay-rec[spark]==XX.YY.ZZrc0
 ```
 
+Additionally, `replay-rec[torch]` may be installed with a CPU-only version of `torch` by providing its respective index URL during installation:
+```bash
+# Install package with the CPU version of torch
+pip install replay-rec[torch] --extra-index-url https://download.pytorch.org/whl/cpu
+```
+
+
 To build RePlay from sources please use the [instruction](CONTRIBUTING.md#installing-from-the-source).
 
 
+### Optional features
+RePlay includes a set of optional features which require users to install optional dependencies manually. These features include:
+
+1) Hyperparameter search via Optuna:
+```bash
+pip install optuna
+```
+
+2) Model compilation via OpenVINO:
+```bash
+pip install openvino onnx
+```
+
+3) Vector database and hierarchical search support:
+```bash
+pip install hnswlib fixed-install-nmslib
+```
+
+
 <a name="examples"></a>
 ## 📑 Resources
 