replay-rec 0.16.0rc0__tar.gz → 0.17.0rc0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/PKG-INFO +2 -2
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/pyproject.toml +67 -36
- replay_rec-0.17.0rc0/replay/__init__.py +2 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/dataset.py +45 -42
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/dataset_utils/dataset_label_encoder.py +6 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/__init__.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/schema.py +20 -33
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/sequence_tokenizer.py +217 -87
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/sequential_dataset.py +6 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/torch_sequential_dataset.py +20 -11
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/utils.py +7 -9
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/schema.py +17 -17
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/spark_schema.py +0 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/base_metric.py +63 -123
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/coverage.py +15 -35
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/experiment.py +18 -43
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/hitrate.py +2 -3
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/map.py +2 -3
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/mrr.py +0 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/ncis_precision.py +2 -3
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/ndcg.py +3 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/precision.py +2 -3
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/recall.py +2 -3
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/rocauc.py +6 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/surprisal.py +16 -28
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/unexpectedness.py +16 -14
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/admm_slim.py +11 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/base_neighbour_rec.py +20 -38
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/base_rec.py +59 -149
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/base_torch_rec.py +13 -26
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/cql.py +83 -99
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/ddpg.py +43 -129
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/dt4rec/dt4rec.py +9 -13
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/dt4rec/gpt1.py +9 -19
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/dt4rec/trainer.py +5 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/dt4rec/utils.py +6 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/extensions/spark_custom_models/als_extension.py +123 -64
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/implicit_wrap.py +21 -28
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/lightfm_wrap.py +22 -47
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/mult_vae.py +22 -65
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/neuromf.py +34 -91
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/scala_als.py +25 -40
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/nn/data/schema_builder.py +1 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/preprocessing/data_preparator.py +78 -169
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/preprocessing/padder.py +20 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/preprocessing/sequence_generator.py +11 -21
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +25 -37
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/obp_wrapper/replay_offline.py +71 -94
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/obp_wrapper/utils.py +31 -32
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/two_stages/reranker.py +2 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/two_stages/two_stages_scenario.py +41 -127
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/utils/logger.py +7 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/utils/model_handler.py +18 -50
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/utils/session_handler.py +1 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/base_metric.py +38 -79
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/categorical_diversity.py +24 -58
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/coverage.py +25 -49
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/descriptors.py +4 -13
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/experiment.py +3 -8
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/hitrate.py +3 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/map.py +3 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/mrr.py +1 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/ndcg.py +4 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/novelty.py +10 -29
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/offline_metrics.py +26 -61
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/precision.py +3 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/recall.py +3 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/rocauc.py +7 -10
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/surprisal.py +13 -30
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/torch_metrics_builder.py +0 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/unexpectedness.py +15 -20
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/__init__.py +1 -2
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/als.py +7 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/association_rules.py +12 -28
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/base_neighbour_rec.py +21 -36
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/base_rec.py +92 -215
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/cat_pop_rec.py +9 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/cluster.py +17 -28
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/ann_mixin.py +7 -12
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/entities/base_hnsw_param.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/entities/hnswlib_param.py +0 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/entities/nmslib_hnsw_param.py +0 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/driver_hnswlib_index_builder.py +4 -10
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/driver_nmslib_index_builder.py +7 -11
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/executor_hnswlib_index_builder.py +5 -12
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/executor_nmslib_index_builder.py +11 -18
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/nmslib_index_builder_mixin.py +1 -4
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/base_inferer.py +3 -10
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/hnswlib_filter_index_inferer.py +7 -17
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/hnswlib_index_inferer.py +6 -14
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/nmslib_filter_index_inferer.py +14 -28
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/nmslib_index_inferer.py +15 -25
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_inferers/utils.py +2 -9
- replay_rec-0.17.0rc0/replay/models/extensions/ann/index_stores/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_stores/hdfs_index_store.py +4 -9
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_stores/shared_disk_index_store.py +2 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_stores/spark_files_index_store.py +8 -14
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_stores/utils.py +5 -2
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/utils.py +3 -5
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/kl_ucb.py +16 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/knn.py +37 -59
- replay_rec-0.17.0rc0/replay/models/nn/optimizer_utils/__init__.py +4 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/optimizer_utils/optimizer_factory.py +3 -6
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/bert4rec/dataset.py +6 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/bert4rec/lightning.py +53 -56
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/bert4rec/model.py +12 -25
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/callbacks/__init__.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/callbacks/prediction_callbacks.py +23 -25
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/callbacks/validation_callback.py +27 -30
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/postprocessors/postprocessors.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/sasrec/dataset.py +8 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/sasrec/lightning.py +53 -48
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/sasrec/model.py +4 -17
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/pop_rec.py +9 -10
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/query_pop_rec.py +7 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/random_rec.py +10 -18
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/slim.py +8 -13
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/thompson_sampling.py +13 -14
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/ucb.py +11 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/wilson.py +5 -14
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/word2vec.py +24 -69
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/optimization/optuna_objective.py +13 -27
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/__init__.py +1 -2
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/converter.py +2 -7
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/filters.py +67 -142
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/history_based_fp.py +44 -116
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/label_encoder.py +106 -68
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/preprocessing/sessionizer.py +1 -11
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/scenarios/fallback.py +3 -8
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/base_splitter.py +43 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/cold_user_random_splitter.py +18 -31
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/k_folds.py +14 -24
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/last_n_splitter.py +33 -43
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/new_users_splitter.py +31 -55
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/random_splitter.py +16 -23
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/ratio_splitter.py +30 -54
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/time_splitter.py +13 -18
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/two_stage_splitter.py +44 -79
- replay_rec-0.17.0rc0/replay/utils/common.py +65 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/dataframe_bucketizer.py +25 -31
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/distributions.py +3 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/model_handler.py +36 -33
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/session_handler.py +11 -15
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/spark_utils.py +51 -85
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/time.py +8 -22
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/types.py +1 -3
- replay_rec-0.16.0rc0/replay/__init__.py +0 -2
- replay_rec-0.16.0rc0/replay/models/nn/optimizer_utils/__init__.py +0 -9
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/LICENSE +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/NOTICE +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/README.md +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/__init__.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/dataset_utils/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/metrics/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/__init__.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/dt4rec/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/nn/data/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/preprocessing/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/experimental/scenarios/obp_wrapper/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/experimental/utils → replay_rec-0.17.0rc0/replay/experimental/scenarios/two_stages}/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions → replay_rec-0.17.0rc0/replay/experimental/utils}/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/metrics/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions/ann → replay_rec-0.17.0rc0/replay/models/extensions}/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions/ann/entities → replay_rec-0.17.0rc0/replay/models/extensions/ann}/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions/ann/index_builders → replay_rec-0.17.0rc0/replay/models/extensions/ann/entities}/__init__.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions/ann/index_inferers → replay_rec-0.17.0rc0/replay/models/extensions/ann/index_builders}/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_builders/base_index_builder.py +0 -0
- {replay_rec-0.16.0rc0/replay/models/extensions/ann/index_stores → replay_rec-0.17.0rc0/replay/models/extensions/ann/index_inferers}/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/extensions/ann/index_stores/base_index_store.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/bert4rec/__init__.py +1 -1
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/postprocessors/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/postprocessors/_base.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/models/nn/sequential/sasrec/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/optimization/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/scenarios/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/splitters/__init__.py +0 -0
- {replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/utils/__init__.py +1 -1

{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: replay-rec
-Version: 0.16.0rc0
+Version: 0.17.0rc0
 Summary: RecSys Library
 Home-page: https://sb-ai-lab.github.io/RePlay/
 License: Apache-2.0
@@ -35,7 +35,7 @@ Requires-Dist: optuna (>=3.2.0,<3.3.0)
 Requires-Dist: pandas (>=1.3.5,<2.0.0)
 Requires-Dist: polars (>=0.20.7,<0.21.0)
 Requires-Dist: psutil (>=5.9.5,<5.10.0)
-Requires-Dist: pyarrow (>=12.0.1
+Requires-Dist: pyarrow (>=12.0.1)
 Requires-Dist: pyspark (>=3.0,<3.3) ; extra == "spark" or extra == "all"
 Requires-Dist: pytorch-ranger (>=0.1.1,<0.2.0) ; extra == "torch" or extra == "all"
 Requires-Dist: sb-obp (>=0.5.7,<0.6.0)

{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/pyproject.toml
RENAMED
@@ -1,16 +1,29 @@
+[build-system]
+requires = [
+    "poetry-core>=1.0.0",
+    "poetry-dynamic-versioning>=1.0.0,<2.0.0",
+]
+build-backend = "poetry_dynamic_versioning.backend"
+
+[tool.black]
+line-length = 120
+target-versions = ["py38", "py39", "py310"]
+
 [tool.poetry]
 name = "replay-rec"
 packages = [{include = "replay"}]
 license = "Apache-2.0"
 description = "RecSys Library"
-authors = [
-
-
-
-
-
-
-
+authors = [
+    "AI Lab",
+    "Alexey Vasilev",
+    "Anna Volodkevich",
+    "Alexey Grishanov",
+    "Yan-Martin Tamm",
+    "Boris Shminke",
+    "Alexander Sidorenko",
+    "Roza Aysina",
+]
 readme = "README.md"
 homepage = "https://sb-ai-lab.github.io/RePlay/"
 repository = "https://github.com/sb-ai-lab/RePlay"
@@ -27,7 +40,7 @@ classifiers = [
 exclude = [
     "replay/conftest.py",
 ]
-version = "0.16.0.preview"
+version = "0.17.0.preview"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1, <3.11"
@@ -37,12 +50,11 @@ polars = "~0.20.7"
 optuna = "~3.2.0"
 scipy = "~1.8.1"
 psutil = "~5.9.5"
-pyspark = {
+pyspark = {version = ">=3.0,<3.3", optional = true}
 scikit-learn = "^1.0.2"
-pyarrow = ">=12.0.1
+pyarrow = ">=12.0.1"
 nmslib = "2.1.1"
 hnswlib = "0.7.0"
-
 torch = "^1.8"
 lightning = "^2.0.2"
 pytorch-ranger = "^0.1.1"
@@ -55,20 +67,20 @@ d3rlpy = "^2.0.4"
 implicit = "~0.7.0"
 gym = "^0.26.0"
 
+[tool.poetry.extras]
+spark = ["pyspark"]
+torch = ["torch", "pytorch-ranger", "lightning"]
+all = ["pyspark", "torch", "pytorch-ranger", "lightning"]
+
 [tool.poetry.group.dev.dependencies]
-# visualization
 jupyter = "~1.0.0"
 jupyterlab = "^3.6.0"
-# testing
 pytest = ">=7.1.0"
 pytest-cov = ">=3.0.0"
 statsmodels = "~0.13.5"
-
-
-
-pylint = "^2.13"
-pycodestyle = "^2.10"
-# docs
+black = ">=23.3.0"
+ruff = ">=0.0.261"
+toml-sort = "^0.23.0"
 sphinx = "5.3.0"
 sphinx-rtd-theme = "1.2.2"
 sphinx-autodoc-typehints = "1.23.0"
@@ -76,26 +88,45 @@ sphinx-enum-extend = "0.1.3"
 myst-parser = "1.0.0"
 ghp-import = "2.1.0"
 docutils = "0.16"
-# stubs
 data-science-types = "0.2.23"
 
-[tool.poetry.extras]
-spark = ["pyspark"]
-torch = ["torch", "pytorch-ranger", "lightning"]
-all = ["pyspark", "torch", "pytorch-ranger", "lightning"]
-
-[build-system]
-requires = [
-    "poetry-core>=1.0.0",
-    "poetry-dynamic-versioning>=1.0.0,<2.0.0",
-]
-build-backend = "poetry_dynamic_versioning.backend"
-
 [tool.poetry-dynamic-versioning]
 enable = false
-format-jinja = """0.16.0{{ env['PACKAGE_SUFFIX'] }}"""
+format-jinja = """0.17.0{{ env['PACKAGE_SUFFIX'] }}"""
 vcs = "git"
 
-[tool.
+[tool.ruff]
+exclude = [".git", ".venv", "__pycache__", "env", "venv", "docs", "projects", "examples"]
+extend-select = ["C90", "T10", "T20", "UP004"]
 line-length = 120
-
+select = ["ARG", "C4", "E", "EM", "ERA", "F", "FLY", "I", "INP", "ISC", "N", "PERF", "PGH", "PIE", "PYI", "Q", "RUF", "SIM", "TID", "W"]
+
+[tool.ruff.flake8-quotes]
+docstring-quotes = "double"
+inline-quotes = "double"
+multiline-quotes = "double"
+
+[tool.ruff.flake8-unused-arguments]
+ignore-variadic-names = false
+
+[tool.ruff.isort]
+combine-as-imports = true
+force-wrap-aliases = true
+
+[tool.ruff.mccabe]
+max-complexity = 13
+
+[tool.ruff.per-file-ignores]
+"*/" = ["PERF203", "RUF001", "RUF002", "RUF012", "E402"]
+"__init__.py" = ["F401"]
+"replay/utils/model_handler.py" = ["F403", "F405"]
+"tests/*" = ["ARG", "E402", "INP", "ISC", "N", "S", "SIM", "F811"]
+"tests/experimental/*" = ["F401", "F811"]
+"replay/experimental/models/extensions/spark_custom_models/als_extension.py" = ["ARG002", "N802", "N803", "N815"]
+
+[tool.tomlsort]
+ignore_case = true
+in_place = true
+no_comments = true
+spaces_indent_inline_array = 4
+trailing_comma_inline_array = true
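
The new `[tool.poetry.extras]` table makes the Spark and Torch stacks optional installs (`pip install replay-rec[spark]`, `replay-rec[torch]`, or `replay-rec[all]`). At import time the library guards these optional dependencies with availability flags from `replay.utils`, visible as `PYSPARK_AVAILABLE` and `TORCH_AVAILABLE` in the diffs below. A minimal sketch of that guard pattern, assuming the flags come from a module lookup (the real `replay/utils` implementation may differ):

```python
import importlib.util

# True only when the corresponding extra is installed; the flag names match
# replay.utils, but computing them via find_spec is an illustrative assumption.
PYSPARK_AVAILABLE = importlib.util.find_spec("pyspark") is not None
TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None

if PYSPARK_AVAILABLE:
    # Spark-only imports stay behind the guard, so a torch-only install
    # of replay-rec does not fail at import time.
    import pyspark.sql.functions as sf  # noqa: F401
```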

{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/dataset.py
RENAMED
@@ -7,21 +7,20 @@ from typing import Callable, Dict, Iterable, List, Optional, Sequence
 
 import numpy as np
 
-from .schema import FeatureHint, FeatureInfo, FeatureSchema, FeatureSource, FeatureType
 from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, PandasDataFrame, PolarsDataFrame, SparkDataFrame
 
+from .schema import FeatureHint, FeatureInfo, FeatureSchema, FeatureSource, FeatureType
+
 if PYSPARK_AVAILABLE:
-    import pyspark.sql.functions as
+    import pyspark.sql.functions as sf
     from pyspark.storagelevel import StorageLevel
 
 
-# pylint: disable=too-many-instance-attributes
 class Dataset:
     """
     Universal dataset for feeding data to models.
     """
 
-    # pylint: disable=too-many-arguments
     def __init__(
         self,
         feature_schema: FeatureSchema,
@@ -57,23 +56,23 @@ class Dataset:
         try:
             feature_schema.item_id_column
         except Exception as exception:
-
+            msg = "Item id column is not set."
+            raise ValueError(msg) from exception
 
         try:
             feature_schema.query_id_column
         except Exception as exception:
-
-
-
-
-            and
-
-
-
-            self.query_features is not None
-            and not check_dataframes_types_equal(self._interactions, self.query_features)
+            msg = "Query id column is not set."
+            raise ValueError(msg) from exception
+
+        if self.item_features is not None and not check_dataframes_types_equal(self._interactions, self.item_features):
+            msg = "Interactions and item features should have the same type."
+            raise TypeError(msg)
+        if self.query_features is not None and not check_dataframes_types_equal(
+            self._interactions, self.query_features
         ):
-
+            msg = "Interactions and query features should have the same type."
+            raise TypeError(msg)
 
         self._feature_source_map: Dict[FeatureSource, DataFrameLike] = {
             FeatureSource.INTERACTIONS: self.interactions,
@@ -191,6 +190,7 @@ class Dataset:
         return self._feature_schema
 
     if PYSPARK_AVAILABLE:
+
         def persist(self, storage_level: StorageLevel = StorageLevel(True, True, False, True, 1)) -> None:
             """
             Sets the storage level to persist SparkDataFrame for interactions, item_features
@@ -295,7 +295,6 @@ class Dataset:
     def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
         for feature in features_list:
            if feature.feature_type == FeatureType.CATEGORICAL:
-                # pylint: disable=protected-access
                 feature._set_cardinality_callback(self._get_cardinality(feature))
 
     def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
@@ -333,15 +332,14 @@ class Dataset:
 
         for feature in features_list:
             if feature.feature_hint in [FeatureHint.QUERY_ID, FeatureHint.ITEM_ID]:
-                # pylint: disable=protected-access
                 feature._set_feature_source(source=FeatureSource.INTERACTIONS)
                 continue
-            source = source_mapping.get(feature.column)
+            source = source_mapping.get(feature.column)
             if source:
-                # pylint: disable=protected-access
                 feature._set_feature_source(source=source_mapping[feature.column])
             else:
-
+                msg = f"{feature.column} doesn't exist in provided dataframes"
+                raise ValueError(msg)
 
         self._set_cardinality(features_list=features_list)
         return features_list
@@ -362,10 +360,8 @@ class Dataset:
         self._set_cardinality(features_list=unlabeled_columns)
         return unlabeled_columns
 
-    # pylint: disable=no-self-use
     def _set_features_source(self, feature_list: List[FeatureInfo], source: FeatureSource) -> None:
         for feature in feature_list:
-            # pylint: disable=protected-access
             feature._set_feature_source(source)
 
     def _check_ids_consistency(self, hint: FeatureHint) -> None:
@@ -377,8 +373,8 @@ class Dataset:
             self.feature_schema.item_id_column if hint == FeatureHint.ITEM_ID else self.feature_schema.query_id_column
         )
         if self.is_pandas:
-            interactions_unique_ids = set(self.interactions[ids_column].unique())
-            features_df_unique_ids = set(features_df[ids_column].unique())
+            interactions_unique_ids = set(self.interactions[ids_column].unique())
+            features_df_unique_ids = set(features_df[ids_column].unique())
             in_interactions_not_in_features_ids = interactions_unique_ids - features_df_unique_ids
             is_consistent = len(in_interactions_not_in_features_ids) == 0
         elif self.is_spark:
@@ -389,14 +385,18 @@
             .count()
         ) == 0
         else:
-            is_consistent =
-
-
-
-
+            is_consistent = (
+                len(
+                    self.interactions.select(ids_column)
+                    .unique()
+                    .join(features_df.select(ids_column).unique(), on=ids_column, how="anti")
+                )
+                == 0
+            )
 
         if not is_consistent:
-
+            msg = f"There are IDs in the interactions that are missing in the {hint.name} dataframe."
+            raise ValueError(msg)
 
     def _check_column_encoded(
         self, data: DataFrameLike, column: str, source: FeatureSource, cardinality: Optional[int]
@@ -419,26 +419,29 @@
             is_int = data[column].dtype.is_integer()
 
         if not is_int:
-
+            msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
+            raise ValueError(msg)
 
         if self.is_pandas:
-            min_id = data[column].min()
+            min_id = data[column].min()
         elif self.is_spark:
-            min_id = data.agg(
+            min_id = data.agg(sf.min(column).alias("min_index")).collect()[0][0]
         else:
-            min_id = data[column].min()
+            min_id = data[column].min()
         if min_id < 0:
-
+            msg = f"IDs in {source.name}.{column} are not encoded. Min ID is less than 0."
+            raise ValueError(msg)
 
         if self.is_pandas:
-            max_id = data[column].max()
+            max_id = data[column].max()
         elif self.is_spark:
-            max_id = data.agg(
+            max_id = data.agg(sf.max(column).alias("max_index")).collect()[0][0]
         else:
-            max_id = data[column].max()
+            max_id = data[column].max()
 
         if max_id >= cardinality:
-
+            msg = f"IDs in {source.name}.{column} are not encoded. Max ID is more than quantity of IDs."
+            raise ValueError(msg)
 
     def _check_encoded(self) -> None:
         for feature in self.feature_schema.categorical_features.all_features:
@@ -471,11 +474,11 @@
                 feature.cardinality,
             )
         else:
-            data = self._feature_source_map[feature.feature_source]
+            data = self._feature_source_map[feature.feature_source]
             self._check_column_encoded(
                 data,
                 feature.column,
-                feature.feature_source,
+                feature.feature_source,
                 feature.cardinality,
             )
 
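One mechanical change repeats through this file: every `raise SomeError("literal")` becomes a `msg = ...` assignment followed by `raise SomeError(msg)`. That is the fix demanded by ruff's flake8-errmsg rules, enabled above via `"EM"` in the `[tool.ruff] select` list. A toy illustration of the rule (not replay code):

```python
def require_positive(count: int) -> int:
    # EM101 flags a string literal inside a raise; EM102 flags an f-string.
    # Binding the message to a variable first satisfies both rules and keeps
    # the raise line (and the traceback it produces) short.
    if count <= 0:
        msg = f"count must be positive, got {count}"
        raise ValueError(msg)
    return count
```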
{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/dataset_utils/dataset_label_encoder.py
RENAMED
@@ -31,6 +31,7 @@ class DatasetLabelEncoder:
         When set to ``error`` an error will be raised in case an unknown label is present during transform.
         When set to ``use_default_value``, the encoded value of unknown label will be set
         to the value given for the parameter default_value.
+        When set to ``drop``, the unknown labels will be dropped.
         Default: ``error``.
     :param default_value: Default value that will fill the unknown labels after transform.
         When the parameter handle_unknown is set to ``use_default_value``,
@@ -105,7 +106,7 @@ class DatasetLabelEncoder:
         for column, feature_info in dataset.feature_schema.categorical_features.items():
             if column not in self._encoding_rules:
                 warnings.warn(
-                    f"Cannot transform feature '{column}'
+                    f"Cannot transform feature '{column}' as it was not present at the fit stage",
                     LabelEncoderTransformWarning,
                 )
                 continue
@@ -157,10 +158,7 @@
         self._check_if_initialized()
 
         columns_set: Set[str]
-        if isinstance(columns, str)
-            columns_set = set([columns])
-        else:
-            columns_set = set(columns)
+        columns_set = {columns} if isinstance(columns, str) else {*columns}
 
         def get_encoding_rules() -> Iterator[LabelEncodingRule]:
             for column, rule in self._encoding_rules.items():
@@ -200,7 +198,7 @@
         """
         query_id_column = self._features_columns[FeatureHint.QUERY_ID]
         item_id_column = self._features_columns[FeatureHint.ITEM_ID]
-        encoder = self.get_encoder(query_id_column + item_id_column)
+        encoder = self.get_encoder(query_id_column + item_id_column)
         assert encoder is not None
         return encoder
 
@@ -231,7 +229,8 @@
 
     def _check_if_initialized(self) -> None:
         if not self._encoding_rules:
-
+            msg = "Encoder is not initialized"
+            raise ValueError(msg)
 
     def _fill_features_columns(self, feature_info: FeatureSchema) -> None:
         self._features_columns = {
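
The first hunk above documents a third `handle_unknown` strategy, `drop`, next to the existing `error` and `use_default_value`. A plain-Python sketch of what the three strategies mean for labels unseen at fit time (illustration only, not replay internals):

```python
mapping = {"a": 0, "b": 1}   # label -> id learned at fit time
labels = ["a", "b", "c"]     # "c" was never seen during fit

# error: transform raises as soon as "c" is encountered.
# use_default_value: "c" is encoded with the configured default_value, e.g. -1.
encoded = [mapping.get(label, -1) for label in labels]              # [0, 1, -1]
# drop: rows with unknown labels are removed from the output.
dropped = [mapping[label] for label in labels if label in mapping]  # [0, 1]
```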
{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/__init__.py
RENAMED
@@ -3,7 +3,7 @@ from replay.utils import TORCH_AVAILABLE
 if TORCH_AVAILABLE:
     from .schema import MutableTensorMap, TensorFeatureInfo, TensorFeatureSource, TensorMap, TensorSchema
     from .sequence_tokenizer import SequenceTokenizer
-    from .sequential_dataset import PandasSequentialDataset,
+    from .sequential_dataset import PandasSequentialDataset, PolarsSequentialDataset, SequentialDataset
     from .torch_sequential_dataset import (
         DEFAULT_GROUND_TRUTH_PADDING_VALUE,
         DEFAULT_TRAIN_PADDING_VALUE,
{replay_rec-0.16.0rc0 → replay_rec-0.17.0rc0}/replay/data/nn/schema.py
RENAMED
@@ -11,7 +11,6 @@ from typing import (
     Set,
     Union,
     ValuesView,
-    Callable
 )
 
 import torch
@@ -23,7 +22,6 @@ TensorMap = Mapping[str, torch.Tensor]
 MutableTensorMap = Dict[str, torch.Tensor]
 
 
-# pylint: disable=too-many-instance-attributes
 class TensorFeatureSource:
     """
     Describes source of a feature
@@ -72,7 +70,6 @@ class TensorFeatureInfo:
     Information about a tensor feature.
     """
 
-    # pylint: disable=too-many-arguments
     def __init__(
         self,
         name: str,
@@ -108,15 +105,18 @@ class TensorFeatureInfo:
         self._is_seq = is_seq
 
         if not isinstance(feature_type, FeatureType):
-
+            msg = "Unknown feature type"
+            raise ValueError(msg)
         self._feature_type = feature_type
 
         if feature_type == FeatureType.NUMERICAL and (cardinality or embedding_dim):
-
+            msg = "Cardinality and embedding dimensions are needed only with categorical feature type."
+            raise ValueError(msg)
         self._cardinality = cardinality
 
         if feature_type == FeatureType.CATEGORICAL and tensor_dim:
-
+            msg = "Tensor dimensions is needed only with numerical feature type."
+            raise ValueError(msg)
 
         if feature_type == FeatureType.CATEGORICAL:
             default_embedding_dim = 64
@@ -168,7 +168,8 @@
             return None
 
         if len(source) > 1:
-
+            msg = "Only one element feature sources can be converted to single feature source."
+            raise ValueError(msg)
         assert isinstance(self.feature_sources, list)
         return self.feature_sources[0]
 
@@ -199,35 +200,21 @@
         :returns: Cardinality of the feature.
         """
         if self.feature_type != FeatureType.CATEGORICAL:
-
-
-            )
-        if hasattr(self, "_cardinality_callback") and self._cardinality is None:
-            self._set_cardinality(self._cardinality_callback(self._name))
+            msg = f"Can not get cardinality because feature type of {self.name} column is not categorical."
+            raise RuntimeError(msg)
         return self._cardinality
 
-    # pylint: disable=attribute-defined-outside-init
-    def _set_cardinality_callback(self, callback: Callable) -> None:
-        self._cardinality_callback = callback
-
     def _set_cardinality(self, cardinality: int) -> None:
         self._cardinality = cardinality
 
-    def reset_cardinality(self) -> None:
-        """
-        Reset cardinality of the feature to None.
-        """
-        self._cardinality = None
-
     @property
     def tensor_dim(self) -> Optional[int]:
         """
         :returns: Dimensions of the numerical feature.
         """
         if self.feature_type != FeatureType.NUMERICAL:
-
-
-            )
+            msg = f"Can not get tensor dimensions because feature type of {self.name} feature is not numerical."
+            raise RuntimeError(msg)
         return self._tensor_dim
 
     def _set_tensor_dim(self, tensor_dim: int) -> None:
@@ -239,9 +226,8 @@ class TensorFeatureInfo:
         :returns: Embedding dimensions of the feature.
         """
         if self.feature_type != FeatureType.CATEGORICAL:
-
-
-            )
+            msg = f"Can not get embedding dimensions because feature type of {self.name} feature is not categorical."
+            raise RuntimeError(msg)
         return self._embedding_dim
 
     def _set_embedding_dim(self, embedding_dim: int) -> None:
@@ -278,8 +264,9 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
         :returns: Extract single feature from a schema.
         """
         if len(self._tensor_schema) != 1:
-
-
+            msg = "Only one element tensor schema can be converted to single feature"
+            raise ValueError(msg)
+        return next(iter(self._tensor_schema.values()))
 
     def items(self) -> ItemsView[str, TensorFeatureInfo]:
         return self._tensor_schema.items()
@@ -290,7 +277,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
     def values(self) -> ValuesView[TensorFeatureInfo]:
         return self._tensor_schema.values()
 
-    def get(
+    def get(
         self,
         key: str,
         default: Optional[TensorFeatureInfo] = None,
@@ -377,7 +364,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
     @property
     def names(self) -> Sequence[str]:
         """
-
+        :returns: List of all feature's names.
         """
         return list(self._tensor_schema)
 
@@ -447,7 +434,7 @@ class TensorSchema(Mapping[str, TensorFeatureInfo]):
         for filtration_func, filtration_param in zip(filter_functions, filter_parameters):
             filtered_features = list(
                 filter(
-                    lambda x: filtration_func(x, filtration_param),
+                    lambda x: filtration_func(x, filtration_param),
                    filtered_features,
                 )
            )