replay-rec 0.18.0-py3-none-any.whl → 0.18.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/dataset.py +27 -1
- replay/data/dataset_utils/dataset_label_encoder.py +6 -3
- replay/data/nn/schema.py +37 -16
- replay/data/nn/sequence_tokenizer.py +313 -165
- replay/data/nn/torch_sequential_dataset.py +17 -8
- replay/data/nn/utils.py +14 -7
- replay/data/schema.py +10 -6
- replay/metrics/offline_metrics.py +2 -2
- replay/models/__init__.py +1 -0
- replay/models/base_rec.py +18 -21
- replay/models/lin_ucb.py +407 -0
- replay/models/nn/sequential/bert4rec/dataset.py +17 -4
- replay/models/nn/sequential/bert4rec/lightning.py +121 -54
- replay/models/nn/sequential/bert4rec/model.py +21 -0
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
- replay/models/nn/sequential/compiled/__init__.py +5 -0
- replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
- replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
- replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
- replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
- replay/models/nn/sequential/sasrec/dataset.py +17 -1
- replay/models/nn/sequential/sasrec/lightning.py +126 -50
- replay/models/nn/sequential/sasrec/model.py +3 -4
- replay/preprocessing/__init__.py +7 -1
- replay/preprocessing/discretizer.py +719 -0
- replay/preprocessing/label_encoder.py +384 -52
- replay/splitters/cold_user_random_splitter.py +1 -1
- replay/utils/__init__.py +1 -0
- replay/utils/common.py +7 -8
- replay/utils/session_handler.py +3 -4
- replay/utils/spark_utils.py +15 -1
- replay/utils/types.py +8 -0
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +73 -60
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -31
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
- {replay_rec-0.18.0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +0 -0
replay/data/nn/torch_sequential_dataset.py
CHANGED

@@ -4,6 +4,8 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset as TorchDataset
 
+from replay.utils.model_handler import deprecation_warning
+
 from .schema import TensorFeatureInfo, TensorMap, TensorSchema
 from .sequential_dataset import SequentialDataset
 
@@ -25,6 +27,10 @@ class TorchSequentialDataset(TorchDataset):
     Torch dataset for sequential recommender models
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
@@ -90,15 +96,14 @@
         sequence = self._sequential.get_sequence(sequence_index, feature.name)
         if feature.is_seq:
             sequence = sequence[sequence_offset : sequence_offset + self._max_sequence_length]
-
-        tensor_dtype = self._get_tensor_dtype(feature)
+        tensor_dtype = self._get_tensor_dtype(sequence)
         tensor_sequence = torch.tensor(sequence, dtype=tensor_dtype)
         if feature.is_seq:
-            tensor_sequence = self._pad_sequence(tensor_sequence)
+            tensor_sequence = self._pad_sequence(tensor_sequence, feature.padding_value)
 
         return tensor_sequence
 
-    def _pad_sequence(self, sequence: torch.Tensor) -> torch.Tensor:
+    def _pad_sequence(self, sequence: torch.Tensor, padding_value: int) -> torch.Tensor:
         assert len(sequence) <= self._max_sequence_length
         if len(sequence) == self._max_sequence_length:
             return sequence
@@ -115,16 +120,16 @@
 
         padded_sequence = torch.full(
             padded_sequence_shape,
-            self._padding_value,
+            padding_value,
             dtype=sequence.dtype,
         )
         padded_sequence[-len(sequence) :].copy_(sequence)
         return padded_sequence
 
-    def _get_tensor_dtype(self, feature: TensorFeatureInfo) -> torch.dtype:
-        if feature.is_cat:
+    def _get_tensor_dtype(self, array: np.array) -> torch.dtype:
+        if np.issubdtype(array.dtype, np.integer):
             return torch.long
-        if feature.is_num:
+        if np.issubdtype(array.dtype, np.floating):
             return torch.float32
         assert False, "Unknown tensor feature type"
 
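Note: the replacement dtype logic keys off the NumPy array itself instead of the feature's declared type, and padding now uses a per-feature value. A minimal standalone sketch of the same pattern (the helper names and sample data are illustrative, not part of the package):

import numpy as np
import torch

def infer_tensor_dtype(array: np.ndarray) -> torch.dtype:
    # Integer arrays (e.g. encoded ids) map to torch.long,
    # floating arrays (e.g. ratings) map to torch.float32.
    if np.issubdtype(array.dtype, np.integer):
        return torch.long
    if np.issubdtype(array.dtype, np.floating):
        return torch.float32
    raise ValueError("Unknown tensor feature type")

def pad_left(sequence: torch.Tensor, max_len: int, padding_value: int) -> torch.Tensor:
    # Keep the most recent events on the right; fill the front with padding_value.
    if len(sequence) >= max_len:
        return sequence[-max_len:]
    padded = torch.full((max_len,), padding_value, dtype=sequence.dtype)
    padded[-len(sequence):].copy_(sequence)
    return padded

items = np.array([10, 11, 12])
seq = torch.tensor(items, dtype=infer_tensor_dtype(items))
print(pad_left(seq, max_len=5, padding_value=0))  # tensor([ 0,  0, 10, 11, 12])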
@@ -170,6 +175,10 @@ class TorchSequentialValidationDataset(TorchDataset):
     Torch dataset for sequential recommender models that additionally stores ground truth
     """
 
+    @deprecation_warning(
+        "`padding_value` parameter will be removed in future versions. "
+        "Instead, you should specify `padding_value` for each column in TensorSchema"
+    )
     def __init__(
         self,
         sequential: SequentialDataset,
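Taken together, `padding_value` moves from the dataset constructors to the schema: each feature now carries its own padding value, read via `feature.padding_value` above. A hedged sketch of the intended configuration (passing `padding_value` to `TensorFeatureInfo` is an assumption based on the deprecation message; the exact constructor signature may differ):

from replay.data import FeatureType
from replay.data.nn import TensorFeatureInfo, TensorSchema

# Assumed API: per-column padding replaces the deprecated
# dataset-level `padding_value` argument.
schema = TensorSchema(
    [
        TensorFeatureInfo(
            "item_id",
            feature_type=FeatureType.CATEGORICAL,
            is_seq=True,
            cardinality=100_000,
            padding_value=0,  # assumption: now set per feature
        ),
    ]
)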
replay/data/nn/utils.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Iterable, Optional
 
 import polars as pl
 
@@ -22,9 +22,12 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optional[str] = None)
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort_values(event_cols_without_groupby)
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(events.iloc[0][x], Iterable), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort_values(event_cols_without_iterable)
 
         grouped_sequences = (
             events.groupby(groupby_col).agg({col: list for col in event_cols_without_groupby}).reset_index()
@@ -34,9 +37,13 @@ def groupby_sequences(events: DataFrameLike, groupby_col: str, sort_col: Optional[str] = None)
         event_cols_without_groupby.remove(groupby_col)
 
         if sort_col:
-            event_cols_without_groupby.remove(sort_col)
-            event_cols_without_groupby.insert(0, sort_col)
-            events = events.sort(event_cols_without_groupby)
+            map_name2type = dict(zip(events.columns, events.dtypes))
+            event_cols_without_iterable = list(
+                filter(lambda x: not isinstance(map_name2type[x], pl.List), event_cols_without_groupby)
+            )  # deleting columns that cannot be sorted
+            event_cols_without_iterable.remove(sort_col)
+            event_cols_without_iterable.insert(0, sort_col)
+            events = events.sort(event_cols_without_iterable)
 
         grouped_sequences = events.group_by(groupby_col).agg(*[pl.col(x) for x in event_cols_without_groupby])
     else:
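In both branches the new code excludes list-typed columns from the sort keys, since rows cannot be ordered by a list-valued column. A small pandas illustration of the same filter (dataframe and column names invented):

from typing import Iterable

import pandas as pd

events = pd.DataFrame(
    {
        "user_id": [1, 1, 2],
        "timestamp": [3, 1, 2],
        "item_ids": [[7, 8], [9], [5]],  # list column: cannot be a sort key
    }
)

cols = [c for c in events.columns if c != "user_id"]
# Same check as the diff: drop columns whose values are iterable.
sortable = [c for c in cols if not isinstance(events.iloc[0][c], Iterable)]
sortable.remove("timestamp")
sortable.insert(0, "timestamp")
print(events.sort_values(sortable))  # sorted by timestamp only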
replay/data/schema.py
CHANGED
@@ -20,7 +20,9 @@ class FeatureType(Enum):
     """Type of Feature."""
 
     CATEGORICAL = "categorical"
+    CATEGORICAL_LIST = "categorical_list"
     NUMERICAL = "numerical"
+    NUMERICAL_LIST = "numerical_list"
 
 
 class FeatureSource(Enum):
@@ -70,7 +72,7 @@ class FeatureInfo:
         self._feature_source = feature_source
         self._feature_hint = feature_hint
 
-        if feature_type == FeatureType.NUMERICAL and cardinality:
+        if feature_type in [FeatureType.NUMERICAL, FeatureType.NUMERICAL_LIST] and cardinality:
             msg = "Cardinality is needed only with categorical feature_type."
             raise ValueError(msg)
         self._cardinality = cardinality
@@ -111,7 +113,7 @@ class FeatureInfo:
         """
         :returns: cardinality of the feature.
         """
-        if self.feature_type != FeatureType.CATEGORICAL:
+        if self.feature_type not in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
             msg = f"Can not get cardinality because feature_type of {self.column} column is not categorical."
             raise RuntimeError(msg)
         if hasattr(self, "_cardinality_callback") and self._cardinality is None:
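With the new list types, the cardinality rules cover both scalar and list variants: numerical features reject a cardinality, and only categorical ones expose it. A short sketch (keyword arguments follow the checks visible above; the exact `FeatureInfo` signature is an assumption):

from replay.data import FeatureInfo, FeatureType

# Categorical list feature: cardinality is allowed, as for CATEGORICAL.
genres = FeatureInfo(
    column="genres",
    feature_type=FeatureType.CATEGORICAL_LIST,
    cardinality=20,
)
print(genres.cardinality)  # 20

# Numerical list feature: cardinality is rejected, as for NUMERICAL.
try:
    FeatureInfo(
        column="embedding",
        feature_type=FeatureType.NUMERICAL_LIST,
        cardinality=10,
    )
except ValueError as err:
    print(err)  # Cardinality is needed only with categorical feature_type.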
@@ -143,7 +145,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
     def copy(self) -> "FeatureSchema":
         """
-        Creates a copy of all features.
+        Creates a copy of all features. For the returned copy, all cardinality values will be undefined.
 
         :returns: copy of the initial feature schema.
         """
@@ -227,14 +229,16 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
         """
         :returns: sequence of categorical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.CATEGORICAL)
+        return self.filter(feature_type=FeatureType.CATEGORICAL) + self.filter(
+            feature_type=FeatureType.CATEGORICAL_LIST
+        )
 
     @property
     def numerical_features(self) -> "FeatureSchema":
         """
         :returns: sequence of numerical features in a schema.
         """
-        return self.filter(feature_type=FeatureType.NUMERICAL)
+        return self.filter(feature_type=FeatureType.NUMERICAL) + self.filter(feature_type=FeatureType.NUMERICAL_LIST)
 
     @property
     def interaction_features(self) -> "FeatureSchema":
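As a result, the two convenience properties now return list-typed columns as well. A hedged usage sketch continuing the example above (assuming `FeatureSchema` accepts a list of `FeatureInfo`; iteration via `.items()` is grounded in the `Mapping[str, FeatureInfo]` base class):

from replay.data import FeatureSchema

schema = FeatureSchema([genres])
for name, info in schema.categorical_features.items():
    print(name, info.feature_type)  # genres FeatureType.CATEGORICAL_LIST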
@@ -449,7 +453,7 @@ class FeatureSchema(Mapping[str, FeatureInfo]):
 
         if len(duplicates) > 0:
             msg = (
-                "Features column names should be unique, …"
+                "Features column names should be unique, except ITEM_ID and QUERY_ID columns. "
                 f"{duplicates} columns are not unique."
             )
             raise ValueError(msg)
replay/metrics/offline_metrics.py
CHANGED

@@ -156,13 +156,13 @@ class OfflineMetrics:
     ):
         """
         :param metrics: (list of metrics): List of metrics to be calculated.
-        :param …
+        :param query_column: (str): The name of the query column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param item_column: (str): The name of the item column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
-        :param …
+        :param rating_column: (str): The name of the rating column.
             Note that you do not need to specify the value of this parameter for each metric separately.
             It is enough to specify the value of this parameter here once.
         :param category_column: (str): The name of the category column.
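The corrected docstring makes the contract explicit: column names are configured once on `OfflineMetrics` and shared by every metric. A hedged sketch (the metric class and call convention are assumed from the public replay.metrics API):

import pandas as pd
from replay.metrics import OfflineMetrics, Recall

recommendations = pd.DataFrame(
    {"user_id": [1, 1], "item_id": [3, 4], "rating": [0.9, 0.8]}
)
ground_truth = pd.DataFrame({"user_id": [1], "item_id": [4], "rating": [1.0]})

# Column names are passed here once instead of per metric.
metrics = OfflineMetrics(
    [Recall(2)],
    query_column="user_id",
    item_column="item_id",
    rating_column="rating",
)
print(metrics(recommendations, ground_truth))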
replay/models/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .cat_pop_rec import CatPopRec
 from .cluster import ClusterRec
 from .kl_ucb import KLUCB
 from .knn import ItemKNN
+from .lin_ucb import LinUCB
 from .pop_rec import PopRec
 from .query_pop_rec import QueryPopRec
 from .random_rec import RandomRec
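`LinUCB` (new module `lin_ucb.py`, +407 lines) becomes part of the public models namespace. A deliberately thin sketch, since the constructor arguments are not part of this diff and are treated as unknown (`dataset` is a replay `Dataset` prepared elsewhere):

from replay.models import LinUCB

model = LinUCB()  # assumption: arguments omitted; see replay/models/lin_ucb.py
model.fit(dataset)  # standard replay recommender interface
recs = model.predict(dataset, k=10)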
replay/models/base_rec.py
CHANGED
@@ -625,23 +625,21 @@ class BaseRecommender(RecommenderCommons, IsSavable, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param dataset: …
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param queries: …
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all …
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from …
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).
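For the return contract documented above, a small NumPy illustration of the `(n_users, n_items, k)` shape (values are synthetic):

import numpy as np

n_users, n_items, k = 2, 4, 3
rng = np.random.default_rng(0)
probs = rng.random((n_users, n_items, k))
probs /= probs.sum(axis=1, keepdims=True)  # each slot is a distribution over items

assert np.allclose(probs.sum(axis=1), 1.0)
top_item_per_slot = probs.argmax(axis=1)  # shape (n_users, k)
print(top_item_per_slot)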
@@ -1164,10 +1162,11 @@ class HybridRecommender(BaseRecommender, ABC):
     ) -> Optional[Tuple[SparkDataFrame, int]]:
         """
         Returns query or item feature vectors as a Column with type ArrayType
+        If a model does not have a vector for some ids they are not present in the final result.
+
         :param ids: Spark DataFrame with unique ids
         :param features: Spark DataFrame with features for provided ids
         :return: feature vectors
-        If a model does not have a vector for some ids they are not present in the final result.
         """
         return self._get_features_wrap(ids, features)
 
@@ -1644,23 +1643,21 @@ class NonPersonalizedRecommender(Recommender, ABC):
         self, dataset: Dataset, k: int, queries: SparkDataFrame, items: SparkDataFrame, filter_seen_items: bool = True
     ) -> np.ndarray:
         """
-        Inner method where model actually predicts.
+        Inner method where model actually predicts probability estimates.
+
+        Mainly used in ```OBPOfflinePolicyLearner```.
 
-        :param dataset: …
+        :param dataset: historical interactions with query/item features
             ``[user_idx, item_idx, timestamp, rating]``
         :param k: number of recommendations for each user
-        :param queries: …
+        :param queries: queries to create recommendations for
             dataframe containing ``[user_idx]`` or ``array-like``;
-            if ``None``, recommend to all …
+            if ``None``, recommend to all queries from ``interactions``
         :param items: candidate items for recommendations
             dataframe containing ``[item_idx]`` or ``array-like``;
-            if ``None``, take all items from …
+            if ``None``, take all items from ``interactions``.
             If it contains new items, ``rating`` for them will be ``0``.
-        :param user_features: user features
-            ``[user_idx , timestamp]`` + feature columns
-        :param item_features: item features
-            ``[item_idx , timestamp]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations based on ``log``.
+        :param filter_seen_items: flag to remove seen items from recommendations based on ``interactions``.
         :return: distribution over items for each user with shape
             ``(n_users, n_items, k)``
             where we have probability for each user to choose item at fixed position(top-k).