replay-rec 0.20.2__py3-none-any.whl → 0.20.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. replay/__init__.py +1 -1
  2. replay/data/nn/sequential_dataset.py +8 -2
  3. replay/experimental/__init__.py +0 -0
  4. replay/experimental/metrics/__init__.py +62 -0
  5. replay/experimental/metrics/base_metric.py +603 -0
  6. replay/experimental/metrics/coverage.py +97 -0
  7. replay/experimental/metrics/experiment.py +175 -0
  8. replay/experimental/metrics/hitrate.py +26 -0
  9. replay/experimental/metrics/map.py +30 -0
  10. replay/experimental/metrics/mrr.py +18 -0
  11. replay/experimental/metrics/ncis_precision.py +31 -0
  12. replay/experimental/metrics/ndcg.py +49 -0
  13. replay/experimental/metrics/precision.py +22 -0
  14. replay/experimental/metrics/recall.py +25 -0
  15. replay/experimental/metrics/rocauc.py +49 -0
  16. replay/experimental/metrics/surprisal.py +90 -0
  17. replay/experimental/metrics/unexpectedness.py +76 -0
  18. replay/experimental/models/__init__.py +50 -0
  19. replay/experimental/models/admm_slim.py +257 -0
  20. replay/experimental/models/base_neighbour_rec.py +200 -0
  21. replay/experimental/models/base_rec.py +1386 -0
  22. replay/experimental/models/base_torch_rec.py +234 -0
  23. replay/experimental/models/cql.py +454 -0
  24. replay/experimental/models/ddpg.py +932 -0
  25. replay/experimental/models/dt4rec/__init__.py +0 -0
  26. replay/experimental/models/dt4rec/dt4rec.py +189 -0
  27. replay/experimental/models/dt4rec/gpt1.py +401 -0
  28. replay/experimental/models/dt4rec/trainer.py +127 -0
  29. replay/experimental/models/dt4rec/utils.py +264 -0
  30. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  31. replay/experimental/models/extensions/spark_custom_models/als_extension.py +792 -0
  32. replay/experimental/models/hierarchical_recommender.py +331 -0
  33. replay/experimental/models/implicit_wrap.py +131 -0
  34. replay/experimental/models/lightfm_wrap.py +303 -0
  35. replay/experimental/models/mult_vae.py +332 -0
  36. replay/experimental/models/neural_ts.py +986 -0
  37. replay/experimental/models/neuromf.py +406 -0
  38. replay/experimental/models/scala_als.py +293 -0
  39. replay/experimental/models/u_lin_ucb.py +115 -0
  40. replay/experimental/nn/data/__init__.py +1 -0
  41. replay/experimental/nn/data/schema_builder.py +102 -0
  42. replay/experimental/preprocessing/__init__.py +3 -0
  43. replay/experimental/preprocessing/data_preparator.py +839 -0
  44. replay/experimental/preprocessing/padder.py +229 -0
  45. replay/experimental/preprocessing/sequence_generator.py +208 -0
  46. replay/experimental/scenarios/__init__.py +1 -0
  47. replay/experimental/scenarios/obp_wrapper/__init__.py +8 -0
  48. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +74 -0
  49. replay/experimental/scenarios/obp_wrapper/replay_offline.py +261 -0
  50. replay/experimental/scenarios/obp_wrapper/utils.py +85 -0
  51. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  52. replay/experimental/scenarios/two_stages/reranker.py +117 -0
  53. replay/experimental/scenarios/two_stages/two_stages_scenario.py +757 -0
  54. replay/experimental/utils/__init__.py +0 -0
  55. replay/experimental/utils/logger.py +24 -0
  56. replay/experimental/utils/model_handler.py +186 -0
  57. replay/experimental/utils/session_handler.py +44 -0
  58. {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/METADATA +11 -17
  59. {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/RECORD +62 -7
  60. {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/WHEEL +0 -0
  61. {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/LICENSE +0 -0
  62. {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/NOTICE +0 -0
replay/experimental/models/hierarchical_recommender.py
@@ -0,0 +1,331 @@
+ from typing import Optional
+
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+
+ from replay.experimental.models.base_rec import HybridRecommender
+ from replay.experimental.models.u_lin_ucb import ULinUCB
+ from replay.utils import PYSPARK_AVAILABLE, PandasDataFrame, SparkDataFrame
+
+ if PYSPARK_AVAILABLE:
+     from replay.utils.spark_utils import convert2spark
+
+
+ class HierarchicalRecommender(HybridRecommender):
+ """
17
+ Hierarchical Recommender class is inspired by
18
+ `the article of Song et al <https://arxiv.org/abs/2110.09905>`_ and is a
19
+ generalization of the method. By default it works as HCB proposed there.
20
+
21
+ The model sequentially clusterizes the item space constructing a tree of
22
+ given ``depth``. The clusterization is performed according to the
23
+ ``cluster_model`` - any sklearn clusterer instance provided by the user.
24
+
25
+ At each node of a tree a node recommender instance is mounted. All of them
26
+ are produced by ``recommender_class`` object (not an instance!) and are
27
+ initialized with ``recommender_params``.
28
+
29
+ To predict an item the model goes down the tree each time selecting the
30
+ next node as the one predicted by the parent node recommender. A leaf
31
+ node recommender would give an item itself.
32
+
33
+ The log is considered as the history of user-item interactions. To fit
34
+ the model each interaction is counted in all node recommenders on the
35
+ path from the root to the item as if such path would be traversed through
36
+ the prediction process.
37
+
38
+ Hierarchical Recommender may be useful to enhance the perforamance of
39
+ simple models not suitable for large item space problems (such as many
40
+ contextual bandits) and to reduce prediction time in models that need to
41
+ iterate through all of the items to make a recommendation.
42
+
43
+ In this version Hierarchical Recommender is implemented as
44
+ ``HybridRecommender`` and apart from ``log`` requires both ``item_features``
45
+ and ``user_features`` in ``fit()`` method. By the same reason only
46
+ ``HybridRecommender`` classess may be passed as a ``recommender_class``.
47
+ Need in features at ``predict()`` depends on the ``recommender_class``
48
+ itself.
49
+
50
+ Note that current implementation relies mostly on python rather than
51
+ pyspark.
52
+ """
+
+     def __init__(
+         self,
+         depth,
+         cluster_model,
+         recommender_class=ULinUCB,
+         recommender_params=None,
+     ):
+         """
+         :param depth: depth of the item tree
+         :param cluster_model: an sklearn.cluster object (or any object with a
+             similar API) that performs clustering on the item space
+         :param recommender_class: a RePlay hybrid recommender class object
+             (not an instance!) whose instances are mounted at each tree node
+         :param recommender_params: initialization parameters for the recommenders
+         """
+
+         self.depth = depth
+         self.cluster_model = cluster_model
+         self.recommender_class = recommender_class
+         self.recommender_params = recommender_params if recommender_params is not None else {}
+         self.root = Node(parent=None, tree=self)
+
+     @property
+     def _init_args(self):
+         return {
+             "depth": self.depth,
+             "cluster_model": self.cluster_model,
+             "recommender_class": self.recommender_class,
+             "recommender_params": self.recommender_params,
+         }
+
+     def _fit(
+         self,
+         log: SparkDataFrame,
+         user_features: Optional[SparkDataFrame] = None,
+         item_features: Optional[SparkDataFrame] = None,
+     ) -> None:
+         self.logger.debug("Clustering...")
+         self.root._procreate(item_features.toPandas())
+
+         self.logger.debug("Fitting...")
+         self.root._fit(log.toPandas(), user_features.toPandas(), item_features.toPandas())
+
+     def _predict(
+         self,
+         log: SparkDataFrame,
+         k: int,
+         users: SparkDataFrame,
+         items: SparkDataFrame,
+         user_features: Optional[SparkDataFrame] = None,
+         item_features: Optional[SparkDataFrame] = None,
+         filter_seen_items: bool = True,
+     ) -> SparkDataFrame:
+         self.logger.debug("Predicting...")
+         pred = self.root._predict(
+             log.toPandas(),
+             k,
+             users.toPandas(),
+             items.toPandas(),
+             user_features,
+             item_features,
+             filter_seen_items,
+         )
+         return convert2spark(pred)
+
+     def _get_recommender(self):
+         new_recommender = self.recommender_class(**self.recommender_params)
+         assert isinstance(new_recommender, HybridRecommender)
+         return new_recommender
+
+     def _get_clusterer(self, node):
+         if node.is_leaf:
+             return Clusterer(model=DiscreteClusterer())
+         else:
+             return Clusterer(model=self.cluster_model)
+
+
+ class Node:
+     """
+     Node of a Hierarchical Recommender. The Node receives a clusterer and a
+     recommender from the tree and interacts with them at the clustering,
+     fitting and predicting stages.
+     """
+
+     def __init__(self, parent, tree: Optional[HierarchicalRecommender] = None):
+         """
+         :param parent: the parent node
+         :param tree: the tree the node belongs to (``None`` by default,
+             in which case it is inherited from the parent)
+         """
+         self.parent = parent
+         self.tree = tree
+         self.is_leaf = False
+
+         if parent is None:
+             self.level = 0
+             assert tree is not None
+         else:
+             self.tree = self.parent.tree
+             self.level = self.parent.level + 1
+
+         if self.level == (self.tree.depth - 1):
+             self.is_leaf = True
+         self.children = None
+
+         self.clusterer = self.tree._get_clusterer(self)
+         self.recommender = self.tree._get_recommender()
+
+     def get_num_children(self):
+         return len(self.children)
+
+     def _procreate(
+         self,
+         items: PandasDataFrame,
+     ) -> None:
+         items["cluster_idx"] = self.clusterer.fit_predict(items)
+
+         if not self.is_leaf:
+             self.children = [None] * self.clusterer.get_num_clusters()
+             for cl_idx, cl_items in items.groupby("cluster_idx"):
+                 self.children[cl_idx] = Node(parent=self)
+                 self.children[cl_idx]._procreate(cl_items)
+
+     def _fit(
+         self,
+         log: PandasDataFrame,
+         user_features: PandasDataFrame,
+         item_features: PandasDataFrame,
+     ) -> None:
+         log["cluster_idx"] = self.clusterer.predict(log[["item_idx"]])
+
+         if not self.is_leaf:
+             for cl_idx, cl_log in tqdm(log.groupby("cluster_idx")):
+                 self.children[cl_idx]._fit(cl_log, user_features, item_features)
+
+         rec_params = {
+             "log": convert2spark(log.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
+             "user_features": convert2spark(user_features),
+             "item_features": convert2spark(self.clusterer.get_cluster_centers()),
+         }
+         self.recommender.fit(**rec_params)
+
+     def _predict(
+         self,
+         log: PandasDataFrame,
+         k: int,
+         users: PandasDataFrame,
+         items: PandasDataFrame,
+         user_features: Optional[SparkDataFrame] = None,
+         item_features: Optional[SparkDataFrame] = None,
+         filter_seen_items: bool = True,
+     ) -> PandasDataFrame:
+         pred = pd.DataFrame(columns=["user_idx", "item_idx", "relevance"])
+         log["cluster_idx"] = self.clusterer.predict(log[["item_idx"]])
+         items["cluster_idx"] = self.clusterer.predict(items[["item_idx"]])
+
+         rec_params = {
+             "log": convert2spark(log.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
+             "users": convert2spark(users),
+             "items": convert2spark(items.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
+             "user_features": user_features,
+             "item_features": item_features,
+         }
+
+         if self.is_leaf:
+             rec_params["k"] = k
+             rec_params["filter_seen_items"] = filter_seen_items
+             pred = self.recommender.predict(**rec_params).toPandas().rename(columns={"item_idx": "cluster_idx"})
+             pred["item_idx"] = self.clusterer.predict_items(pred)
+             pred = pred.drop(columns=["cluster_idx"])
+         else:
+             rec_params["k"] = 1
+             rec_params["filter_seen_items"] = False
+             pred_clusters = (
+                 self.recommender.predict(**rec_params).toPandas().rename(columns={"item_idx": "cluster_idx"})
+             )
+
+             for cl_idx, cluster in pred_clusters.groupby("cluster_idx"):
+                 child_params = {
+                     "log": log[log["cluster_idx"] == cl_idx].drop(columns="cluster_idx"),
+                     "k": k,
+                     "users": cluster[["user_idx"]],
+                     "items": items[items["cluster_idx"] == cl_idx].drop(columns="cluster_idx"),
+                     "user_features": user_features,
+                     "item_features": item_features,
+                     "filter_seen_items": filter_seen_items,
+                 }
+                 cl_pred = self.children[cl_idx]._predict(**child_params)
+                 pred = pd.concat([pred, cl_pred])
+
+         return pred
+
+
+ class Clusterer:
+     """
+     Wrapper class providing a unified interface to sklearn clusterers.
+     """
+
+     def __init__(self, model):
+         """
+         :param model: an sklearn.cluster object or one with a similar API
+         """
+         self._model = model
+
+     def fit_predict(
+         self,
+         items: PandasDataFrame,
+     ):
+         self.fit(items)
+         return self.predict(items)
+
+     def fit(
+         self,
+         items: PandasDataFrame,
+     ) -> None:
+         # Sort by item_idx so that the discrete clusterer works correctly;
+         # otherwise the items would be shuffled.
+         items = items.sort_values(by="item_idx")
+
+         item_idx = items["item_idx"].to_numpy()
+         item_features = items.drop(columns="item_idx").to_numpy()
+
+         self._labels = self._model.fit_predict(item_features)
+
+         self._cluster_map = dict(zip(item_idx, self._labels))
+         self._item_map = dict(zip(self._labels, item_idx))
+
+         self._set_cluster_centers(items)
+
+     def predict(
+         self,
+         items: PandasDataFrame,
+     ):
+         return items["item_idx"].map(self.get_cluster_map())
+
+     def predict_items(
+         self,
+         clusters: PandasDataFrame,
+     ):
+         return clusters["cluster_idx"].map(self.get_item_map())
+
+     def _set_cluster_centers(
+         self,
+         items: PandasDataFrame,
+     ) -> None:
+         items["cluster_idx"] = self.predict(items)
+         self._cluster_centers = (
+             items.drop(columns="item_idx")
+             .groupby("cluster_idx")
+             .mean()
+             .reset_index()
+             .rename(columns={"cluster_idx": "item_idx"})
+         )
+
+         self._num_clusters = self._cluster_centers.shape[0]
+
+     def get_cluster_map(self) -> dict:
+         return self._cluster_map
+
+     def get_item_map(self) -> dict:
+         return self._item_map
+
+     def get_cluster_centers(self) -> PandasDataFrame:
+         return self._cluster_centers
+
+     def get_num_clusters(self) -> int:
+         return self._num_clusters
+
+
+ class DiscreteClusterer:
+     """
+     Discrete Clusterer - one that treats each item as its own cluster.
+     """
+
+     def fit_predict(self, items):
+         self.cluster_centers_ = items
+         return np.arange(items.shape[0])
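
For orientation, here is a minimal usage sketch of the HierarchicalRecommender added above. It is not taken from the package itself: it assumes an active Spark session, scikit-learn installed, that the public fit()/predict() inherited from HybridRecommender mirror the _fit()/_predict() signatures in the diff, and that the default ULinUCB node recommender constructs without arguments. The toy data is hypothetical; column names follow the user_idx / item_idx / relevance convention used throughout.

import pandas as pd
from sklearn.cluster import KMeans

from replay.experimental.models.hierarchical_recommender import HierarchicalRecommender
from replay.utils.spark_utils import convert2spark

# Hypothetical toy log and feature tables.
log = convert2spark(pd.DataFrame({
    "user_idx": [0, 0, 1, 1],
    "item_idx": [0, 1, 2, 3],
    "relevance": [1.0, 1.0, 1.0, 1.0],
}))
user_features = convert2spark(pd.DataFrame({"user_idx": [0, 1], "age": [0.3, 0.7]}))
item_features = convert2spark(pd.DataFrame({"item_idx": [0, 1, 2, 3], "price": [0.1, 0.2, 0.8, 0.9]}))

# depth=2: the root node clusters items with KMeans; its children are
# leaves whose DiscreteClusterer treats every item as its own cluster.
model = HierarchicalRecommender(depth=2, cluster_model=KMeans(n_clusters=2))
model.fit(log, user_features=user_features, item_features=item_features)
recs = model.predict(
    log,
    k=1,
    user_features=user_features,
    item_features=item_features,
)
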
replay/experimental/models/implicit_wrap.py
@@ -0,0 +1,131 @@
+ from os.path import join
+ from typing import Optional
+
+ from replay.data import get_schema
+ from replay.experimental.models.base_rec import Recommender
+ from replay.preprocessing import CSRConverter
+ from replay.utils import PandasDataFrame, SparkDataFrame
+ from replay.utils.spark_utils import load_pickled_from_parquet, save_picklable_to_parquet
+
+
+ class ImplicitWrap(Recommender):
+     """Wrapper for `implicit
+     <https://github.com/benfred/implicit>`_.
+
+     Example:
+
+     >>> import implicit
+     >>> model = implicit.als.AlternatingLeastSquares(factors=5)
+     >>> als = ImplicitWrap(model)
+
+     This way you can use implicit models like any other model in RePlay,
+     with the conversions made under the hood.
+
+     >>> import pandas as pd
+     >>> from replay.utils.spark_utils import convert2spark
+     >>> df = pd.DataFrame({"user_idx": [1, 1, 2, 2], "item_idx": [1, 2, 2, 3], "relevance": [1, 1, 1, 1]})
+     >>> df = convert2spark(df)
+     >>> als.fit_predict(df, 1, users=[1])[["user_idx", "item_idx"]].toPandas()
+        user_idx  item_idx
+     0         1         3
+     """
+
+     def __init__(self, model):
+         """Provide an initialized ``implicit`` model."""
+         self.model = model
+         self.logger.info("The model is a wrapper of a non-distributed model which may affect performance")
+
+     @property
+     def _init_args(self):
+         return {"model": None}
+
+     def _save_model(self, path: str):
+         save_picklable_to_parquet(self.model, join(path, "model"))
+
+     def _load_model(self, path: str):
+         self.model = load_pickled_from_parquet(join(path, "model"))
+
+     def _fit(
+         self,
+         log: SparkDataFrame,
+         user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+         item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+     ) -> None:
+         matrix = CSRConverter(
+             first_dim_column="user_idx", second_dim_column="item_idx", data_column="relevance"
+         ).transform(log)
+         self.model.fit(matrix)
+
+     @staticmethod
+     def _pd_func(model, items_to_use=None, user_item_data=None, filter_seen_items=False):
+         def predict_by_user_item(pandas_df):
+             user = int(pandas_df["user_idx"].iloc[0])
+             items = items_to_use if items_to_use else pandas_df.item_idx.to_list()
+
+             items_res, rel = model.recommend(
+                 userid=user,
+                 user_items=user_item_data[user] if filter_seen_items else None,
+                 N=len(items),
+                 filter_already_liked_items=filter_seen_items,
+                 items=items,
+             )
+             return PandasDataFrame(
+                 {
+                     "user_idx": [user] * len(items_res),
+                     "item_idx": items_res,
+                     "relevance": rel,
+                 }
+             )
+
+         return predict_by_user_item
+
+     def _predict(
+         self,
+         log: SparkDataFrame,
+         k: int,  # noqa: ARG002
+         users: SparkDataFrame,
+         items: SparkDataFrame,
+         user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+         item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+         filter_seen_items: bool = True,
+     ) -> SparkDataFrame:
+         items_to_use = items.distinct().toPandas().item_idx.tolist()
+         user_item_data = CSRConverter(
+             first_dim_column="user_idx", second_dim_column="item_idx", data_column="relevance"
+         ).transform(log)
+         model = self.model
+         rec_schema = get_schema(
+             query_column="user_idx",
+             item_column="item_idx",
+             rating_column="relevance",
+             has_timestamp=False,
+         )
+         return (
+             users.select("user_idx")
+             .groupby("user_idx")
+             .applyInPandas(
+                 self._pd_func(
+                     model=model,
+                     items_to_use=items_to_use,
+                     user_item_data=user_item_data,
+                     filter_seen_items=filter_seen_items,
+                 ),
+                 rec_schema,
+             )
+         )
+
+     def _predict_pairs(
+         self,
+         pairs: SparkDataFrame,
+         log: Optional[SparkDataFrame] = None,  # noqa: ARG002
+         user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+         item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
+     ) -> SparkDataFrame:
+         model = self.model
+         rec_schema = get_schema(
+             query_column="user_idx",
+             item_column="item_idx",
+             rating_column="relevance",
+             has_timestamp=False,
+         )
+         return pairs.groupby("user_idx").applyInPandas(self._pd_func(model=model, filter_seen_items=False), rec_schema)
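
And a short sketch of scoring explicit user-item pairs with the wrapper, again not from the package itself. It assumes implicit is installed, a Spark session is active, and that the public fit() and predict_pairs() inherited from the base Recommender dispatch to the _fit() and _predict_pairs() shown above; the toy data mirrors the doctest in the class docstring.

import implicit
import pandas as pd

from replay.experimental.models.implicit_wrap import ImplicitWrap
from replay.utils.spark_utils import convert2spark

log = convert2spark(pd.DataFrame({
    "user_idx": [1, 1, 2, 2],
    "item_idx": [1, 2, 2, 3],
    "relevance": [1, 1, 1, 1],
}))
model = ImplicitWrap(implicit.als.AlternatingLeastSquares(factors=5))
model.fit(log)

# Each user group is scored by _pd_func via applyInPandas; for pairs,
# seen-item filtering is always off (filter_seen_items=False above).
pairs = convert2spark(pd.DataFrame({"user_idx": [1, 2], "item_idx": [3, 1]}))
model.predict_pairs(pairs, log).show()
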