replay-rec 0.20.1rc0-py3-none-any.whl → 0.20.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/METADATA +18 -12
- {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/RECORD +6 -61
- replay/experimental/__init__.py +0 -0
- replay/experimental/metrics/__init__.py +0 -62
- replay/experimental/metrics/base_metric.py +0 -603
- replay/experimental/metrics/coverage.py +0 -97
- replay/experimental/metrics/experiment.py +0 -175
- replay/experimental/metrics/hitrate.py +0 -26
- replay/experimental/metrics/map.py +0 -30
- replay/experimental/metrics/mrr.py +0 -18
- replay/experimental/metrics/ncis_precision.py +0 -31
- replay/experimental/metrics/ndcg.py +0 -49
- replay/experimental/metrics/precision.py +0 -22
- replay/experimental/metrics/recall.py +0 -25
- replay/experimental/metrics/rocauc.py +0 -49
- replay/experimental/metrics/surprisal.py +0 -90
- replay/experimental/metrics/unexpectedness.py +0 -76
- replay/experimental/models/__init__.py +0 -50
- replay/experimental/models/admm_slim.py +0 -257
- replay/experimental/models/base_neighbour_rec.py +0 -200
- replay/experimental/models/base_rec.py +0 -1386
- replay/experimental/models/base_torch_rec.py +0 -234
- replay/experimental/models/cql.py +0 -454
- replay/experimental/models/ddpg.py +0 -932
- replay/experimental/models/dt4rec/__init__.py +0 -0
- replay/experimental/models/dt4rec/dt4rec.py +0 -189
- replay/experimental/models/dt4rec/gpt1.py +0 -401
- replay/experimental/models/dt4rec/trainer.py +0 -127
- replay/experimental/models/dt4rec/utils.py +0 -264
- replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
- replay/experimental/models/hierarchical_recommender.py +0 -331
- replay/experimental/models/implicit_wrap.py +0 -131
- replay/experimental/models/lightfm_wrap.py +0 -303
- replay/experimental/models/mult_vae.py +0 -332
- replay/experimental/models/neural_ts.py +0 -986
- replay/experimental/models/neuromf.py +0 -406
- replay/experimental/models/scala_als.py +0 -293
- replay/experimental/models/u_lin_ucb.py +0 -115
- replay/experimental/nn/data/__init__.py +0 -1
- replay/experimental/nn/data/schema_builder.py +0 -102
- replay/experimental/preprocessing/__init__.py +0 -3
- replay/experimental/preprocessing/data_preparator.py +0 -839
- replay/experimental/preprocessing/padder.py +0 -229
- replay/experimental/preprocessing/sequence_generator.py +0 -208
- replay/experimental/scenarios/__init__.py +0 -1
- replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
- replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
- replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -261
- replay/experimental/scenarios/obp_wrapper/utils.py +0 -85
- replay/experimental/scenarios/two_stages/__init__.py +0 -0
- replay/experimental/scenarios/two_stages/reranker.py +0 -117
- replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
- replay/experimental/utils/__init__.py +0 -0
- replay/experimental/utils/logger.py +0 -24
- replay/experimental/utils/model_handler.py +0 -186
- replay/experimental/utils/session_handler.py +0 -44
- {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/WHEEL +0 -0
- {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/licenses/LICENSE +0 -0
- {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/licenses/NOTICE +0 -0
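Editor's note: taken together, the deletions above appear to drop the entire replay.experimental subpackage from the wheel. As a purely illustrative sketch (an editorial addition, not part of the diff), downstream code that still imports one of the removed modules can guard against the change; the None fallback is an assumption about how a caller might cope, not RePlay behavior:

try:
    # present up to replay-rec 0.20.1rc0; no longer shipped in the 0.20.2 wheel
    from replay.experimental.models.implicit_wrap import ImplicitWrap
except ImportError:
    ImplicitWrap = None  # assumed fallback; pin replay-rec<0.20.2 to keep the module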
--- a/replay/experimental/models/hierarchical_recommender.py
+++ /dev/null
@@ -1,331 +0,0 @@
-from typing import Optional
-
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-
-from replay.experimental.models.base_rec import HybridRecommender
-from replay.experimental.models.u_lin_ucb import ULinUCB
-from replay.utils import PYSPARK_AVAILABLE, PandasDataFrame, SparkDataFrame
-
-if PYSPARK_AVAILABLE:
-    from replay.utils.spark_utils import convert2spark
-
-
-class HierarchicalRecommender(HybridRecommender):
-    """
-    Hierarchical Recommender class is inspired by
-    `the article of Song et al <https://arxiv.org/abs/2110.09905>`_ and is a
-    generalization of that method. By default it works as the HCB proposed there.
-
-    The model sequentially clusters the item space, constructing a tree of
-    given ``depth``. The clustering is performed by ``cluster_model`` -
-    any sklearn clusterer instance provided by the user.
-
-    At each node of the tree a node recommender instance is mounted. All of them
-    are produced by the ``recommender_class`` object (not an instance!) and are
-    initialized with ``recommender_params``.
-
-    To predict an item the model goes down the tree, each time selecting the
-    next node as the one predicted by the parent node recommender. A leaf
-    node recommender yields an item itself.
-
-    The log is treated as the history of user-item interactions. To fit
-    the model, each interaction is counted in every node recommender on the
-    path from the root to the item, as if that path had been traversed during
-    prediction.
-
-    Hierarchical Recommender may be useful to enhance the performance of
-    simple models not suitable for large item space problems (such as many
-    contextual bandits) and to reduce prediction time in models that need to
-    iterate through all of the items to make a recommendation.
-
-    In this version Hierarchical Recommender is implemented as a
-    ``HybridRecommender`` and, apart from ``log``, requires both ``item_features``
-    and ``user_features`` in the ``fit()`` method. For the same reason only
-    ``HybridRecommender`` classes may be passed as ``recommender_class``.
-    Whether features are needed at ``predict()`` depends on the
-    ``recommender_class`` itself.
-
-    Note that the current implementation relies mostly on python rather than
-    pyspark.
-    """
-
-    def __init__(
-        self,
-        depth,
-        cluster_model,
-        recommender_class=ULinUCB,
-        recommender_params={},
-    ):
-        """
-        :param depth: depth of the item tree
-        :param cluster_model: an sklearn.cluster object (or any with similar
-            API) that would perform clustering on the item space
-        :param recommender_class: a RePlay hybrid recommender class object (not an
-            instance!) instances of which would be mounted at each tree node
-        :param recommender_params: initialization parameters for the recommenders
-        """
-
-        self.depth = depth
-        self.cluster_model = cluster_model
-        self.recommender_class = recommender_class
-        self.recommender_params = recommender_params
-        self.root = Node(parent=None, tree=self)
-
-    @property
-    def _init_args(self):
-        return {
-            "depth": self.depth,
-            "cluster_model": self.cluster_model,
-            "recommender_class": self.recommender_class,
-            "recommender_params": self.recommender_params,
-        }
-
-    def _fit(
-        self,
-        log: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,
-        item_features: Optional[SparkDataFrame] = None,
-    ) -> None:
-        self.logger.debug("Clustering...")
-        self.root._procreate(item_features.toPandas())
-
-        self.logger.debug("Fitting...")
-        self.root._fit(log.toPandas(), user_features.toPandas(), item_features.toPandas())
-
-    def _predict(
-        self,
-        log: SparkDataFrame,
-        k: int,
-        users: SparkDataFrame,
-        items: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,
-        item_features: Optional[SparkDataFrame] = None,
-        filter_seen_items: bool = True,
-    ) -> SparkDataFrame:
-        self.logger.debug("Predicting...")
-        pred = self.root._predict(
-            log.toPandas(),
-            k,
-            users.toPandas(),
-            items.toPandas(),
-            user_features,
-            item_features,
-            filter_seen_items,
-        )
-        return convert2spark(pred)
-
-    def _get_recommender(self):
-        new_recommender = self.recommender_class(**self.recommender_params)
-        assert isinstance(new_recommender, HybridRecommender)
-        return new_recommender
-
-    def _get_clusterer(self, node):
-        if node.is_leaf:
-            return Clusterer(model=DiscreteClusterer())
-        else:
-            return Clusterer(model=self.cluster_model)
-
-
-class Node:
-    """
-    Node of a Hierarchical Recommender. The Node receives a clusterer and a
-    recommender from the tree and interacts with them at the clustering, fitting
-    and predicting stages.
-    """
-
-    def __init__(self, parent, tree: HierarchicalRecommender = None):
-        """
-        :param parent: the parent node
-        :param tree: the tree which the node belongs to (is None by default
-            and is inherited from the parent)
-        """
-        self.parent = parent
-        self.tree = tree
-        self.is_leaf = False
-
-        if parent is None:
-            self.level = 0
-            assert tree is not None
-        else:
-            self.tree = self.parent.tree
-            self.level = self.parent.level + 1
-
-        if self.level == (self.tree.depth - 1):
-            self.is_leaf = True
-            self.children = None
-
-        self.clusterer = self.tree._get_clusterer(self)
-        self.recommender = self.tree._get_recommender()
-
-    def get_num_children(self):
-        return len(self.children)
-
-    def _procreate(
-        self,
-        items: PandasDataFrame,
-    ) -> None:
-        items["cluster_idx"] = self.clusterer.fit_predict(items)
-
-        if not self.is_leaf:
-            self.children = [None] * self.clusterer.get_num_clusters()
-            for cl_idx, cl_items in items.groupby("cluster_idx"):
-                self.children[cl_idx] = Node(parent=self)
-                self.children[cl_idx]._procreate(cl_items)
-
-    def _fit(
-        self,
-        log: PandasDataFrame,
-        user_features: PandasDataFrame,
-        item_features: PandasDataFrame,
-    ) -> None:
-        log["cluster_idx"] = self.clusterer.predict(log[["item_idx"]])
-
-        if not self.is_leaf:
-            for cl_idx, cl_log in tqdm(log.groupby("cluster_idx")):
-                self.children[cl_idx]._fit(cl_log, user_features, item_features)
-
-        rec_params = {
-            "log": convert2spark(log.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
-            "user_features": convert2spark(user_features),
-            "item_features": convert2spark(self.clusterer.get_cluster_centers()),
-        }
-        self.recommender.fit(**rec_params)
-
-    def _predict(
-        self,
-        log: PandasDataFrame,
-        k: int,
-        users: PandasDataFrame,
-        items: PandasDataFrame,
-        user_features: Optional[SparkDataFrame] = None,
-        item_features: Optional[SparkDataFrame] = None,
-        filter_seen_items: bool = True,
-    ) -> PandasDataFrame:
-        pred = pd.DataFrame(columns=["user_idx", "item_idx", "relevance"])
-        log["cluster_idx"] = self.clusterer.predict(log[["item_idx"]])
-        items["cluster_idx"] = self.clusterer.predict(items[["item_idx"]])
-
-        rec_params = {
-            "log": convert2spark(log.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
-            "users": convert2spark(users),
-            "items": convert2spark(items.drop(columns="item_idx").rename(columns={"cluster_idx": "item_idx"})),
-            "user_features": user_features,
-            "item_features": item_features,
-        }
-
-        if self.is_leaf:
-            rec_params["k"] = k
-            rec_params["filter_seen_items"] = filter_seen_items
-            pred = self.recommender.predict(**rec_params).toPandas().rename(columns={"item_idx": "cluster_idx"})
-            pred["item_idx"] = self.clusterer.predict_items(pred)
-            pred = pred.drop(columns=["cluster_idx"])
-        else:
-            rec_params["k"] = 1
-            rec_params["filter_seen_items"] = False
-            pred_clusters = (
-                self.recommender.predict(**rec_params).toPandas().rename(columns={"item_idx": "cluster_idx"})
-            )
-
-            for cl_idx, cluster in pred_clusters.groupby("cluster_idx"):
-                child_params = {
-                    "log": log[log["cluster_idx"] == cl_idx].drop(columns="cluster_idx"),
-                    "k": k,
-                    "users": cluster[["user_idx"]],
-                    "items": items[items["cluster_idx"] == cl_idx].drop(columns="cluster_idx"),
-                    "user_features": user_features,
-                    "item_features": item_features,
-                    "filter_seen_items": filter_seen_items,
-                }
-                cl_pred = self.children[cl_idx]._predict(**child_params)
-                pred = pd.concat([pred, cl_pred])
-
-        return pred
-
-
-class Clusterer:
-    """
-    Wrapper class to provide proper and unified interaction with sklearn
-    clusterers.
-    """
-
-    def __init__(self, model):
-        """
-        :param model: sklearn.cluster object or one with similar API
-        """
-        self._model = model
-
-    def fit_predict(
-        self,
-        items: PandasDataFrame,
-    ):
-        self.fit(items)
-        return self.predict(items)
-
-    def fit(
-        self,
-        items: PandasDataFrame,
-    ) -> None:
-        items = items.sort_values(
-            by="item_idx"
-        )  # for the discrete clusterer to work correctly; otherwise items would be shuffled
-
-        item_idx = items["item_idx"].to_numpy()
-        item_features = items.drop(columns="item_idx").to_numpy()
-
-        self._labels = self._model.fit_predict(item_features)
-
-        self._cluster_map = dict(zip(item_idx, self._labels))
-        self._item_map = dict(zip(self._labels, item_idx))
-
-        self._set_cluster_centers(items)
-
-    def predict(
-        self,
-        items: PandasDataFrame,
-    ):
-        return items["item_idx"].map(self.get_cluster_map())
-
-    def predict_items(
-        self,
-        clusters: PandasDataFrame,
-    ):
-        return clusters["cluster_idx"].map(self.get_item_map())
-
-    def _set_cluster_centers(
-        self,
-        items: PandasDataFrame,
-    ) -> None:
-        items["cluster_idx"] = self.predict(items)
-        self._cluster_centers = (
-            items.drop(columns="item_idx")
-            .groupby("cluster_idx")
-            .mean()
-            .reset_index()
-            .rename(columns={"cluster_idx": "item_idx"})
-        )
-
-        self._num_clusters = self._cluster_centers.shape[0]
-
-    def get_cluster_map(self) -> dict:
-        return self._cluster_map
-
-    def get_item_map(self) -> dict:
-        return self._item_map
-
-    def get_cluster_centers(self) -> PandasDataFrame:
-        return self._cluster_centers
-
-    def get_num_clusters(self) -> int:
-        return self._num_clusters
-
-
-class DiscreteClusterer:
-    """
-    Discrete Clusterer - one that treats each item as already being its own cluster.
-    """
-
-    def fit_predict(self, items):
-        self.cluster_centers_ = items
-        return np.arange(items.shape[0])
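Editor's note: the removed Clusterer above turns clusters into pseudo-items for the parent-node recommender by averaging item features per cluster and renaming the label column back to item_idx. A minimal standalone sketch of that aggregation (plain pandas + scikit-learn on toy data; an editorial illustration, not RePlay API):

import pandas as pd
from sklearn.cluster import KMeans

# toy item table: an id column plus two feature columns
items = pd.DataFrame({
    "item_idx": [0, 1, 2, 3],
    "f0": [0.0, 0.1, 5.0, 5.1],
    "f1": [1.0, 0.9, -1.0, -0.9],
})

# label every item with its cluster, as Clusterer.fit_predict() did
items["cluster_idx"] = KMeans(n_clusters=2).fit_predict(items.drop(columns="item_idx"))

# cluster centers = per-cluster mean of the feature columns, relabeled so a
# parent-node recommender can treat each cluster as a pseudo-item
centers = (
    items.drop(columns="item_idx")
    .groupby("cluster_idx")
    .mean()
    .reset_index()
    .rename(columns={"cluster_idx": "item_idx"})
)
print(centers)  # one row per cluster: item_idx (cluster label), f0, f1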
--- a/replay/experimental/models/implicit_wrap.py
+++ /dev/null
@@ -1,131 +0,0 @@
-from os.path import join
-from typing import Optional
-
-from replay.data import get_schema
-from replay.experimental.models.base_rec import Recommender
-from replay.preprocessing import CSRConverter
-from replay.utils import PandasDataFrame, SparkDataFrame
-from replay.utils.spark_utils import load_pickled_from_parquet, save_picklable_to_parquet
-
-
-class ImplicitWrap(Recommender):
-    """Wrapper for `implicit
-    <https://github.com/benfred/implicit>`_
-
-    Example:
-
-    >>> import implicit
-    >>> model = implicit.als.AlternatingLeastSquares(factors=5)
-    >>> als = ImplicitWrap(model)
-
-    This way you can use implicit models as any other in replay
-    with conversions made under the hood.
-
-    >>> import pandas as pd
-    >>> from replay.utils.spark_utils import convert2spark
-    >>> df = pd.DataFrame({"user_idx": [1, 1, 2, 2], "item_idx": [1, 2, 2, 3], "relevance": [1, 1, 1, 1]})
-    >>> df = convert2spark(df)
-    >>> als.fit_predict(df, 1, users=[1])[["user_idx", "item_idx"]].toPandas()
-       user_idx  item_idx
-    0         1         3
-    """
-
-    def __init__(self, model):
-        """Provide initialized ``implicit`` model."""
-        self.model = model
-        self.logger.info("The model is a wrapper of a non-distributed model which may affect performance")
-
-    @property
-    def _init_args(self):
-        return {"model": None}
-
-    def _save_model(self, path: str):
-        save_picklable_to_parquet(self.model, join(path, "model"))
-
-    def _load_model(self, path: str):
-        self.model = load_pickled_from_parquet(join(path, "model"))
-
-    def _fit(
-        self,
-        log: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-    ) -> None:
-        matrix = CSRConverter(
-            first_dim_column="user_idx", second_dim_column="item_idx", data_column="relevance"
-        ).transform(log)
-        self.model.fit(matrix)
-
-    @staticmethod
-    def _pd_func(model, items_to_use=None, user_item_data=None, filter_seen_items=False):
-        def predict_by_user_item(pandas_df):
-            user = int(pandas_df["user_idx"].iloc[0])
-            items = items_to_use if items_to_use else pandas_df.item_idx.to_list()
-
-            items_res, rel = model.recommend(
-                userid=user,
-                user_items=user_item_data[user] if filter_seen_items else None,
-                N=len(items),
-                filter_already_liked_items=filter_seen_items,
-                items=items,
-            )
-            return PandasDataFrame(
-                {
-                    "user_idx": [user] * len(items_res),
-                    "item_idx": items_res,
-                    "relevance": rel,
-                }
-            )
-
-        return predict_by_user_item
-
-    def _predict(
-        self,
-        log: SparkDataFrame,
-        k: int,  # noqa: ARG002
-        users: SparkDataFrame,
-        items: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        filter_seen_items: bool = True,
-    ) -> SparkDataFrame:
-        items_to_use = items.distinct().toPandas().item_idx.tolist()
-        user_item_data = CSRConverter(
-            first_dim_column="user_idx", second_dim_column="item_idx", data_column="relevance"
-        ).transform(log)
-        model = self.model
-        rec_schema = get_schema(
-            query_column="user_idx",
-            item_column="item_idx",
-            rating_column="relevance",
-            has_timestamp=False,
-        )
-        return (
-            users.select("user_idx")
-            .groupby("user_idx")
-            .applyInPandas(
-                self._pd_func(
-                    model=model,
-                    items_to_use=items_to_use,
-                    user_item_data=user_item_data,
-                    filter_seen_items=filter_seen_items,
-                ),
-                rec_schema,
-            )
-        )
-
-    def _predict_pairs(
-        self,
-        pairs: SparkDataFrame,
-        log: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-    ) -> SparkDataFrame:
-        model = self.model
-        rec_schema = get_schema(
-            query_column="user_idx",
-            item_column="item_idx",
-            rating_column="relevance",
-            has_timestamp=False,
-        )
-        return pairs.groupby("user_idx").applyInPandas(self._pd_func(model=model, filter_seen_items=False), rec_schema)
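Editor's note: ImplicitWrap._fit() above feeds implicit a sparse user-item matrix built by CSRConverter from the user_idx/item_idx/relevance columns. A rough standalone equivalent of that conversion (scipy + pandas only; an editorial illustration, not the CSRConverter implementation):

import pandas as pd
from scipy.sparse import csr_matrix

# the same toy log as in the ImplicitWrap doctest above
log = pd.DataFrame({
    "user_idx": [1, 1, 2, 2],
    "item_idx": [1, 2, 2, 3],
    "relevance": [1, 1, 1, 1],
})

# rows = users, columns = items, values = relevance: the shape that
# implicit's model.fit() and model.recommend(user_items=...) expect
matrix = csr_matrix(
    (log["relevance"], (log["user_idx"], log["item_idx"])),
    shape=(log["user_idx"].max() + 1, log["item_idx"].max() + 1),
)
print(matrix.toarray())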