replay-rec 0.20.2__py3-none-any.whl → 0.20.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- replay/__init__.py +1 -1
- replay/data/nn/sequential_dataset.py +8 -2
- replay/experimental/__init__.py +0 -0
- replay/experimental/metrics/__init__.py +62 -0
- replay/experimental/metrics/base_metric.py +603 -0
- replay/experimental/metrics/coverage.py +97 -0
- replay/experimental/metrics/experiment.py +175 -0
- replay/experimental/metrics/hitrate.py +26 -0
- replay/experimental/metrics/map.py +30 -0
- replay/experimental/metrics/mrr.py +18 -0
- replay/experimental/metrics/ncis_precision.py +31 -0
- replay/experimental/metrics/ndcg.py +49 -0
- replay/experimental/metrics/precision.py +22 -0
- replay/experimental/metrics/recall.py +25 -0
- replay/experimental/metrics/rocauc.py +49 -0
- replay/experimental/metrics/surprisal.py +90 -0
- replay/experimental/metrics/unexpectedness.py +76 -0
- replay/experimental/models/__init__.py +50 -0
- replay/experimental/models/admm_slim.py +257 -0
- replay/experimental/models/base_neighbour_rec.py +200 -0
- replay/experimental/models/base_rec.py +1386 -0
- replay/experimental/models/base_torch_rec.py +234 -0
- replay/experimental/models/cql.py +454 -0
- replay/experimental/models/ddpg.py +932 -0
- replay/experimental/models/dt4rec/__init__.py +0 -0
- replay/experimental/models/dt4rec/dt4rec.py +189 -0
- replay/experimental/models/dt4rec/gpt1.py +401 -0
- replay/experimental/models/dt4rec/trainer.py +127 -0
- replay/experimental/models/dt4rec/utils.py +264 -0
- replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- replay/experimental/models/extensions/spark_custom_models/als_extension.py +792 -0
- replay/experimental/models/hierarchical_recommender.py +331 -0
- replay/experimental/models/implicit_wrap.py +131 -0
- replay/experimental/models/lightfm_wrap.py +303 -0
- replay/experimental/models/mult_vae.py +332 -0
- replay/experimental/models/neural_ts.py +986 -0
- replay/experimental/models/neuromf.py +406 -0
- replay/experimental/models/scala_als.py +293 -0
- replay/experimental/models/u_lin_ucb.py +115 -0
- replay/experimental/nn/data/__init__.py +1 -0
- replay/experimental/nn/data/schema_builder.py +102 -0
- replay/experimental/preprocessing/__init__.py +3 -0
- replay/experimental/preprocessing/data_preparator.py +839 -0
- replay/experimental/preprocessing/padder.py +229 -0
- replay/experimental/preprocessing/sequence_generator.py +208 -0
- replay/experimental/scenarios/__init__.py +1 -0
- replay/experimental/scenarios/obp_wrapper/__init__.py +8 -0
- replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +74 -0
- replay/experimental/scenarios/obp_wrapper/replay_offline.py +261 -0
- replay/experimental/scenarios/obp_wrapper/utils.py +85 -0
- replay/experimental/scenarios/two_stages/__init__.py +0 -0
- replay/experimental/scenarios/two_stages/reranker.py +117 -0
- replay/experimental/scenarios/two_stages/two_stages_scenario.py +757 -0
- replay/experimental/utils/__init__.py +0 -0
- replay/experimental/utils/logger.py +24 -0
- replay/experimental/utils/model_handler.py +186 -0
- replay/experimental/utils/session_handler.py +44 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/METADATA +11 -17
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/RECORD +62 -7
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/WHEEL +0 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/LICENSE +0 -0
- {replay_rec-0.20.2.dist-info → replay_rec-0.20.3rc0.dist-info}/licenses/NOTICE +0 -0
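
This pre-release adds the `replay.experimental` subpackage back into the wheel (metrics, models, preprocessing, the nn data schema builder, OBP and two-stage scenarios, and utils). As a quick orientation, a minimal import sketch; it is illustrative only, assumes the optional dependencies these modules rely on (such as PySpark and LightAutoML) are installed, and uses paths taken from the file list above:

```python
# Illustrative sketch only; not part of the diff below.
from replay.experimental.models import ScalaALSWrap
from replay.experimental.preprocessing.data_preparator import ToNumericFeatureTransformer
from replay.experimental.scenarios.two_stages.two_stages_scenario import TwoStagesScenario
```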

replay/experimental/scenarios/two_stages/two_stages_scenario.py (new file, +757 lines)

```diff
@@ -0,0 +1,757 @@
+from collections.abc import Iterable
+from typing import Any, Optional, Union
+
+from replay.experimental.models import ScalaALSWrap
+from replay.experimental.preprocessing.data_preparator import ToNumericFeatureTransformer
+from replay.experimental.scenarios.two_stages.reranker import LamaWrap
+from replay.metrics import Metric, Precision
+from replay.models import PopRec, RandomRec
+from replay.models.base_rec import BaseRecommender, HybridRecommender
+from replay.preprocessing.history_based_fp import HistoryBasedFeaturesProcessor
+from replay.splitters import RatioSplitter, Splitter
+from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, SparkDataFrame
+from replay.utils.session_handler import State
+from replay.utils.spark_utils import (
+    array_mult,
+    cache_if_exists,
+    fallback,
+    get_log_info,
+    get_top_k_recs,
+    horizontal_explode,
+    join_or_return,
+    join_with_col_renaming,
+    unpersist_if_exists,
+)
+
+if PYSPARK_AVAILABLE:
+    import pyspark.sql.functions as sf
+
+
+def get_first_level_model_features(
+    model: SparkDataFrame,
+    pairs: SparkDataFrame,
+    user_features: Optional[SparkDataFrame] = None,
+    item_features: Optional[SparkDataFrame] = None,
+    add_factors_mult: bool = True,
+    prefix: str = "",
+) -> SparkDataFrame:
+    """
+    Get user and item embeddings from replay model.
+    Can also compute elementwise multiplication between them with ``add_factors_mult`` parameter.
+    Zero vectors are returned if a model does not have embeddings for specific users/items.
+
+    :param model: trained model
+    :param pairs: user-item pairs to get vectors for `[user_id/user_idx, item_id/item_id]`
+    :param user_features: user features `[user_id/user_idx, feature_1, ....]`
+    :param item_features: item features `[item_id/item_idx, feature_1, ....]`
+    :param add_factors_mult: flag to add elementwise multiplication
+    :param prefix: name to add to the columns
+    :return: DataFrame
+    """
+    users = pairs.select("user_idx").distinct()
+    items = pairs.select("item_idx").distinct()
+    user_factors, user_vector_len = model._get_features_wrap(users, user_features)
+    item_factors, item_vector_len = model._get_features_wrap(items, item_features)
+
+    pairs_with_features = join_or_return(pairs, user_factors, how="left", on="user_idx")
+    pairs_with_features = join_or_return(
+        pairs_with_features,
+        item_factors,
+        how="left",
+        on="item_idx",
+    )
+
+    factors_to_explode = []
+    if user_factors is not None:
+        pairs_with_features = pairs_with_features.withColumn(
+            "user_factors",
+            sf.coalesce(
+                sf.col("user_factors"),
+                sf.array([sf.lit(0.0)] * user_vector_len),
+            ),
+        )
+        factors_to_explode.append(("user_factors", "uf"))
+
+    if item_factors is not None:
+        pairs_with_features = pairs_with_features.withColumn(
+            "item_factors",
+            sf.coalesce(
+                sf.col("item_factors"),
+                sf.array([sf.lit(0.0)] * item_vector_len),
+            ),
+        )
+        factors_to_explode.append(("item_factors", "if"))
+
+    if model.__str__() == "LightFMWrap":
+        pairs_with_features = (
+            pairs_with_features.fillna({"user_bias": 0, "item_bias": 0})
+            .withColumnRenamed("user_bias", f"{prefix}_user_bias")
+            .withColumnRenamed("item_bias", f"{prefix}_item_bias")
+        )
+
+    if add_factors_mult and user_factors is not None and item_factors is not None:
+        pairs_with_features = pairs_with_features.withColumn(
+            "factors_mult",
+            array_mult(sf.col("item_factors"), sf.col("user_factors")),
+        )
+        factors_to_explode.append(("factors_mult", "fm"))
+
+    for col_name, feature_prefix in factors_to_explode:
+        col_set = set(pairs_with_features.columns)
+        col_set.remove(col_name)
+        pairs_with_features = horizontal_explode(
+            data_frame=pairs_with_features,
+            column_to_explode=col_name,
+            other_columns=[sf.col(column) for column in sorted(col_set)],
+            prefix=f"{prefix}_{feature_prefix}",
+        )
+
+    return pairs_with_features
+
+
+class TwoStagesScenario(HybridRecommender):
+    """
+    *train*:
+
+    1) take input ``log`` and split it into first_level_train and second_level_train
+       default splitter splits each user's data 50/50
+    2) train ``first_stage_models`` on ``first_stage_train``
+    3) create negative examples to train second stage model using one of:
+
+       - wrong recommendations from first stage
+       - random examples
+
+       use ``num_negatives`` to specify number of negatives per user
+    4) augments dataset with features:
+
+       - get 1 level recommendations for positive examples
+         from second_level_train and for generated negative examples
+       - add user and item features
+       - generate statistical and pair features
+
+    5) train ``TabularAutoML`` from LightAutoML
+
+    *inference*:
+
+    1) take ``log``
+    2) generate candidates, their number can be specified with ``num_candidates``
+    3) add features as in train
+    4) get recommendations
+
+    """
+
+    can_predict_cold_users: bool = True
+    can_predict_cold_items: bool = True
+
+    def __init__(
+        self,
+        train_splitter: Splitter = RatioSplitter(test_size=0.5),
+        first_level_models: Union[list[BaseRecommender], BaseRecommender] = ScalaALSWrap(rank=128),
+        fallback_model: Optional[BaseRecommender] = PopRec(),
+        use_first_level_models_feat: Union[list[bool], bool] = False,
+        second_model_params: Optional[Union[dict, str]] = None,
+        second_model_config_path: Optional[str] = None,
+        num_negatives: int = 100,
+        negatives_type: str = "first_level",
+        use_generated_features: bool = False,
+        user_cat_features_list: Optional[list] = None,
+        item_cat_features_list: Optional[list] = None,
+        custom_features_processor: HistoryBasedFeaturesProcessor = None,
+        seed: int = 123,
+    ) -> None:
+        """
+        :param train_splitter: splitter to get ``first_level_train`` and ``second_level_train``.
+            Default is random 50% split.
+        :param first_level_models: model or a list of models
+        :param fallback_model: model used to fill missing recommendations at first level models
+        :param use_first_level_models_feat: flag or a list of flags to use
+            features created by first level models
+        :param second_model_params: TabularAutoML parameters
+        :param second_model_config_path: path to config file for TabularAutoML
+        :param num_negatives: number of negative examples used during train
+        :param negatives_type: negative examples creation strategy,``random``
+            or most relevant examples from ``first-level``
+        :param use_generated_features: flag to use generated features to train second level
+        :param user_cat_features_list: list of user categorical features
+        :param item_cat_features_list: list of item categorical features
+        :param custom_features_processor: you can pass custom feature processor
+        :param seed: random seed
+
+        """
+        self.train_splitter = train_splitter
+        self.cached_list = []
+
+        self.first_level_models = (
+            first_level_models if isinstance(first_level_models, Iterable) else [first_level_models]
+        )
+
+        self.first_level_item_len = 0
+        self.first_level_user_len = 0
+
+        self.random_model = RandomRec(seed=seed)
+        self.fallback_model = fallback_model
+        self.first_level_user_features_transformer = ToNumericFeatureTransformer()
+        self.first_level_item_features_transformer = ToNumericFeatureTransformer()
+
+        if isinstance(use_first_level_models_feat, bool):
+            self.use_first_level_models_feat = [use_first_level_models_feat] * len(self.first_level_models)
+        else:
+            if len(self.first_level_models) != len(use_first_level_models_feat):
+                msg = (
+                    f"For each model from first_level_models specify "
+                    f"flag to use first level features."
+                    f"Length of first_level_models is {len(first_level_models)}, "
+                    f"Length of use_first_level_models_feat is {len(use_first_level_models_feat)}"
+                )
+                raise ValueError(msg)
+
+            self.use_first_level_models_feat = use_first_level_models_feat
+
+        self.second_stage_model = LamaWrap(params=second_model_params, config_path=second_model_config_path)
+
+        self.num_negatives = num_negatives
+        if negatives_type not in ["random", "first_level"]:
+            msg = f"Invalid negatives_type value: {negatives_type}. Use 'random' or 'first_level'"
+            raise ValueError(msg)
+        self.negatives_type = negatives_type
+
+        self.use_generated_features = use_generated_features
+        self.features_processor = (
+            custom_features_processor
+            if custom_features_processor
+            else HistoryBasedFeaturesProcessor(
+                user_cat_features_list=user_cat_features_list,
+                item_cat_features_list=item_cat_features_list,
+            )
+        )
+        self.seed = seed
+
+    # TO DO: add save/load for scenarios
+    @property
+    def _init_args(self):
+        return {}
+
+    def _add_features_for_second_level(
+        self,
+        log_to_add_features: SparkDataFrame,
+        log_for_first_level_models: SparkDataFrame,
+        user_features: SparkDataFrame,
+        item_features: SparkDataFrame,
+    ) -> SparkDataFrame:
+        """
+        Added features are:
+        - relevance from first level models
+        - user and item features from first level models
+        - dataset features
+        - FeatureProcessor features
+
+        :param log_to_add_features: input DataFrame``[user_idx, item_idx, timestamp, relevance]``
+        :param log_for_first_level_models: DataFrame``[user_idx, item_idx, timestamp, relevance]``
+        :param user_features: user features``[user_idx]`` + feature columns
+        :param item_features: item features``[item_idx]`` + feature columns
+        :return: DataFrame
+        """
+        self.logger.info("Generating features")
+        full_second_level_train = log_to_add_features
+        first_level_item_features_cached = cache_if_exists(
+            self.first_level_item_features_transformer.transform(item_features)
+        )
+        first_level_user_features_cached = cache_if_exists(
+            self.first_level_user_features_transformer.transform(user_features)
+        )
+
+        pairs = log_to_add_features.select("user_idx", "item_idx")
+        for idx, model in enumerate(self.first_level_models):
+            current_pred = self._predict_pairs_with_first_level_model(
+                model=model,
+                log=log_for_first_level_models,
+                pairs=pairs,
+                user_features=first_level_user_features_cached,
+                item_features=first_level_item_features_cached,
+            ).withColumnRenamed("relevance", f"rel_{idx}_{model}")
+            full_second_level_train = full_second_level_train.join(
+                sf.broadcast(current_pred),
+                on=["user_idx", "item_idx"],
+                how="left",
+            )
+
+            if self.use_first_level_models_feat[idx]:
+                features = get_first_level_model_features(
+                    model=model,
+                    pairs=full_second_level_train.select("user_idx", "item_idx"),
+                    user_features=first_level_user_features_cached,
+                    item_features=first_level_item_features_cached,
+                    prefix=f"m_{idx}",
+                )
+                full_second_level_train = join_with_col_renaming(
+                    left=full_second_level_train,
+                    right=features,
+                    on_col_name=["user_idx", "item_idx"],
+                    how="left",
+                )
+
+        unpersist_if_exists(first_level_user_features_cached)
+        unpersist_if_exists(first_level_item_features_cached)
+
+        full_second_level_train_cached = full_second_level_train.fillna(0).cache()
+
+        self.logger.info("Adding features from the dataset")
+        full_second_level_train = join_or_return(
+            full_second_level_train_cached,
+            user_features,
+            on="user_idx",
+            how="left",
+        )
+        full_second_level_train = join_or_return(
+            full_second_level_train,
+            item_features,
+            on="item_idx",
+            how="left",
+        )
+
+        if self.use_generated_features:
+            if not self.features_processor.fitted:
+                self.features_processor.fit(
+                    log=log_for_first_level_models,
+                    user_features=user_features,
+                    item_features=item_features,
+                )
+            self.logger.info("Adding generated features")
+            full_second_level_train = self.features_processor.transform(log=full_second_level_train)
+
+        self.logger.info(
+            "Columns at second level: %s",
+            " ".join(full_second_level_train.columns),
+        )
+        full_second_level_train_cached.unpersist()
+        return full_second_level_train
+
+    def _split_data(self, log: SparkDataFrame) -> tuple[SparkDataFrame, SparkDataFrame]:
+        """Write statistics"""
+        first_level_train, second_level_train = self.train_splitter.split(log)
+        State().logger.debug("Log info: %s", get_log_info(log))
+        State().logger.debug("first_level_train info: %s", get_log_info(first_level_train))
+        State().logger.debug("second_level_train info: %s", get_log_info(second_level_train))
+        return first_level_train, second_level_train
+
+    @staticmethod
+    def _filter_or_return(dataframe, condition):
+        if dataframe is None:
+            return dataframe
+        return dataframe.filter(condition)
+
+    def _predict_with_first_level_model(
+        self,
+        model: BaseRecommender,
+        log: SparkDataFrame,
+        k: int,
+        users: SparkDataFrame,
+        items: SparkDataFrame,
+        user_features: SparkDataFrame,
+        item_features: SparkDataFrame,
+        log_to_filter: SparkDataFrame,
+    ):
+        """
+        Filter users and items using can_predict_cold_items and can_predict_cold_users, and predict
+        """
+        if not model.can_predict_cold_items:
+            log, items, item_features = (
+                self._filter_or_return(
+                    dataframe=df,
+                    condition=sf.col("item_idx") < self.first_level_item_len,
+                )
+                for df in [log, items, item_features]
+            )
+        if not model.can_predict_cold_users:
+            log, users, user_features = (
+                self._filter_or_return(
+                    dataframe=df,
+                    condition=sf.col("user_idx") < self.first_level_user_len,
+                )
+                for df in [log, users, user_features]
+            )
+
+        log_to_filter_cached = join_with_col_renaming(
+            left=log_to_filter,
+            right=users,
+            on_col_name="user_idx",
+        ).cache()
+        max_positives_to_filter = 0
+
+        if log_to_filter_cached.count() > 0:
+            max_positives_to_filter = (
+                log_to_filter_cached.groupBy("user_idx")
+                .agg(sf.count("item_idx").alias("num_positives"))
+                .select(sf.max("num_positives"))
+                .first()[0]
+            )
+
+        pred = model._predict(
+            log,
+            k=k + max_positives_to_filter,
+            users=users,
+            items=items,
+            user_features=user_features,
+            item_features=item_features,
+            filter_seen_items=False,
+        )
+
+        pred = pred.join(
+            log_to_filter_cached.select("user_idx", "item_idx"),
+            on=["user_idx", "item_idx"],
+            how="anti",
+        ).drop("user", "item")
+
+        log_to_filter_cached.unpersist()
+
+        return get_top_k_recs(pred, k)
+
+    def _predict_pairs_with_first_level_model(
+        self,
+        model: BaseRecommender,
+        log: SparkDataFrame,
+        pairs: SparkDataFrame,
+        user_features: SparkDataFrame,
+        item_features: SparkDataFrame,
+    ):
+        """
+        Get relevance for selected user-item pairs.
+        """
+        if not model.can_predict_cold_items:
+            log, pairs, item_features = (
+                self._filter_or_return(
+                    dataframe=df,
+                    condition=sf.col("item_idx") < self.first_level_item_len,
+                )
+                for df in [log, pairs, item_features]
+            )
+        if not model.can_predict_cold_users:
+            log, pairs, user_features = (
+                self._filter_or_return(
+                    dataframe=df,
+                    condition=sf.col("user_idx") < self.first_level_user_len,
+                )
+                for df in [log, pairs, user_features]
+            )
+
+        return model._predict_pairs(
+            pairs=pairs,
+            log=log,
+            user_features=user_features,
+            item_features=item_features,
+        )
+
+    def _get_first_level_candidates(
+        self,
+        model: BaseRecommender,
+        log: SparkDataFrame,
+        k: int,
+        users: SparkDataFrame,
+        items: SparkDataFrame,
+        user_features: SparkDataFrame,
+        item_features: SparkDataFrame,
+        log_to_filter: SparkDataFrame,
+    ) -> SparkDataFrame:
+        """
+        Combining the base model predictions with the fallback model
+        predictions.
+        """
+        passed_arguments = locals()
+        passed_arguments.pop("self")
+        candidates = self._predict_with_first_level_model(**passed_arguments)
+
+        if self.fallback_model is not None:
+            passed_arguments.pop("model")
+            fallback_candidates = self._predict_with_first_level_model(model=self.fallback_model, **passed_arguments)
+
+            candidates = fallback(
+                base=candidates,
+                fill=fallback_candidates,
+                k=self.num_negatives,
+            )
+        return candidates
+
+    def _fit(
+        self,
+        log: SparkDataFrame,
+        user_features: Optional[SparkDataFrame] = None,
+        item_features: Optional[SparkDataFrame] = None,
+    ) -> None:
+        self.cached_list = []
+
+        self.logger.info("Data split")
+        first_level_train, second_level_positive = self._split_data(log)
+
+        self.first_level_item_len = first_level_train.select("item_idx").distinct().count()
+        self.first_level_user_len = first_level_train.select("user_idx").distinct().count()
+
+        log.cache()
+        first_level_train.cache()
+        second_level_positive.cache()
+        self.cached_list.extend([log, first_level_train, second_level_positive])
+
+        if user_features is not None:
+            user_features.cache()
+            self.cached_list.append(user_features)
+
+        if item_features is not None:
+            item_features.cache()
+            self.cached_list.append(item_features)
+
+        self.first_level_item_features_transformer.fit(item_features)
+        self.first_level_user_features_transformer.fit(user_features)
+
+        first_level_item_features = cache_if_exists(self.first_level_item_features_transformer.transform(item_features))
+        first_level_user_features = cache_if_exists(self.first_level_user_features_transformer.transform(user_features))
+
+        for base_model in [
+            *self.first_level_models,
+            self.random_model,
+            self.fallback_model,
+        ]:
+            base_model._fit_wrap(
+                log=first_level_train,
+                user_features=first_level_user_features.filter(sf.col("user_idx") < self.first_level_user_len),
+                item_features=first_level_item_features.filter(sf.col("item_idx") < self.first_level_item_len),
+            )
+
+        self.logger.info("Generate negative examples")
+        negatives_source = self.first_level_models[0] if self.negatives_type == "first_level" else self.random_model
+
+        first_level_candidates = self._get_first_level_candidates(
+            model=negatives_source,
+            log=first_level_train,
+            k=self.num_negatives,
+            users=log.select("user_idx").distinct(),
+            items=log.select("item_idx").distinct(),
+            user_features=first_level_user_features,
+            item_features=first_level_item_features,
+            log_to_filter=first_level_train,
+        ).select("user_idx", "item_idx")
+
+        unpersist_if_exists(first_level_user_features)
+        unpersist_if_exists(first_level_item_features)
+
+        self.logger.info("Crate train dataset for second level")
+
+        second_level_train = (
+            first_level_candidates.join(
+                second_level_positive.select("user_idx", "item_idx").withColumn("target", sf.lit(1.0)),
+                on=["user_idx", "item_idx"],
+                how="left",
+            ).fillna(0.0, subset="target")
+        ).cache()
+
+        self.cached_list.append(second_level_train)
+
+        self.logger.info(
+            "Distribution of classes in second-level train dataset:/n %s",
+            (second_level_train.groupBy("target").agg(sf.count(sf.col("target")).alias("count_for_class")).take(2)),
+        )
+
+        self.features_processor.fit(
+            log=first_level_train,
+            user_features=user_features,
+            item_features=item_features,
+        )
+
+        self.logger.info("Adding features to second-level train dataset")
+        second_level_train_to_convert = self._add_features_for_second_level(
+            log_to_add_features=second_level_train,
+            log_for_first_level_models=first_level_train,
+            user_features=user_features,
+            item_features=item_features,
+        ).cache()
+
+        self.cached_list.append(second_level_train_to_convert)
+        self.second_stage_model.fit(second_level_train_to_convert)
+        for dataframe in self.cached_list:
+            unpersist_if_exists(dataframe)
+
+    def _predict(
+        self,
+        log: SparkDataFrame,
+        k: int,
+        users: SparkDataFrame,
+        items: SparkDataFrame,
+        user_features: Optional[SparkDataFrame] = None,
+        item_features: Optional[SparkDataFrame] = None,
+        filter_seen_items: bool = True,  # noqa: ARG002
+    ) -> SparkDataFrame:
+        State().logger.debug(msg="Generating candidates to rerank")
+
+        first_level_user_features = cache_if_exists(self.first_level_user_features_transformer.transform(user_features))
+        first_level_item_features = cache_if_exists(self.first_level_item_features_transformer.transform(item_features))
+
+        candidates = self._get_first_level_candidates(
+            model=self.first_level_models[0],
+            log=log,
+            k=self.num_negatives,
+            users=users,
+            items=items,
+            user_features=first_level_user_features,
+            item_features=first_level_item_features,
+            log_to_filter=log,
+        ).select("user_idx", "item_idx")
+
+        candidates_cached = candidates.cache()
+        unpersist_if_exists(first_level_user_features)
+        unpersist_if_exists(first_level_item_features)
+        self.logger.info("Adding features")
+        candidates_features = self._add_features_for_second_level(
+            log_to_add_features=candidates_cached,
+            log_for_first_level_models=log,
+            user_features=user_features,
+            item_features=item_features,
+        )
+        candidates_features.cache()
+        candidates_cached.unpersist()
+        self.logger.info(
+            "Generated %s candidates for %s users",
+            candidates_features.count(),
+            candidates_features.select("user_idx").distinct().count(),
+        )
+        return self.second_stage_model.predict(data=candidates_features, k=k)
+
+    def fit_predict(
+        self,
+        log: DataFrameLike,
+        k: int,
+        users: Optional[Union[DataFrameLike, Iterable]] = None,
+        items: Optional[Union[DataFrameLike, Iterable]] = None,
+        user_features: Optional[DataFrameLike] = None,
+        item_features: Optional[DataFrameLike] = None,
+        filter_seen_items: bool = True,
+    ) -> SparkDataFrame:
+        """
+        :param log: input DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param k: length of a recommendation list, must be smaller than the number of ``items``
+        :param users: users to get recommendations for
+        :param items: items to get recommendations for
+        :param user_features: user features``[user_id]`` + feature columns
+        :param item_features: item features``[item_id]`` + feature columns
+        :param filter_seen_items: flag to removed seen items from recommendations
+        :return: DataFrame ``[user_id, item_id, relevance]``
+        """
+        self.fit(log, user_features, item_features)
+        return self.predict(
+            log,
+            k,
+            users,
+            items,
+            user_features,
+            item_features,
+            filter_seen_items,
+        )
+
+    @staticmethod
+    def _optimize_one_model(
+        model: BaseRecommender,
+        train: DataFrameLike,
+        test: DataFrameLike,
+        user_features: Optional[DataFrameLike] = None,
+        item_features: Optional[DataFrameLike] = None,
+        param_borders: Optional[dict[str, list[Any]]] = None,
+        criterion: Metric = Precision,
+        k: int = 10,
+        budget: int = 10,
+        new_study: bool = True,
+    ):
+        params = model.optimize(
+            train,
+            test,
+            user_features,
+            item_features,
+            param_borders,
+            criterion,
+            k,
+            budget,
+            new_study,
+        )
+        return params
+
+    def optimize(
+        self,
+        train: DataFrameLike,
+        test: DataFrameLike,
+        user_features: Optional[DataFrameLike] = None,
+        item_features: Optional[DataFrameLike] = None,
+        param_borders: Optional[list[dict[str, list[Any]]]] = None,
+        criterion: Metric = Precision,
+        k: int = 10,
+        budget: int = 10,
+        new_study: bool = True,
+    ) -> tuple[list[dict[str, Any]], Optional[dict[str, Any]]]:
+        """
+        Optimize first level models with optuna.
+
+        :param train: train DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param test: test DataFrame ``[user_id, item_id, timestamp, relevance]``
+        :param user_features: user features ``[user_id , timestamp]`` + feature columns
+        :param item_features: item features``[item_id]`` + feature columns
+        :param param_borders: list with param grids for first level models and a fallback model.
+            Empty dict skips optimization for that model.
+            Param grid is a dict ``{param: [low, high]}``.
+        :param criterion: metric to optimize
+        :param k: length of a recommendation list
+        :param budget: number of points to train each model
+        :param new_study: keep searching with previous study or start a new study
+        :return: list of dicts of parameters
+        """
+        number_of_models = len(self.first_level_models)
+        if self.fallback_model is not None:
+            number_of_models += 1
+        if number_of_models != len(param_borders):
+            msg = "Provide search grid or None for every first level model"
+            raise ValueError(msg)
+
+        first_level_user_features_tr = ToNumericFeatureTransformer()
+        first_level_user_features = first_level_user_features_tr.fit_transform(user_features)
+        first_level_item_features_tr = ToNumericFeatureTransformer()
+        first_level_item_features = first_level_item_features_tr.fit_transform(item_features)
+
+        first_level_user_features = cache_if_exists(first_level_user_features)
+        first_level_item_features = cache_if_exists(first_level_item_features)
+
+        params_found = []
+        for i, model in enumerate(self.first_level_models):
+            if param_borders[i] is None or (isinstance(param_borders[i], dict) and param_borders[i]):
+                self.logger.info(
+                    "Optimizing first level model number %s, %s",
+                    i,
+                    model.__str__(),
+                )
+                params_found.append(
+                    self._optimize_one_model(
+                        model=model,
+                        train=train,
+                        test=test,
+                        user_features=first_level_user_features,
+                        item_features=first_level_item_features,
+                        param_borders=param_borders[i],
+                        criterion=criterion,
+                        k=k,
+                        budget=budget,
+                        new_study=new_study,
+                    )
+                )
+            else:
+                params_found.append(None)
+
+        if self.fallback_model is None or (isinstance(param_borders[-1], dict) and not param_borders[-1]):
+            return params_found, None
+
+        self.logger.info("Optimizing fallback-model")
+        fallback_params = self._optimize_one_model(
+            model=self.fallback_model,
+            train=train,
+            test=test,
+            user_features=first_level_user_features,
+            item_features=first_level_item_features,
+            param_borders=param_borders[-1],
+            criterion=criterion,
+            new_study=new_study,
+        )
+        unpersist_if_exists(first_level_item_features)
+        unpersist_if_exists(first_level_user_features)
+        return params_found, fallback_params
```
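
Below is a minimal usage sketch of the restored `TwoStagesScenario`, following the train/inference workflow described in its class docstring. It is illustrative only and not part of the diff; the input DataFrames, the held-out split, and the `rank` search range are assumptions based on the defaults and docstrings in the file above.

```python
from replay.experimental.scenarios.two_stages.two_stages_scenario import TwoStagesScenario
from replay.splitters import RatioSplitter

# `log` is assumed to be a Spark DataFrame with the interaction columns the
# scenario expects (user/item indices, timestamp, relevance), already prepared.
scenario = TwoStagesScenario(
    train_splitter=RatioSplitter(test_size=0.5),  # 50/50 split into first/second level train
    num_negatives=100,                            # negatives per user for the second-level train set
    negatives_type="first_level",                 # or "random"
)

# Fit both stages and return top-10 recommendations per user.
recs = scenario.fit_predict(log, k=10)

# Optional: tune the first-level and fallback models with optuna.
# `param_borders` holds one grid per first-level model plus one for the fallback
# model; an empty dict skips optimization for that model.
params, fallback_params = scenario.optimize(
    train=train_log,   # hypothetical held-out split of `log`
    test=test_log,
    param_borders=[{"rank": [8, 128]}, {}],
)
```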