replay-rec 0.20.1rc0-py3-none-any.whl → 0.20.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. replay/__init__.py +1 -1
  2. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/METADATA +18 -12
  3. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/RECORD +6 -61
  4. replay/experimental/__init__.py +0 -0
  5. replay/experimental/metrics/__init__.py +0 -62
  6. replay/experimental/metrics/base_metric.py +0 -603
  7. replay/experimental/metrics/coverage.py +0 -97
  8. replay/experimental/metrics/experiment.py +0 -175
  9. replay/experimental/metrics/hitrate.py +0 -26
  10. replay/experimental/metrics/map.py +0 -30
  11. replay/experimental/metrics/mrr.py +0 -18
  12. replay/experimental/metrics/ncis_precision.py +0 -31
  13. replay/experimental/metrics/ndcg.py +0 -49
  14. replay/experimental/metrics/precision.py +0 -22
  15. replay/experimental/metrics/recall.py +0 -25
  16. replay/experimental/metrics/rocauc.py +0 -49
  17. replay/experimental/metrics/surprisal.py +0 -90
  18. replay/experimental/metrics/unexpectedness.py +0 -76
  19. replay/experimental/models/__init__.py +0 -50
  20. replay/experimental/models/admm_slim.py +0 -257
  21. replay/experimental/models/base_neighbour_rec.py +0 -200
  22. replay/experimental/models/base_rec.py +0 -1386
  23. replay/experimental/models/base_torch_rec.py +0 -234
  24. replay/experimental/models/cql.py +0 -454
  25. replay/experimental/models/ddpg.py +0 -932
  26. replay/experimental/models/dt4rec/__init__.py +0 -0
  27. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  28. replay/experimental/models/dt4rec/gpt1.py +0 -401
  29. replay/experimental/models/dt4rec/trainer.py +0 -127
  30. replay/experimental/models/dt4rec/utils.py +0 -264
  31. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  32. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  33. replay/experimental/models/hierarchical_recommender.py +0 -331
  34. replay/experimental/models/implicit_wrap.py +0 -131
  35. replay/experimental/models/lightfm_wrap.py +0 -303
  36. replay/experimental/models/mult_vae.py +0 -332
  37. replay/experimental/models/neural_ts.py +0 -986
  38. replay/experimental/models/neuromf.py +0 -406
  39. replay/experimental/models/scala_als.py +0 -293
  40. replay/experimental/models/u_lin_ucb.py +0 -115
  41. replay/experimental/nn/data/__init__.py +0 -1
  42. replay/experimental/nn/data/schema_builder.py +0 -102
  43. replay/experimental/preprocessing/__init__.py +0 -3
  44. replay/experimental/preprocessing/data_preparator.py +0 -839
  45. replay/experimental/preprocessing/padder.py +0 -229
  46. replay/experimental/preprocessing/sequence_generator.py +0 -208
  47. replay/experimental/scenarios/__init__.py +0 -1
  48. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  49. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  50. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -261
  51. replay/experimental/scenarios/obp_wrapper/utils.py +0 -85
  52. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  53. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  54. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  55. replay/experimental/utils/__init__.py +0 -0
  56. replay/experimental/utils/logger.py +0 -24
  57. replay/experimental/utils/model_handler.py +0 -186
  58. replay/experimental/utils/session_handler.py +0 -44
  59. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/WHEEL +0 -0
  60. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/licenses/LICENSE +0 -0
  61. {replay_rec-0.20.1rc0.dist-info → replay_rec-0.20.2.dist-info}/licenses/NOTICE +0 -0
@@ -1,757 +0,0 @@
-from collections.abc import Iterable
-from typing import Any, Optional, Union
-
-from replay.experimental.models import ScalaALSWrap
-from replay.experimental.preprocessing.data_preparator import ToNumericFeatureTransformer
-from replay.experimental.scenarios.two_stages.reranker import LamaWrap
-from replay.metrics import Metric, Precision
-from replay.models import PopRec, RandomRec
-from replay.models.base_rec import BaseRecommender, HybridRecommender
-from replay.preprocessing.history_based_fp import HistoryBasedFeaturesProcessor
-from replay.splitters import RatioSplitter, Splitter
-from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, SparkDataFrame
-from replay.utils.session_handler import State
-from replay.utils.spark_utils import (
-    array_mult,
-    cache_if_exists,
-    fallback,
-    get_log_info,
-    get_top_k_recs,
-    horizontal_explode,
-    join_or_return,
-    join_with_col_renaming,
-    unpersist_if_exists,
-)
-
-if PYSPARK_AVAILABLE:
-    import pyspark.sql.functions as sf
-
-
-def get_first_level_model_features(
-    model: SparkDataFrame,
-    pairs: SparkDataFrame,
-    user_features: Optional[SparkDataFrame] = None,
-    item_features: Optional[SparkDataFrame] = None,
-    add_factors_mult: bool = True,
-    prefix: str = "",
-) -> SparkDataFrame:
-    """
-    Get user and item embeddings from replay model.
-    Can also compute elementwise multiplication between them with ``add_factors_mult`` parameter.
-    Zero vectors are returned if a model does not have embeddings for specific users/items.
-
-    :param model: trained model
-    :param pairs: user-item pairs to get vectors for `[user_id/user_idx, item_id/item_idx]`
-    :param user_features: user features `[user_id/user_idx, feature_1, ....]`
-    :param item_features: item features `[item_id/item_idx, feature_1, ....]`
-    :param add_factors_mult: flag to add elementwise multiplication
-    :param prefix: name to add to the columns
-    :return: DataFrame
-    """
-    users = pairs.select("user_idx").distinct()
-    items = pairs.select("item_idx").distinct()
-    user_factors, user_vector_len = model._get_features_wrap(users, user_features)
-    item_factors, item_vector_len = model._get_features_wrap(items, item_features)
-
-    pairs_with_features = join_or_return(pairs, user_factors, how="left", on="user_idx")
-    pairs_with_features = join_or_return(
-        pairs_with_features,
-        item_factors,
-        how="left",
-        on="item_idx",
-    )
-
-    factors_to_explode = []
-    if user_factors is not None:
-        pairs_with_features = pairs_with_features.withColumn(
-            "user_factors",
-            sf.coalesce(
-                sf.col("user_factors"),
-                sf.array([sf.lit(0.0)] * user_vector_len),
-            ),
-        )
-        factors_to_explode.append(("user_factors", "uf"))
-
-    if item_factors is not None:
-        pairs_with_features = pairs_with_features.withColumn(
-            "item_factors",
-            sf.coalesce(
-                sf.col("item_factors"),
-                sf.array([sf.lit(0.0)] * item_vector_len),
-            ),
-        )
-        factors_to_explode.append(("item_factors", "if"))
-
-    if model.__str__() == "LightFMWrap":
-        pairs_with_features = (
-            pairs_with_features.fillna({"user_bias": 0, "item_bias": 0})
-            .withColumnRenamed("user_bias", f"{prefix}_user_bias")
-            .withColumnRenamed("item_bias", f"{prefix}_item_bias")
-        )
-
-    if add_factors_mult and user_factors is not None and item_factors is not None:
-        pairs_with_features = pairs_with_features.withColumn(
-            "factors_mult",
-            array_mult(sf.col("item_factors"), sf.col("user_factors")),
-        )
-        factors_to_explode.append(("factors_mult", "fm"))
-
-    for col_name, feature_prefix in factors_to_explode:
-        col_set = set(pairs_with_features.columns)
-        col_set.remove(col_name)
-        pairs_with_features = horizontal_explode(
-            data_frame=pairs_with_features,
-            column_to_explode=col_name,
-            other_columns=[sf.col(column) for column in sorted(col_set)],
-            prefix=f"{prefix}_{feature_prefix}",
-        )
-
-    return pairs_with_features
-
-
-class TwoStagesScenario(HybridRecommender):
-    """
-    *train*:
-
-    1) take input ``log`` and split it into first_level_train and second_level_train
-       default splitter splits each user's data 50/50
-    2) train ``first_level_models`` on ``first_level_train``
-    3) create negative examples to train second stage model using one of:
-
-        - wrong recommendations from first stage
-        - random examples
-
-        use ``num_negatives`` to specify number of negatives per user
-    4) augment dataset with features:
-
-        - get first level recommendations for positive examples
-          from second_level_train and for generated negative examples
-        - add user and item features
-        - generate statistical and pair features
-
-    5) train ``TabularAutoML`` from LightAutoML
-
-    *inference*:
-
-    1) take ``log``
-    2) generate candidates, their number can be specified with ``num_negatives``
-    3) add features as in train
-    4) get recommendations
-
-    """
-
-    can_predict_cold_users: bool = True
-    can_predict_cold_items: bool = True
-
-    def __init__(
-        self,
-        train_splitter: Splitter = RatioSplitter(test_size=0.5),
-        first_level_models: Union[list[BaseRecommender], BaseRecommender] = ScalaALSWrap(rank=128),
-        fallback_model: Optional[BaseRecommender] = PopRec(),
-        use_first_level_models_feat: Union[list[bool], bool] = False,
-        second_model_params: Optional[Union[dict, str]] = None,
-        second_model_config_path: Optional[str] = None,
-        num_negatives: int = 100,
-        negatives_type: str = "first_level",
-        use_generated_features: bool = False,
-        user_cat_features_list: Optional[list] = None,
-        item_cat_features_list: Optional[list] = None,
-        custom_features_processor: HistoryBasedFeaturesProcessor = None,
-        seed: int = 123,
-    ) -> None:
-        """
-        :param train_splitter: splitter to get ``first_level_train`` and ``second_level_train``.
-            Default is random 50% split.
-        :param first_level_models: model or a list of models
-        :param fallback_model: model used to fill missing recommendations at first level models
-        :param use_first_level_models_feat: flag or a list of flags to use
-            features created by first level models
-        :param second_model_params: TabularAutoML parameters
-        :param second_model_config_path: path to config file for TabularAutoML
-        :param num_negatives: number of negative examples used during train
-        :param negatives_type: negative examples creation strategy, ``random``
-            or most relevant examples from ``first_level``
-        :param use_generated_features: flag to use generated features to train second level
-        :param user_cat_features_list: list of user categorical features
-        :param item_cat_features_list: list of item categorical features
-        :param custom_features_processor: you can pass custom feature processor
-        :param seed: random seed
-
-        """
-        self.train_splitter = train_splitter
-        self.cached_list = []
-
-        self.first_level_models = (
-            first_level_models if isinstance(first_level_models, Iterable) else [first_level_models]
-        )
-
-        self.first_level_item_len = 0
-        self.first_level_user_len = 0
-
-        self.random_model = RandomRec(seed=seed)
-        self.fallback_model = fallback_model
-        self.first_level_user_features_transformer = ToNumericFeatureTransformer()
-        self.first_level_item_features_transformer = ToNumericFeatureTransformer()
-
-        if isinstance(use_first_level_models_feat, bool):
-            self.use_first_level_models_feat = [use_first_level_models_feat] * len(self.first_level_models)
-        else:
-            if len(self.first_level_models) != len(use_first_level_models_feat):
-                msg = (
-                    f"For each model from first_level_models specify "
-                    f"flag to use first level features. "
-                    f"Length of first_level_models is {len(first_level_models)}, "
-                    f"Length of use_first_level_models_feat is {len(use_first_level_models_feat)}"
-                )
-                raise ValueError(msg)
-
-            self.use_first_level_models_feat = use_first_level_models_feat
-
-        self.second_stage_model = LamaWrap(params=second_model_params, config_path=second_model_config_path)
-
-        self.num_negatives = num_negatives
-        if negatives_type not in ["random", "first_level"]:
-            msg = f"Invalid negatives_type value: {negatives_type}. Use 'random' or 'first_level'"
-            raise ValueError(msg)
-        self.negatives_type = negatives_type
-
-        self.use_generated_features = use_generated_features
-        self.features_processor = (
-            custom_features_processor
-            if custom_features_processor
-            else HistoryBasedFeaturesProcessor(
-                user_cat_features_list=user_cat_features_list,
-                item_cat_features_list=item_cat_features_list,
-            )
-        )
-        self.seed = seed
-
-    # TO DO: add save/load for scenarios
-    @property
-    def _init_args(self):
-        return {}
-
-    def _add_features_for_second_level(
-        self,
-        log_to_add_features: SparkDataFrame,
-        log_for_first_level_models: SparkDataFrame,
-        user_features: SparkDataFrame,
-        item_features: SparkDataFrame,
-    ) -> SparkDataFrame:
-        """
-        Added features are:
-        - relevance from first level models
-        - user and item features from first level models
-        - dataset features
-        - FeatureProcessor features
-
-        :param log_to_add_features: input DataFrame``[user_idx, item_idx, timestamp, relevance]``
-        :param log_for_first_level_models: DataFrame``[user_idx, item_idx, timestamp, relevance]``
-        :param user_features: user features``[user_idx]`` + feature columns
-        :param item_features: item features``[item_idx]`` + feature columns
-        :return: DataFrame
-        """
-        self.logger.info("Generating features")
-        full_second_level_train = log_to_add_features
-        first_level_item_features_cached = cache_if_exists(
-            self.first_level_item_features_transformer.transform(item_features)
-        )
-        first_level_user_features_cached = cache_if_exists(
-            self.first_level_user_features_transformer.transform(user_features)
-        )
-
-        pairs = log_to_add_features.select("user_idx", "item_idx")
-        for idx, model in enumerate(self.first_level_models):
-            current_pred = self._predict_pairs_with_first_level_model(
-                model=model,
-                log=log_for_first_level_models,
-                pairs=pairs,
-                user_features=first_level_user_features_cached,
-                item_features=first_level_item_features_cached,
-            ).withColumnRenamed("relevance", f"rel_{idx}_{model}")
-            full_second_level_train = full_second_level_train.join(
-                sf.broadcast(current_pred),
-                on=["user_idx", "item_idx"],
-                how="left",
-            )
-
-            if self.use_first_level_models_feat[idx]:
-                features = get_first_level_model_features(
-                    model=model,
-                    pairs=full_second_level_train.select("user_idx", "item_idx"),
-                    user_features=first_level_user_features_cached,
-                    item_features=first_level_item_features_cached,
-                    prefix=f"m_{idx}",
-                )
-                full_second_level_train = join_with_col_renaming(
-                    left=full_second_level_train,
-                    right=features,
-                    on_col_name=["user_idx", "item_idx"],
-                    how="left",
-                )
-
-        unpersist_if_exists(first_level_user_features_cached)
-        unpersist_if_exists(first_level_item_features_cached)
-
-        full_second_level_train_cached = full_second_level_train.fillna(0).cache()
-
-        self.logger.info("Adding features from the dataset")
-        full_second_level_train = join_or_return(
-            full_second_level_train_cached,
-            user_features,
-            on="user_idx",
-            how="left",
-        )
-        full_second_level_train = join_or_return(
-            full_second_level_train,
-            item_features,
-            on="item_idx",
-            how="left",
-        )
-
-        if self.use_generated_features:
-            if not self.features_processor.fitted:
-                self.features_processor.fit(
-                    log=log_for_first_level_models,
-                    user_features=user_features,
-                    item_features=item_features,
-                )
-            self.logger.info("Adding generated features")
-            full_second_level_train = self.features_processor.transform(log=full_second_level_train)
-
-        self.logger.info(
-            "Columns at second level: %s",
-            " ".join(full_second_level_train.columns),
-        )
-        full_second_level_train_cached.unpersist()
-        return full_second_level_train
-
-    def _split_data(self, log: SparkDataFrame) -> tuple[SparkDataFrame, SparkDataFrame]:
-        """Write statistics"""
-        first_level_train, second_level_train = self.train_splitter.split(log)
-        State().logger.debug("Log info: %s", get_log_info(log))
-        State().logger.debug("first_level_train info: %s", get_log_info(first_level_train))
-        State().logger.debug("second_level_train info: %s", get_log_info(second_level_train))
-        return first_level_train, second_level_train
-
-    @staticmethod
-    def _filter_or_return(dataframe, condition):
-        if dataframe is None:
-            return dataframe
-        return dataframe.filter(condition)
-
-    def _predict_with_first_level_model(
-        self,
-        model: BaseRecommender,
-        log: SparkDataFrame,
-        k: int,
-        users: SparkDataFrame,
-        items: SparkDataFrame,
-        user_features: SparkDataFrame,
-        item_features: SparkDataFrame,
-        log_to_filter: SparkDataFrame,
-    ):
-        """
-        Filter users and items using can_predict_cold_items and can_predict_cold_users, and predict
-        """
-        if not model.can_predict_cold_items:
-            log, items, item_features = (
-                self._filter_or_return(
-                    dataframe=df,
-                    condition=sf.col("item_idx") < self.first_level_item_len,
-                )
-                for df in [log, items, item_features]
-            )
-        if not model.can_predict_cold_users:
-            log, users, user_features = (
-                self._filter_or_return(
-                    dataframe=df,
-                    condition=sf.col("user_idx") < self.first_level_user_len,
-                )
-                for df in [log, users, user_features]
-            )
-
-        log_to_filter_cached = join_with_col_renaming(
-            left=log_to_filter,
-            right=users,
-            on_col_name="user_idx",
-        ).cache()
-        max_positives_to_filter = 0
-
-        if log_to_filter_cached.count() > 0:
-            max_positives_to_filter = (
-                log_to_filter_cached.groupBy("user_idx")
-                .agg(sf.count("item_idx").alias("num_positives"))
-                .select(sf.max("num_positives"))
-                .first()[0]
-            )
-
-        pred = model._predict(
-            log,
-            k=k + max_positives_to_filter,
-            users=users,
-            items=items,
-            user_features=user_features,
-            item_features=item_features,
-            filter_seen_items=False,
-        )
-
-        pred = pred.join(
-            log_to_filter_cached.select("user_idx", "item_idx"),
-            on=["user_idx", "item_idx"],
-            how="anti",
-        ).drop("user", "item")
-
-        log_to_filter_cached.unpersist()
-
-        return get_top_k_recs(pred, k)
-
-    def _predict_pairs_with_first_level_model(
-        self,
-        model: BaseRecommender,
-        log: SparkDataFrame,
-        pairs: SparkDataFrame,
-        user_features: SparkDataFrame,
-        item_features: SparkDataFrame,
-    ):
-        """
-        Get relevance for selected user-item pairs.
-        """
-        if not model.can_predict_cold_items:
-            log, pairs, item_features = (
-                self._filter_or_return(
-                    dataframe=df,
-                    condition=sf.col("item_idx") < self.first_level_item_len,
-                )
-                for df in [log, pairs, item_features]
-            )
-        if not model.can_predict_cold_users:
-            log, pairs, user_features = (
-                self._filter_or_return(
-                    dataframe=df,
-                    condition=sf.col("user_idx") < self.first_level_user_len,
-                )
-                for df in [log, pairs, user_features]
-            )
-
-        return model._predict_pairs(
-            pairs=pairs,
-            log=log,
-            user_features=user_features,
-            item_features=item_features,
-        )
-
-    def _get_first_level_candidates(
-        self,
-        model: BaseRecommender,
-        log: SparkDataFrame,
-        k: int,
-        users: SparkDataFrame,
-        items: SparkDataFrame,
-        user_features: SparkDataFrame,
-        item_features: SparkDataFrame,
-        log_to_filter: SparkDataFrame,
-    ) -> SparkDataFrame:
-        """
-        Combining the base model predictions with the fallback model
-        predictions.
-        """
-        passed_arguments = locals()
-        passed_arguments.pop("self")
-        candidates = self._predict_with_first_level_model(**passed_arguments)
-
-        if self.fallback_model is not None:
-            passed_arguments.pop("model")
-            fallback_candidates = self._predict_with_first_level_model(model=self.fallback_model, **passed_arguments)
-
-            candidates = fallback(
-                base=candidates,
-                fill=fallback_candidates,
-                k=self.num_negatives,
-            )
-        return candidates
-
-    def _fit(
-        self,
-        log: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,
-        item_features: Optional[SparkDataFrame] = None,
-    ) -> None:
-        self.cached_list = []
-
-        self.logger.info("Data split")
-        first_level_train, second_level_positive = self._split_data(log)
-
-        self.first_level_item_len = first_level_train.select("item_idx").distinct().count()
-        self.first_level_user_len = first_level_train.select("user_idx").distinct().count()
-
-        log.cache()
-        first_level_train.cache()
-        second_level_positive.cache()
-        self.cached_list.extend([log, first_level_train, second_level_positive])
-
-        if user_features is not None:
-            user_features.cache()
-            self.cached_list.append(user_features)
-
-        if item_features is not None:
-            item_features.cache()
-            self.cached_list.append(item_features)
-
-        self.first_level_item_features_transformer.fit(item_features)
-        self.first_level_user_features_transformer.fit(user_features)
-
-        first_level_item_features = cache_if_exists(self.first_level_item_features_transformer.transform(item_features))
-        first_level_user_features = cache_if_exists(self.first_level_user_features_transformer.transform(user_features))
-
-        for base_model in [
-            *self.first_level_models,
-            self.random_model,
-            self.fallback_model,
-        ]:
-            base_model._fit_wrap(
-                log=first_level_train,
-                user_features=first_level_user_features.filter(sf.col("user_idx") < self.first_level_user_len),
-                item_features=first_level_item_features.filter(sf.col("item_idx") < self.first_level_item_len),
-            )
-
-        self.logger.info("Generate negative examples")
-        negatives_source = self.first_level_models[0] if self.negatives_type == "first_level" else self.random_model
-
-        first_level_candidates = self._get_first_level_candidates(
-            model=negatives_source,
-            log=first_level_train,
-            k=self.num_negatives,
-            users=log.select("user_idx").distinct(),
-            items=log.select("item_idx").distinct(),
-            user_features=first_level_user_features,
-            item_features=first_level_item_features,
-            log_to_filter=first_level_train,
-        ).select("user_idx", "item_idx")
-
-        unpersist_if_exists(first_level_user_features)
-        unpersist_if_exists(first_level_item_features)
-
-        self.logger.info("Create train dataset for second level")
-
-        second_level_train = (
-            first_level_candidates.join(
-                second_level_positive.select("user_idx", "item_idx").withColumn("target", sf.lit(1.0)),
-                on=["user_idx", "item_idx"],
-                how="left",
-            ).fillna(0.0, subset="target")
-        ).cache()
-
-        self.cached_list.append(second_level_train)
-
-        self.logger.info(
-            "Distribution of classes in second-level train dataset:\n %s",
-            (second_level_train.groupBy("target").agg(sf.count(sf.col("target")).alias("count_for_class")).take(2)),
-        )
-
-        self.features_processor.fit(
-            log=first_level_train,
-            user_features=user_features,
-            item_features=item_features,
-        )
-
-        self.logger.info("Adding features to second-level train dataset")
-        second_level_train_to_convert = self._add_features_for_second_level(
-            log_to_add_features=second_level_train,
-            log_for_first_level_models=first_level_train,
-            user_features=user_features,
-            item_features=item_features,
-        ).cache()
-
-        self.cached_list.append(second_level_train_to_convert)
-        self.second_stage_model.fit(second_level_train_to_convert)
-        for dataframe in self.cached_list:
-            unpersist_if_exists(dataframe)
-
-    def _predict(
-        self,
-        log: SparkDataFrame,
-        k: int,
-        users: SparkDataFrame,
-        items: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,
-        item_features: Optional[SparkDataFrame] = None,
-        filter_seen_items: bool = True,  # noqa: ARG002
-    ) -> SparkDataFrame:
-        State().logger.debug(msg="Generating candidates to rerank")
-
-        first_level_user_features = cache_if_exists(self.first_level_user_features_transformer.transform(user_features))
-        first_level_item_features = cache_if_exists(self.first_level_item_features_transformer.transform(item_features))
-
-        candidates = self._get_first_level_candidates(
-            model=self.first_level_models[0],
-            log=log,
-            k=self.num_negatives,
-            users=users,
-            items=items,
-            user_features=first_level_user_features,
-            item_features=first_level_item_features,
-            log_to_filter=log,
-        ).select("user_idx", "item_idx")
-
-        candidates_cached = candidates.cache()
-        unpersist_if_exists(first_level_user_features)
-        unpersist_if_exists(first_level_item_features)
-        self.logger.info("Adding features")
-        candidates_features = self._add_features_for_second_level(
-            log_to_add_features=candidates_cached,
-            log_for_first_level_models=log,
-            user_features=user_features,
-            item_features=item_features,
-        )
-        candidates_features.cache()
-        candidates_cached.unpersist()
-        self.logger.info(
-            "Generated %s candidates for %s users",
-            candidates_features.count(),
-            candidates_features.select("user_idx").distinct().count(),
-        )
-        return self.second_stage_model.predict(data=candidates_features, k=k)
-
-    def fit_predict(
-        self,
-        log: DataFrameLike,
-        k: int,
-        users: Optional[Union[DataFrameLike, Iterable]] = None,
-        items: Optional[Union[DataFrameLike, Iterable]] = None,
-        user_features: Optional[DataFrameLike] = None,
-        item_features: Optional[DataFrameLike] = None,
-        filter_seen_items: bool = True,
-    ) -> SparkDataFrame:
-        """
-        :param log: input DataFrame ``[user_id, item_id, timestamp, relevance]``
-        :param k: length of a recommendation list, must be smaller than the number of ``items``
-        :param users: users to get recommendations for
-        :param items: items to get recommendations for
-        :param user_features: user features``[user_id]`` + feature columns
-        :param item_features: item features``[item_id]`` + feature columns
-        :param filter_seen_items: flag to remove seen items from recommendations
-        :return: DataFrame ``[user_id, item_id, relevance]``
-        """
-        self.fit(log, user_features, item_features)
-        return self.predict(
-            log,
-            k,
-            users,
-            items,
-            user_features,
-            item_features,
-            filter_seen_items,
-        )
-
-    @staticmethod
-    def _optimize_one_model(
-        model: BaseRecommender,
-        train: DataFrameLike,
-        test: DataFrameLike,
-        user_features: Optional[DataFrameLike] = None,
-        item_features: Optional[DataFrameLike] = None,
-        param_borders: Optional[dict[str, list[Any]]] = None,
-        criterion: Metric = Precision,
-        k: int = 10,
-        budget: int = 10,
-        new_study: bool = True,
-    ):
-        params = model.optimize(
-            train,
-            test,
-            user_features,
-            item_features,
-            param_borders,
-            criterion,
-            k,
-            budget,
-            new_study,
-        )
-        return params
-
-    def optimize(
-        self,
-        train: DataFrameLike,
-        test: DataFrameLike,
-        user_features: Optional[DataFrameLike] = None,
-        item_features: Optional[DataFrameLike] = None,
-        param_borders: Optional[list[dict[str, list[Any]]]] = None,
-        criterion: Metric = Precision,
-        k: int = 10,
-        budget: int = 10,
-        new_study: bool = True,
-    ) -> tuple[list[dict[str, Any]], Optional[dict[str, Any]]]:
-        """
-        Optimize first level models with optuna.
-
-        :param train: train DataFrame ``[user_id, item_id, timestamp, relevance]``
-        :param test: test DataFrame ``[user_id, item_id, timestamp, relevance]``
-        :param user_features: user features ``[user_id , timestamp]`` + feature columns
-        :param item_features: item features``[item_id]`` + feature columns
-        :param param_borders: list with param grids for first level models and a fallback model.
-            Empty dict skips optimization for that model.
-            Param grid is a dict ``{param: [low, high]}``.
-        :param criterion: metric to optimize
-        :param k: length of a recommendation list
-        :param budget: number of points to train each model
-        :param new_study: keep searching with previous study or start a new study
-        :return: list of dicts of parameters
-        """
-        number_of_models = len(self.first_level_models)
-        if self.fallback_model is not None:
-            number_of_models += 1
-        if number_of_models != len(param_borders):
-            msg = "Provide search grid or None for every first level model"
-            raise ValueError(msg)
-
-        first_level_user_features_tr = ToNumericFeatureTransformer()
-        first_level_user_features = first_level_user_features_tr.fit_transform(user_features)
-        first_level_item_features_tr = ToNumericFeatureTransformer()
-        first_level_item_features = first_level_item_features_tr.fit_transform(item_features)
-
-        first_level_user_features = cache_if_exists(first_level_user_features)
-        first_level_item_features = cache_if_exists(first_level_item_features)
-
-        params_found = []
-        for i, model in enumerate(self.first_level_models):
-            if param_borders[i] is None or (isinstance(param_borders[i], dict) and param_borders[i]):
-                self.logger.info(
-                    "Optimizing first level model number %s, %s",
-                    i,
-                    model.__str__(),
-                )
-                params_found.append(
-                    self._optimize_one_model(
-                        model=model,
-                        train=train,
-                        test=test,
-                        user_features=first_level_user_features,
-                        item_features=first_level_item_features,
-                        param_borders=param_borders[i],
-                        criterion=criterion,
-                        k=k,
-                        budget=budget,
-                        new_study=new_study,
-                    )
-                )
-            else:
-                params_found.append(None)
-
-        if self.fallback_model is None or (isinstance(param_borders[-1], dict) and not param_borders[-1]):
-            return params_found, None
-
-        self.logger.info("Optimizing fallback-model")
-        fallback_params = self._optimize_one_model(
-            model=self.fallback_model,
-            train=train,
-            test=test,
-            user_features=first_level_user_features,
-            item_features=first_level_item_features,
-            param_borders=param_borders[-1],
-            criterion=criterion,
-            new_study=new_study,
-        )
-        unpersist_if_exists(first_level_item_features)
-        unpersist_if_exists(first_level_user_features)
-        return params_found, fallback_params
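
For context, the removed file defined the experimental TwoStagesScenario described in the class docstring above. Below is a minimal usage sketch, not part of the diff: it only wires together the constructor and fit_predict signatures shown in the removed code, assumes an already prepared interaction DataFrame named log (columns as documented in fit_predict), and swaps the default ScalaALSWrap(rank=128) first-level model for PopRec purely for illustration.

# Illustrative example against replay-rec 0.20.1rc0 (the last release containing this module);
# requires the experimental extras (pyspark, LightAutoML) to be installed.
from replay.experimental.scenarios.two_stages.two_stages_scenario import TwoStagesScenario
from replay.models import PopRec
from replay.splitters import RatioSplitter

scenario = TwoStagesScenario(
    train_splitter=RatioSplitter(test_size=0.5),  # 50/50 split into first/second level train
    first_level_models=PopRec(),                  # any BaseRecommender; default was ScalaALSWrap(rank=128)
    fallback_model=PopRec(),                      # fills missing first-level recommendations
    num_negatives=100,                            # candidates/negatives generated per user
    negatives_type="first_level",                 # or "random"
)

# `log` is assumed to be an interaction DataFrame as described in the fit_predict docstring.
recs = scenario.fit_predict(log, k=10)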