replay-rec 0.16.0rc0__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. replay/__init__.py +1 -1
  2. replay/data/__init__.py +1 -1
  3. replay/data/dataset.py +45 -42
  4. replay/data/dataset_utils/dataset_label_encoder.py +6 -7
  5. replay/data/nn/__init__.py +1 -1
  6. replay/data/nn/schema.py +20 -33
  7. replay/data/nn/sequence_tokenizer.py +217 -87
  8. replay/data/nn/sequential_dataset.py +6 -22
  9. replay/data/nn/torch_sequential_dataset.py +20 -11
  10. replay/data/nn/utils.py +7 -9
  11. replay/data/schema.py +17 -17
  12. replay/data/spark_schema.py +0 -1
  13. replay/metrics/base_metric.py +38 -79
  14. replay/metrics/categorical_diversity.py +24 -58
  15. replay/metrics/coverage.py +25 -49
  16. replay/metrics/descriptors.py +4 -13
  17. replay/metrics/experiment.py +3 -8
  18. replay/metrics/hitrate.py +3 -6
  19. replay/metrics/map.py +3 -6
  20. replay/metrics/mrr.py +1 -4
  21. replay/metrics/ndcg.py +4 -7
  22. replay/metrics/novelty.py +10 -29
  23. replay/metrics/offline_metrics.py +26 -61
  24. replay/metrics/precision.py +3 -6
  25. replay/metrics/recall.py +3 -6
  26. replay/metrics/rocauc.py +7 -10
  27. replay/metrics/surprisal.py +13 -30
  28. replay/metrics/torch_metrics_builder.py +0 -4
  29. replay/metrics/unexpectedness.py +15 -20
  30. replay/models/__init__.py +1 -2
  31. replay/models/als.py +7 -15
  32. replay/models/association_rules.py +12 -28
  33. replay/models/base_neighbour_rec.py +21 -36
  34. replay/models/base_rec.py +92 -215
  35. replay/models/cat_pop_rec.py +9 -22
  36. replay/models/cluster.py +17 -28
  37. replay/models/extensions/ann/ann_mixin.py +7 -12
  38. replay/models/extensions/ann/entities/base_hnsw_param.py +1 -1
  39. replay/models/extensions/ann/entities/hnswlib_param.py +0 -6
  40. replay/models/extensions/ann/entities/nmslib_hnsw_param.py +0 -6
  41. replay/models/extensions/ann/index_builders/driver_hnswlib_index_builder.py +4 -10
  42. replay/models/extensions/ann/index_builders/driver_nmslib_index_builder.py +7 -11
  43. replay/models/extensions/ann/index_builders/executor_hnswlib_index_builder.py +5 -12
  44. replay/models/extensions/ann/index_builders/executor_nmslib_index_builder.py +11 -18
  45. replay/models/extensions/ann/index_builders/nmslib_index_builder_mixin.py +1 -4
  46. replay/models/extensions/ann/index_inferers/base_inferer.py +3 -10
  47. replay/models/extensions/ann/index_inferers/hnswlib_filter_index_inferer.py +7 -17
  48. replay/models/extensions/ann/index_inferers/hnswlib_index_inferer.py +6 -14
  49. replay/models/extensions/ann/index_inferers/nmslib_filter_index_inferer.py +14 -28
  50. replay/models/extensions/ann/index_inferers/nmslib_index_inferer.py +15 -25
  51. replay/models/extensions/ann/index_inferers/utils.py +2 -9
  52. replay/models/extensions/ann/index_stores/hdfs_index_store.py +4 -9
  53. replay/models/extensions/ann/index_stores/shared_disk_index_store.py +2 -6
  54. replay/models/extensions/ann/index_stores/spark_files_index_store.py +8 -14
  55. replay/models/extensions/ann/index_stores/utils.py +5 -2
  56. replay/models/extensions/ann/utils.py +3 -5
  57. replay/models/kl_ucb.py +16 -22
  58. replay/models/knn.py +37 -59
  59. replay/models/nn/optimizer_utils/__init__.py +1 -6
  60. replay/models/nn/optimizer_utils/optimizer_factory.py +3 -6
  61. replay/models/nn/sequential/bert4rec/__init__.py +1 -1
  62. replay/models/nn/sequential/bert4rec/dataset.py +6 -7
  63. replay/models/nn/sequential/bert4rec/lightning.py +53 -56
  64. replay/models/nn/sequential/bert4rec/model.py +12 -25
  65. replay/models/nn/sequential/callbacks/__init__.py +1 -1
  66. replay/models/nn/sequential/callbacks/prediction_callbacks.py +23 -25
  67. replay/models/nn/sequential/callbacks/validation_callback.py +27 -30
  68. replay/models/nn/sequential/postprocessors/postprocessors.py +1 -1
  69. replay/models/nn/sequential/sasrec/dataset.py +8 -7
  70. replay/models/nn/sequential/sasrec/lightning.py +53 -48
  71. replay/models/nn/sequential/sasrec/model.py +4 -17
  72. replay/models/pop_rec.py +9 -10
  73. replay/models/query_pop_rec.py +7 -15
  74. replay/models/random_rec.py +10 -18
  75. replay/models/slim.py +8 -13
  76. replay/models/thompson_sampling.py +13 -14
  77. replay/models/ucb.py +11 -22
  78. replay/models/wilson.py +5 -14
  79. replay/models/word2vec.py +24 -69
  80. replay/optimization/optuna_objective.py +13 -27
  81. replay/preprocessing/__init__.py +1 -2
  82. replay/preprocessing/converter.py +2 -7
  83. replay/preprocessing/filters.py +67 -142
  84. replay/preprocessing/history_based_fp.py +44 -116
  85. replay/preprocessing/label_encoder.py +106 -68
  86. replay/preprocessing/sessionizer.py +1 -11
  87. replay/scenarios/fallback.py +3 -8
  88. replay/splitters/base_splitter.py +43 -15
  89. replay/splitters/cold_user_random_splitter.py +18 -31
  90. replay/splitters/k_folds.py +14 -24
  91. replay/splitters/last_n_splitter.py +33 -43
  92. replay/splitters/new_users_splitter.py +31 -55
  93. replay/splitters/random_splitter.py +16 -23
  94. replay/splitters/ratio_splitter.py +30 -54
  95. replay/splitters/time_splitter.py +13 -18
  96. replay/splitters/two_stage_splitter.py +44 -79
  97. replay/utils/__init__.py +1 -1
  98. replay/utils/common.py +65 -0
  99. replay/utils/dataframe_bucketizer.py +25 -31
  100. replay/utils/distributions.py +3 -15
  101. replay/utils/model_handler.py +36 -33
  102. replay/utils/session_handler.py +11 -15
  103. replay/utils/spark_utils.py +51 -85
  104. replay/utils/time.py +8 -22
  105. replay/utils/types.py +1 -3
  106. {replay_rec-0.16.0rc0.dist-info → replay_rec-0.17.0.dist-info}/METADATA +2 -10
  107. replay_rec-0.17.0.dist-info/RECORD +127 -0
  108. {replay_rec-0.16.0rc0.dist-info → replay_rec-0.17.0.dist-info}/WHEEL +1 -1
  109. replay/experimental/__init__.py +0 -0
  110. replay/experimental/metrics/__init__.py +0 -61
  111. replay/experimental/metrics/base_metric.py +0 -661
  112. replay/experimental/metrics/coverage.py +0 -117
  113. replay/experimental/metrics/experiment.py +0 -200
  114. replay/experimental/metrics/hitrate.py +0 -27
  115. replay/experimental/metrics/map.py +0 -31
  116. replay/experimental/metrics/mrr.py +0 -19
  117. replay/experimental/metrics/ncis_precision.py +0 -32
  118. replay/experimental/metrics/ndcg.py +0 -50
  119. replay/experimental/metrics/precision.py +0 -23
  120. replay/experimental/metrics/recall.py +0 -26
  121. replay/experimental/metrics/rocauc.py +0 -50
  122. replay/experimental/metrics/surprisal.py +0 -102
  123. replay/experimental/metrics/unexpectedness.py +0 -74
  124. replay/experimental/models/__init__.py +0 -10
  125. replay/experimental/models/admm_slim.py +0 -216
  126. replay/experimental/models/base_neighbour_rec.py +0 -222
  127. replay/experimental/models/base_rec.py +0 -1361
  128. replay/experimental/models/base_torch_rec.py +0 -247
  129. replay/experimental/models/cql.py +0 -468
  130. replay/experimental/models/ddpg.py +0 -1007
  131. replay/experimental/models/dt4rec/__init__.py +0 -0
  132. replay/experimental/models/dt4rec/dt4rec.py +0 -193
  133. replay/experimental/models/dt4rec/gpt1.py +0 -411
  134. replay/experimental/models/dt4rec/trainer.py +0 -128
  135. replay/experimental/models/dt4rec/utils.py +0 -274
  136. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  137. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -733
  138. replay/experimental/models/implicit_wrap.py +0 -138
  139. replay/experimental/models/lightfm_wrap.py +0 -327
  140. replay/experimental/models/mult_vae.py +0 -374
  141. replay/experimental/models/neuromf.py +0 -462
  142. replay/experimental/models/scala_als.py +0 -311
  143. replay/experimental/nn/data/__init__.py +0 -1
  144. replay/experimental/nn/data/schema_builder.py +0 -58
  145. replay/experimental/preprocessing/__init__.py +0 -3
  146. replay/experimental/preprocessing/data_preparator.py +0 -929
  147. replay/experimental/preprocessing/padder.py +0 -231
  148. replay/experimental/preprocessing/sequence_generator.py +0 -218
  149. replay/experimental/scenarios/__init__.py +0 -1
  150. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  151. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -86
  152. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -271
  153. replay/experimental/scenarios/obp_wrapper/utils.py +0 -88
  154. replay/experimental/scenarios/two_stages/reranker.py +0 -116
  155. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -843
  156. replay/experimental/utils/__init__.py +0 -0
  157. replay/experimental/utils/logger.py +0 -24
  158. replay/experimental/utils/model_handler.py +0 -213
  159. replay/experimental/utils/session_handler.py +0 -47
  160. replay_rec-0.16.0rc0.dist-info/NOTICE +0 -41
  161. replay_rec-0.16.0rc0.dist-info/RECORD +0 -178
  162. {replay_rec-0.16.0rc0.dist-info → replay_rec-0.17.0.dist-info}/LICENSE +0 -0
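Entries 109-159 above remove the entire replay/experimental subpackage from the wheel. A minimal, hypothetical import guard for downstream code that still depends on it (the import path is taken from the file list; the fallback behaviour is an assumption, not part of the release):

# Hypothetical guard, assuming downstream code still imports the experimental
# scenario that the 0.17.0 wheel no longer ships.
try:
    # Available in replay-rec 0.16.0rc0 and earlier; absent from 0.17.0.
    from replay.experimental.scenarios.two_stages.two_stages_scenario import TwoStagesScenario
except ImportError:
    # Either pin replay-rec==0.16.0rc0 or port the code to the stable replay.models API.
    TwoStagesScenario = None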
replay/experimental/scenarios/two_stages/two_stages_scenario.py
@@ -1,843 +0,0 @@
- # pylint: disable=too-many-lines
- from collections.abc import Iterable
- from typing import Any, Dict, List, Optional, Tuple, Union
-
- from replay.experimental.models import ScalaALSWrap
- from replay.experimental.preprocessing.data_preparator import ToNumericFeatureTransformer
- from replay.experimental.scenarios.two_stages.reranker import LamaWrap
- from replay.metrics import Metric, Precision
- from replay.models import PopRec, RandomRec
- from replay.models.base_rec import BaseRecommender, HybridRecommender
- from replay.preprocessing.history_based_fp import HistoryBasedFeaturesProcessor
- from replay.splitters import RatioSplitter, Splitter
- from replay.utils import PYSPARK_AVAILABLE, DataFrameLike, SparkDataFrame
- from replay.utils.session_handler import State
- from replay.utils.spark_utils import (
-     array_mult,
-     cache_if_exists,
-     fallback,
-     get_log_info,
-     get_top_k_recs,
-     horizontal_explode,
-     join_or_return,
-     join_with_col_renaming,
-     unpersist_if_exists,
- )
-
- if PYSPARK_AVAILABLE:
-     import pyspark.sql.functions as sf
-
-
- # pylint: disable=too-many-locals, too-many-arguments
- def get_first_level_model_features(
-     model: SparkDataFrame,
-     pairs: SparkDataFrame,
-     user_features: Optional[SparkDataFrame] = None,
-     item_features: Optional[SparkDataFrame] = None,
-     add_factors_mult: bool = True,
-     prefix: str = "",
- ) -> SparkDataFrame:
-     """
-     Get user and item embeddings from replay model.
-     Can also compute elementwise multiplication between them with ``add_factors_mult`` parameter.
-     Zero vectors are returned if a model does not have embeddings for specific users/items.
-
-     :param model: trained model
-     :param pairs: user-item pairs to get vectors for `[user_id/user_idx, item_id/item_id]`
-     :param user_features: user features `[user_id/user_idx, feature_1, ....]`
-     :param item_features: item features `[item_id/item_idx, feature_1, ....]`
-     :param add_factors_mult: flag to add elementwise multiplication
-     :param prefix: name to add to the columns
-     :return: DataFrame
-     """
-     users = pairs.select("user_idx").distinct()
-     items = pairs.select("item_idx").distinct()
-     user_factors, user_vector_len = model._get_features_wrap(
-         users, user_features
-     )
-     item_factors, item_vector_len = model._get_features_wrap(
-         items, item_features
-     )
-
-     pairs_with_features = join_or_return(
-         pairs, user_factors, how="left", on="user_idx"
-     )
-     pairs_with_features = join_or_return(
-         pairs_with_features,
-         item_factors,
-         how="left",
-         on="item_idx",
-     )
-
-     factors_to_explode = []
-     if user_factors is not None:
-         pairs_with_features = pairs_with_features.withColumn(
-             "user_factors",
-             sf.coalesce(
-                 sf.col("user_factors"),
-                 sf.array([sf.lit(0.0)] * user_vector_len),
-             ),
-         )
-         factors_to_explode.append(("user_factors", "uf"))
-
-     if item_factors is not None:
-         pairs_with_features = pairs_with_features.withColumn(
-             "item_factors",
-             sf.coalesce(
-                 sf.col("item_factors"),
-                 sf.array([sf.lit(0.0)] * item_vector_len),
-             ),
-         )
-         factors_to_explode.append(("item_factors", "if"))
-
-     if model.__str__() == "LightFMWrap":
-         pairs_with_features = (
-             pairs_with_features.fillna({"user_bias": 0, "item_bias": 0})
-             .withColumnRenamed("user_bias", f"{prefix}_user_bias")
-             .withColumnRenamed("item_bias", f"{prefix}_item_bias")
-         )
-
-     if (
-         add_factors_mult
-         and user_factors is not None
-         and item_factors is not None
-     ):
-         pairs_with_features = pairs_with_features.withColumn(
-             "factors_mult",
-             array_mult(sf.col("item_factors"), sf.col("user_factors")),
-         )
-         factors_to_explode.append(("factors_mult", "fm"))
-
-     for col_name, feature_prefix in factors_to_explode:
-         col_set = set(pairs_with_features.columns)
-         col_set.remove(col_name)
-         pairs_with_features = horizontal_explode(
-             data_frame=pairs_with_features,
-             column_to_explode=col_name,
-             other_columns=[sf.col(column) for column in sorted(list(col_set))],
-             prefix=f"{prefix}_{feature_prefix}",
-         )
-
-     return pairs_with_features
-
-
- # pylint: disable=too-many-instance-attributes
- class TwoStagesScenario(HybridRecommender):
-     """
-     *train*:
-
-     1) take input ``log`` and split it into first_level_train and second_level_train
-        default splitter splits each user's data 50/50
-     2) train ``first_stage_models`` on ``first_stage_train``
-     3) create negative examples to train second stage model using one of:
-
-        - wrong recommendations from first stage
-        - random examples
-
-        use ``num_negatives`` to specify number of negatives per user
-     4) augments dataset with features:
-
-        - get 1 level recommendations for positive examples
-          from second_level_train and for generated negative examples
-        - add user and item features
-        - generate statistical and pair features
-
-     5) train ``TabularAutoML`` from LightAutoML
-
-     *inference*:
-
-     1) take ``log``
-     2) generate candidates, their number can be specified with ``num_candidates``
-     3) add features as in train
-     4) get recommendations
-
-     """
-
-     can_predict_cold_users: bool = True
-     can_predict_cold_items: bool = True
-
-     # pylint: disable=too-many-arguments
-     def __init__(
-         self,
-         train_splitter: Splitter = RatioSplitter(test_size=0.5),
-         first_level_models: Union[
-             List[BaseRecommender], BaseRecommender
-         ] = ScalaALSWrap(rank=128),
-         fallback_model: Optional[BaseRecommender] = PopRec(),
-         use_first_level_models_feat: Union[List[bool], bool] = False,
-         second_model_params: Optional[Union[Dict, str]] = None,
-         second_model_config_path: Optional[str] = None,
-         num_negatives: int = 100,
-         negatives_type: str = "first_level",
-         use_generated_features: bool = False,
-         user_cat_features_list: Optional[List] = None,
-         item_cat_features_list: Optional[List] = None,
-         custom_features_processor: HistoryBasedFeaturesProcessor = None,
-         seed: int = 123,
-     ) -> None:
-         """
-         :param train_splitter: splitter to get ``first_level_train`` and ``second_level_train``.
-             Default is random 50% split.
-         :param first_level_models: model or a list of models
-         :param fallback_model: model used to fill missing recommendations at first level models
-         :param use_first_level_models_feat: flag or a list of flags to use
-             features created by first level models
-         :param second_model_params: TabularAutoML parameters
-         :param second_model_config_path: path to config file for TabularAutoML
-         :param num_negatives: number of negative examples used during train
-         :param negatives_type: negative examples creation strategy,``random``
-             or most relevant examples from ``first-level``
-         :param use_generated_features: flag to use generated features to train second level
-         :param user_cat_features_list: list of user categorical features
-         :param item_cat_features_list: list of item categorical features
-         :param custom_features_processor: you can pass custom feature processor
-         :param seed: random seed
-
-         """
-         self.train_splitter = train_splitter
-         self.cached_list = []
-
-         self.first_level_models = (
-             first_level_models
-             if isinstance(first_level_models, Iterable)
-             else [first_level_models]
-         )
-
-         self.first_level_item_len = 0
-         self.first_level_user_len = 0
-
-         self.random_model = RandomRec(seed=seed)
-         self.fallback_model = fallback_model
-         self.first_level_user_features_transformer = (
-             ToNumericFeatureTransformer()
-         )
-         self.first_level_item_features_transformer = (
-             ToNumericFeatureTransformer()
-         )
-
-         if isinstance(use_first_level_models_feat, bool):
-             self.use_first_level_models_feat = [
-                 use_first_level_models_feat
-             ] * len(self.first_level_models)
-         else:
-             if len(self.first_level_models) != len(
-                 use_first_level_models_feat
-             ):
-                 raise ValueError(
-                     f"For each model from first_level_models specify "
-                     f"flag to use first level features."
-                     f"Length of first_level_models is {len(first_level_models)}, "
-                     f"Length of use_first_level_models_feat is {len(use_first_level_models_feat)}"
-                 )
-
-             self.use_first_level_models_feat = use_first_level_models_feat
-
-         self.second_stage_model = LamaWrap(
-             params=second_model_params, config_path=second_model_config_path
-         )
-
-         self.num_negatives = num_negatives
-         if negatives_type not in ["random", "first_level"]:
-             raise ValueError(
-                 f"Invalid negatives_type value: {negatives_type}. Use 'random' or 'first_level'"
-             )
-         self.negatives_type = negatives_type
-
-         self.use_generated_features = use_generated_features
-         self.features_processor = (
-             custom_features_processor
-             if custom_features_processor
-             else HistoryBasedFeaturesProcessor(
-                 user_cat_features_list=user_cat_features_list,
-                 item_cat_features_list=item_cat_features_list,
-             )
-         )
-         self.seed = seed
-
-     # TO DO: add save/load for scenarios
-     @property
-     def _init_args(self):
-         return {}
-
-     # pylint: disable=too-many-locals
-     def _add_features_for_second_level(
-         self,
-         log_to_add_features: SparkDataFrame,
-         log_for_first_level_models: SparkDataFrame,
-         user_features: SparkDataFrame,
-         item_features: SparkDataFrame,
-     ) -> SparkDataFrame:
-         """
-         Added features are:
-         - relevance from first level models
-         - user and item features from first level models
-         - dataset features
-         - FeatureProcessor features
-
-         :param log_to_add_features: input DataFrame``[user_idx, item_idx, timestamp, relevance]``
-         :param log_for_first_level_models: DataFrame``[user_idx, item_idx, timestamp, relevance]``
-         :param user_features: user features``[user_idx]`` + feature columns
-         :param item_features: item features``[item_idx]`` + feature columns
-         :return: DataFrame
-         """
-         self.logger.info("Generating features")
-         full_second_level_train = log_to_add_features
-         first_level_item_features_cached = cache_if_exists(
-             self.first_level_item_features_transformer.transform(item_features)
-         )
-         first_level_user_features_cached = cache_if_exists(
-             self.first_level_user_features_transformer.transform(user_features)
-         )
-
-         pairs = log_to_add_features.select("user_idx", "item_idx")
-         for idx, model in enumerate(self.first_level_models):
-             current_pred = self._predict_pairs_with_first_level_model(
-                 model=model,
-                 log=log_for_first_level_models,
-                 pairs=pairs,
-                 user_features=first_level_user_features_cached,
-                 item_features=first_level_item_features_cached,
-             ).withColumnRenamed("relevance", f"rel_{idx}_{model}")
-             full_second_level_train = full_second_level_train.join(
-                 sf.broadcast(current_pred),
-                 on=["user_idx", "item_idx"],
-                 how="left",
-             )
-
-             if self.use_first_level_models_feat[idx]:
-                 features = get_first_level_model_features(
-                     model=model,
-                     pairs=full_second_level_train.select(
-                         "user_idx", "item_idx"
-                     ),
-                     user_features=first_level_user_features_cached,
-                     item_features=first_level_item_features_cached,
-                     prefix=f"m_{idx}",
-                 )
-                 full_second_level_train = join_with_col_renaming(
-                     left=full_second_level_train,
-                     right=features,
-                     on_col_name=["user_idx", "item_idx"],
-                     how="left",
-                 )
-
-         unpersist_if_exists(first_level_user_features_cached)
-         unpersist_if_exists(first_level_item_features_cached)
-
-         full_second_level_train_cached = full_second_level_train.fillna(
-             0
-         ).cache()
-
-         self.logger.info("Adding features from the dataset")
-         full_second_level_train = join_or_return(
-             full_second_level_train_cached,
-             user_features,
-             on="user_idx",
-             how="left",
-         )
-         full_second_level_train = join_or_return(
-             full_second_level_train,
-             item_features,
-             on="item_idx",
-             how="left",
-         )
-
-         if self.use_generated_features:
-             if not self.features_processor.fitted:
-                 self.features_processor.fit(
-                     log=log_for_first_level_models,
-                     user_features=user_features,
-                     item_features=item_features,
-                 )
-             self.logger.info("Adding generated features")
-             full_second_level_train = self.features_processor.transform(
-                 log=full_second_level_train
-             )
-
-         self.logger.info(
-             "Columns at second level: %s",
-             " ".join(full_second_level_train.columns),
-         )
-         full_second_level_train_cached.unpersist()
-         return full_second_level_train
-
-     def _split_data(self, log: SparkDataFrame) -> Tuple[SparkDataFrame, SparkDataFrame]:
-         """Write statistics"""
-         first_level_train, second_level_train = self.train_splitter.split(log)
-         State().logger.debug("Log info: %s", get_log_info(log))
-         State().logger.debug(
-             "first_level_train info: %s", get_log_info(first_level_train)
-         )
-         State().logger.debug(
-             "second_level_train info: %s", get_log_info(second_level_train)
-         )
-         return first_level_train, second_level_train
-
-     @staticmethod
-     def _filter_or_return(dataframe, condition):
-         if dataframe is None:
-             return dataframe
-         return dataframe.filter(condition)
-
-     def _predict_with_first_level_model(
-         self,
-         model: BaseRecommender,
-         log: SparkDataFrame,
-         k: int,
-         users: SparkDataFrame,
-         items: SparkDataFrame,
-         user_features: SparkDataFrame,
-         item_features: SparkDataFrame,
-         log_to_filter: SparkDataFrame,
-     ):
-         """
-         Filter users and items using can_predict_cold_items and can_predict_cold_users, and predict
-         """
-         if not model.can_predict_cold_items:
-             log, items, item_features = [
-                 self._filter_or_return(
-                     dataframe=df,
-                     condition=sf.col("item_idx") < self.first_level_item_len,
-                 )
-                 for df in [log, items, item_features]
-             ]
-         if not model.can_predict_cold_users:
-             log, users, user_features = [
-                 self._filter_or_return(
-                     dataframe=df,
-                     condition=sf.col("user_idx") < self.first_level_user_len,
-                 )
-                 for df in [log, users, user_features]
-             ]
-
-         log_to_filter_cached = join_with_col_renaming(
-             left=log_to_filter,
-             right=users,
-             on_col_name="user_idx",
-         ).cache()
-         max_positives_to_filter = 0
-
-         if log_to_filter_cached.count() > 0:
-             max_positives_to_filter = (
-                 log_to_filter_cached.groupBy("user_idx")
-                 .agg(sf.count("item_idx").alias("num_positives"))
-                 .select(sf.max("num_positives"))
-                 .collect()[0][0]
-             )
-
-         pred = model._predict(
-             log,
-             k=k + max_positives_to_filter,
-             users=users,
-             items=items,
-             user_features=user_features,
-             item_features=item_features,
-             filter_seen_items=False,
-         )
-
-         pred = pred.join(
-             log_to_filter_cached.select("user_idx", "item_idx"),
-             on=["user_idx", "item_idx"],
-             how="anti",
-         ).drop("user", "item")
-
-         log_to_filter_cached.unpersist()
-
-         return get_top_k_recs(pred, k)
-
-     def _predict_pairs_with_first_level_model(
-         self,
-         model: BaseRecommender,
-         log: SparkDataFrame,
-         pairs: SparkDataFrame,
-         user_features: SparkDataFrame,
-         item_features: SparkDataFrame,
-     ):
-         """
-         Get relevance for selected user-item pairs.
-         """
-         if not model.can_predict_cold_items:
-             log, pairs, item_features = [
-                 self._filter_or_return(
-                     dataframe=df,
-                     condition=sf.col("item_idx") < self.first_level_item_len,
-                 )
-                 for df in [log, pairs, item_features]
-             ]
-         if not model.can_predict_cold_users:
-             log, pairs, user_features = [
-                 self._filter_or_return(
-                     dataframe=df,
-                     condition=sf.col("user_idx") < self.first_level_user_len,
-                 )
-                 for df in [log, pairs, user_features]
-             ]
-
-         return model._predict_pairs(
-             pairs=pairs,
-             log=log,
-             user_features=user_features,
-             item_features=item_features,
-         )
-
-     # pylint: disable=unused-argument
-     def _get_first_level_candidates(
-         self,
-         model: BaseRecommender,
-         log: SparkDataFrame,
-         k: int,
-         users: SparkDataFrame,
-         items: SparkDataFrame,
-         user_features: SparkDataFrame,
-         item_features: SparkDataFrame,
-         log_to_filter: SparkDataFrame,
-     ) -> SparkDataFrame:
-         """
-         Combining the base model predictions with the fallback model
-         predictions.
-         """
-         passed_arguments = locals()
-         passed_arguments.pop("self")
-         candidates = self._predict_with_first_level_model(**passed_arguments)
-
-         if self.fallback_model is not None:
-             passed_arguments.pop("model")
-             fallback_candidates = self._predict_with_first_level_model(
-                 model=self.fallback_model, **passed_arguments
-             )
-
-             candidates = fallback(
-                 base=candidates,
-                 fill=fallback_candidates,
-                 k=self.num_negatives,
-             )
-         return candidates
-
-     # pylint: disable=too-many-locals,too-many-statements
-     def _fit(
-         self,
-         log: SparkDataFrame,
-         user_features: Optional[SparkDataFrame] = None,
-         item_features: Optional[SparkDataFrame] = None,
-     ) -> None:
-
-         self.cached_list = []
-
-         self.logger.info("Data split")
-         first_level_train, second_level_positive = self._split_data(log)
-         # second_level_positive = second_level_positive
-         # .join(first_level_train.select("user_idx"), on="user_idx", how="left")
-
-         self.first_level_item_len = (
-             first_level_train.select("item_idx").distinct().count()
-         )
-         self.first_level_user_len = (
-             first_level_train.select("user_idx").distinct().count()
-         )
-
-         log.cache()
-         first_level_train.cache()
-         second_level_positive.cache()
-         self.cached_list.extend(
-             [log, first_level_train, second_level_positive]
-         )
-
-         if user_features is not None:
-             user_features.cache()
-             self.cached_list.append(user_features)
-
-         if item_features is not None:
-             item_features.cache()
-             self.cached_list.append(item_features)
-
-         self.first_level_item_features_transformer.fit(item_features)
-         self.first_level_user_features_transformer.fit(user_features)
-
-         first_level_item_features = cache_if_exists(
-             self.first_level_item_features_transformer.transform(item_features)
-         )
-         first_level_user_features = cache_if_exists(
-             self.first_level_user_features_transformer.transform(user_features)
-         )
-
-         for base_model in [
-             *self.first_level_models,
-             self.random_model,
-             self.fallback_model,
-         ]:
-             base_model._fit_wrap(
-                 log=first_level_train,
-                 user_features=first_level_user_features.filter(
-                     sf.col("user_idx") < self.first_level_user_len
-                 ),
-                 item_features=first_level_item_features.filter(
-                     sf.col("item_idx") < self.first_level_item_len
-                 ),
-             )
-
-         self.logger.info("Generate negative examples")
-         negatives_source = (
-             self.first_level_models[0]
-             if self.negatives_type == "first_level"
-             else self.random_model
-         )
-
-         first_level_candidates = self._get_first_level_candidates(
-             model=negatives_source,
-             log=first_level_train,
-             k=self.num_negatives,
-             users=log.select("user_idx").distinct(),
-             items=log.select("item_idx").distinct(),
-             user_features=first_level_user_features,
-             item_features=first_level_item_features,
-             log_to_filter=first_level_train,
-         ).select("user_idx", "item_idx")
-
-         unpersist_if_exists(first_level_user_features)
-         unpersist_if_exists(first_level_item_features)
-
-         self.logger.info("Crate train dataset for second level")
-
-         second_level_train = (
-             first_level_candidates.join(
-                 second_level_positive.select(
-                     "user_idx", "item_idx"
-                 ).withColumn("target", sf.lit(1.0)),
-                 on=["user_idx", "item_idx"],
-                 how="left",
-             ).fillna(0.0, subset="target")
-         ).cache()
-
-         self.cached_list.append(second_level_train)
-
-         self.logger.info(
-             "Distribution of classes in second-level train dataset:/n %s",
-             (
-                 second_level_train.groupBy("target")
-                 .agg(sf.count(sf.col("target")).alias("count_for_class"))
-                 .take(2)
-             ),
-         )
-
-         self.features_processor.fit(
-             log=first_level_train,
-             user_features=user_features,
-             item_features=item_features,
-         )
-
-         self.logger.info("Adding features to second-level train dataset")
-         second_level_train_to_convert = self._add_features_for_second_level(
-             log_to_add_features=second_level_train,
-             log_for_first_level_models=first_level_train,
-             user_features=user_features,
-             item_features=item_features,
-         ).cache()
-
-         self.cached_list.append(second_level_train_to_convert)
-         self.second_stage_model.fit(second_level_train_to_convert)
-         for dataframe in self.cached_list:
-             unpersist_if_exists(dataframe)
-
-     # pylint: disable=too-many-arguments
-     def _predict(
-         self,
-         log: SparkDataFrame,
-         k: int,
-         users: SparkDataFrame,
-         items: SparkDataFrame,
-         user_features: Optional[SparkDataFrame] = None,
-         item_features: Optional[SparkDataFrame] = None,
-         filter_seen_items: bool = True,
-     ) -> SparkDataFrame:
-
-         State().logger.debug(msg="Generating candidates to rerank")
-
-         first_level_user_features = cache_if_exists(
-             self.first_level_user_features_transformer.transform(user_features)
-         )
-         first_level_item_features = cache_if_exists(
-             self.first_level_item_features_transformer.transform(item_features)
-         )
-
-         candidates = self._get_first_level_candidates(
-             model=self.first_level_models[0],
-             log=log,
-             k=self.num_negatives,
-             users=users,
-             items=items,
-             user_features=first_level_user_features,
-             item_features=first_level_item_features,
-             log_to_filter=log,
-         ).select("user_idx", "item_idx")
-
-         candidates_cached = candidates.cache()
-         unpersist_if_exists(first_level_user_features)
-         unpersist_if_exists(first_level_item_features)
-         self.logger.info("Adding features")
-         candidates_features = self._add_features_for_second_level(
-             log_to_add_features=candidates_cached,
-             log_for_first_level_models=log,
-             user_features=user_features,
-             item_features=item_features,
-         )
-         candidates_features.cache()
-         candidates_cached.unpersist()
-         self.logger.info(
-             "Generated %s candidates for %s users",
-             candidates_features.count(),
-             candidates_features.select("user_idx").distinct().count(),
-         )
-         return self.second_stage_model.predict(data=candidates_features, k=k)
-
-     def fit_predict(
-         self,
-         log: DataFrameLike,
-         k: int,
-         users: Optional[Union[DataFrameLike, Iterable]] = None,
-         items: Optional[Union[DataFrameLike, Iterable]] = None,
-         user_features: Optional[DataFrameLike] = None,
-         item_features: Optional[DataFrameLike] = None,
-         filter_seen_items: bool = True,
-     ) -> SparkDataFrame:
-         """
-         :param log: input DataFrame ``[user_id, item_id, timestamp, relevance]``
-         :param k: length of a recommendation list, must be smaller than the number of ``items``
-         :param users: users to get recommendations for
-         :param items: items to get recommendations for
-         :param user_features: user features``[user_id]`` + feature columns
-         :param item_features: item features``[item_id]`` + feature columns
-         :param filter_seen_items: flag to removed seen items from recommendations
-         :return: DataFrame ``[user_id, item_id, relevance]``
-         """
-         self.fit(log, user_features, item_features)
-         return self.predict(
-             log,
-             k,
-             users,
-             items,
-             user_features,
-             item_features,
-             filter_seen_items,
-         )
-
-     @staticmethod
-     def _optimize_one_model(
-         model: BaseRecommender,
-         train: DataFrameLike,
-         test: DataFrameLike,
-         user_features: Optional[DataFrameLike] = None,
-         item_features: Optional[DataFrameLike] = None,
-         param_borders: Optional[Dict[str, List[Any]]] = None,
-         criterion: Metric = Precision,
-         k: int = 10,
-         budget: int = 10,
-         new_study: bool = True,
-     ):
-         params = model.optimize(
-             train,
-             test,
-             user_features,
-             item_features,
-             param_borders,
-             criterion,
-             k,
-             budget,
-             new_study,
-         )
-         return params
-
-     # pylint: disable=too-many-arguments, too-many-locals
-     def optimize(
-         self,
-         train: DataFrameLike,
-         test: DataFrameLike,
-         user_features: Optional[DataFrameLike] = None,
-         item_features: Optional[DataFrameLike] = None,
-         param_borders: Optional[List[Dict[str, List[Any]]]] = None,
-         criterion: Metric = Precision,
-         k: int = 10,
-         budget: int = 10,
-         new_study: bool = True,
-     ) -> Tuple[List[Dict[str, Any]], Optional[Dict[str, Any]]]:
-         """
-         Optimize first level models with optuna.
-
-         :param train: train DataFrame ``[user_id, item_id, timestamp, relevance]``
-         :param test: test DataFrame ``[user_id, item_id, timestamp, relevance]``
-         :param user_features: user features ``[user_id , timestamp]`` + feature columns
-         :param item_features: item features``[item_id]`` + feature columns
-         :param param_borders: list with param grids for first level models and a fallback model.
-             Empty dict skips optimization for that model.
-             Param grid is a dict ``{param: [low, high]}``.
-         :param criterion: metric to optimize
-         :param k: length of a recommendation list
-         :param budget: number of points to train each model
-         :param new_study: keep searching with previous study or start a new study
-         :return: list of dicts of parameters
-         """
-         number_of_models = len(self.first_level_models)
-         if self.fallback_model is not None:
-             number_of_models += 1
-         if number_of_models != len(param_borders):
-             raise ValueError(
-                 "Provide search grid or None for every first level model"
-             )
-
-         first_level_user_features_tr = ToNumericFeatureTransformer()
-         first_level_user_features = first_level_user_features_tr.fit_transform(
-             user_features
-         )
-         first_level_item_features_tr = ToNumericFeatureTransformer()
-         first_level_item_features = first_level_item_features_tr.fit_transform(
-             item_features
-         )
-
-         first_level_user_features = cache_if_exists(first_level_user_features)
-         first_level_item_features = cache_if_exists(first_level_item_features)
-
-         params_found = []
-         for i, model in enumerate(self.first_level_models):
-             if param_borders[i] is None or (
-                 isinstance(param_borders[i], dict) and param_borders[i]
-             ):
-                 self.logger.info(
-                     "Optimizing first level model number %s, %s",
-                     i,
-                     model.__str__(),
-                 )
-                 params_found.append(
-                     self._optimize_one_model(
-                         model=model,
-                         train=train,
-                         test=test,
-                         user_features=first_level_user_features,
-                         item_features=first_level_item_features,
-                         param_borders=param_borders[i],
-                         criterion=criterion,
-                         k=k,
-                         budget=budget,
-                         new_study=new_study,
-                     )
-                 )
-             else:
-                 params_found.append(None)
-
-         if self.fallback_model is None or (
-             isinstance(param_borders[-1], dict) and not param_borders[-1]
-         ):
-             return params_found, None
-
-         self.logger.info("Optimizing fallback-model")
-         fallback_params = self._optimize_one_model(
-             model=self.fallback_model,
-             train=train,
-             test=test,
-             user_features=first_level_user_features,
-             item_features=first_level_item_features,
-             param_borders=param_borders[-1],
-             criterion=criterion,
-             new_study=new_study,
-         )
-         unpersist_if_exists(first_level_item_features)
-         unpersist_if_exists(first_level_user_features)
-         return params_found, fallback_params
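
For reference, a minimal usage sketch of the removed scenario, reconstructed from the __init__ and fit_predict signatures shown above. It applies only to replay-rec 0.16.0rc0 and earlier, and the log DataFrame is assumed to follow the [user_id, item_id, timestamp, relevance] layout documented in fit_predict:

# Hypothetical usage of the removed TwoStagesScenario (replay-rec <= 0.16.0rc0 only),
# reconstructed from the signatures in the diff above.
from replay.experimental.scenarios.two_stages.two_stages_scenario import TwoStagesScenario
from replay.models import PopRec
from replay.splitters import RatioSplitter

scenario = TwoStagesScenario(
    train_splitter=RatioSplitter(test_size=0.5),  # default: each user's history split 50/50
    fallback_model=PopRec(),                      # fills gaps left by the first-level model
    num_negatives=100,                            # negatives per user for the second-level train set
    negatives_type="first_level",                 # or "random"
    seed=123,
)

# log: DataFrame [user_id, item_id, timestamp, relevance], assumed to exist in the caller's code
recs = scenario.fit_predict(log, k=10)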