replay-rec 0.18.0rc0__py3-none-any.whl → 0.18.1__py3-none-any.whl

This diff compares the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (90)
  1. replay/__init__.py +1 -1
  2. replay/data/dataset.py +27 -1
  3. replay/data/dataset_utils/dataset_label_encoder.py +6 -3
  4. replay/data/nn/schema.py +37 -16
  5. replay/data/nn/sequence_tokenizer.py +313 -165
  6. replay/data/nn/torch_sequential_dataset.py +17 -8
  7. replay/data/nn/utils.py +14 -7
  8. replay/data/schema.py +10 -6
  9. replay/metrics/offline_metrics.py +2 -2
  10. replay/models/__init__.py +1 -0
  11. replay/models/base_rec.py +18 -21
  12. replay/models/lin_ucb.py +407 -0
  13. replay/models/nn/sequential/bert4rec/dataset.py +17 -4
  14. replay/models/nn/sequential/bert4rec/lightning.py +121 -54
  15. replay/models/nn/sequential/bert4rec/model.py +21 -0
  16. replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
  17. replay/models/nn/sequential/compiled/__init__.py +5 -0
  18. replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
  19. replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
  20. replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
  21. replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
  22. replay/models/nn/sequential/sasrec/dataset.py +17 -1
  23. replay/models/nn/sequential/sasrec/lightning.py +126 -50
  24. replay/models/nn/sequential/sasrec/model.py +3 -4
  25. replay/preprocessing/__init__.py +7 -1
  26. replay/preprocessing/discretizer.py +719 -0
  27. replay/preprocessing/label_encoder.py +384 -52
  28. replay/splitters/cold_user_random_splitter.py +1 -1
  29. replay/utils/__init__.py +1 -0
  30. replay/utils/common.py +7 -8
  31. replay/utils/session_handler.py +3 -4
  32. replay/utils/spark_utils.py +15 -1
  33. replay/utils/types.py +8 -0
  34. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +75 -70
  35. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -84
  36. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +1 -1
  37. replay/experimental/__init__.py +0 -0
  38. replay/experimental/metrics/__init__.py +0 -62
  39. replay/experimental/metrics/base_metric.py +0 -602
  40. replay/experimental/metrics/coverage.py +0 -97
  41. replay/experimental/metrics/experiment.py +0 -175
  42. replay/experimental/metrics/hitrate.py +0 -26
  43. replay/experimental/metrics/map.py +0 -30
  44. replay/experimental/metrics/mrr.py +0 -18
  45. replay/experimental/metrics/ncis_precision.py +0 -31
  46. replay/experimental/metrics/ndcg.py +0 -49
  47. replay/experimental/metrics/precision.py +0 -22
  48. replay/experimental/metrics/recall.py +0 -25
  49. replay/experimental/metrics/rocauc.py +0 -49
  50. replay/experimental/metrics/surprisal.py +0 -90
  51. replay/experimental/metrics/unexpectedness.py +0 -76
  52. replay/experimental/models/__init__.py +0 -10
  53. replay/experimental/models/admm_slim.py +0 -205
  54. replay/experimental/models/base_neighbour_rec.py +0 -204
  55. replay/experimental/models/base_rec.py +0 -1271
  56. replay/experimental/models/base_torch_rec.py +0 -234
  57. replay/experimental/models/cql.py +0 -454
  58. replay/experimental/models/ddpg.py +0 -923
  59. replay/experimental/models/dt4rec/__init__.py +0 -0
  60. replay/experimental/models/dt4rec/dt4rec.py +0 -189
  61. replay/experimental/models/dt4rec/gpt1.py +0 -401
  62. replay/experimental/models/dt4rec/trainer.py +0 -127
  63. replay/experimental/models/dt4rec/utils.py +0 -265
  64. replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
  65. replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
  66. replay/experimental/models/implicit_wrap.py +0 -131
  67. replay/experimental/models/lightfm_wrap.py +0 -302
  68. replay/experimental/models/mult_vae.py +0 -332
  69. replay/experimental/models/neuromf.py +0 -406
  70. replay/experimental/models/scala_als.py +0 -296
  71. replay/experimental/nn/data/__init__.py +0 -1
  72. replay/experimental/nn/data/schema_builder.py +0 -55
  73. replay/experimental/preprocessing/__init__.py +0 -3
  74. replay/experimental/preprocessing/data_preparator.py +0 -839
  75. replay/experimental/preprocessing/padder.py +0 -229
  76. replay/experimental/preprocessing/sequence_generator.py +0 -208
  77. replay/experimental/scenarios/__init__.py +0 -1
  78. replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
  79. replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
  80. replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -248
  81. replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
  82. replay/experimental/scenarios/two_stages/__init__.py +0 -0
  83. replay/experimental/scenarios/two_stages/reranker.py +0 -117
  84. replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
  85. replay/experimental/utils/__init__.py +0 -0
  86. replay/experimental/utils/logger.py +0 -24
  87. replay/experimental/utils/model_handler.py +0 -186
  88. replay/experimental/utils/session_handler.py +0 -44
  89. replay_rec-0.18.0rc0.dist-info/NOTICE +0 -41
  90. {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
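Most of the churn falls into a few themes visible in the list above: a new LinUCB model (`replay/models/lin_ucb.py`, item 12), compiled inference variants of BERT4Rec and SASRec under `replay/models/nn/sequential/compiled/` (items 17-20), a new `discretizer` preprocessing module (item 26), and the wholesale removal of the `replay/experimental` package (items 37-88) together with the dist-info NOTICE file (item 89). The single expanded hunk below is one of those removals: the deletion of `replay/experimental/models/mult_vae.py` (item 68).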
@@ -1,332 +0,0 @@
-"""
-MultVAE implementation
-(Variational Autoencoders for Collaborative Filtering)
-"""
-
-from typing import Optional, Tuple
-
-import numpy as np
-import torch
-import torch.nn.functional as sf
-from scipy.sparse import csr_matrix
-from sklearn.model_selection import GroupShuffleSplit
-from torch import nn
-from torch.optim import Adam
-from torch.optim.lr_scheduler import ReduceLROnPlateau
-from torch.utils.data import DataLoader, TensorDataset
-
-from replay.experimental.models.base_torch_rec import TorchRecommender
-from replay.utils import PandasDataFrame, SparkDataFrame
-
-
-class VAE(nn.Module):
-    """Base variational autoencoder"""
-
-    def __init__(
-        self,
-        item_count: int,
-        latent_dim: int,
-        hidden_dim: int = 600,
-        dropout: float = 0.3,
-    ):
-        """
-        :param item_count: number of items
-        :param latent_dim: latent dimension size
-        :param hidden_dim: hidden dimension size for encoder and decoder
-        :param dropout: dropout coefficient
-        """
-        super().__init__()
-
-        self.latent_dim = latent_dim
-        self.encoder_dims = [item_count, hidden_dim, latent_dim * 2]
-        self.decoder_dims = [latent_dim, hidden_dim, item_count]
-
-        self.encoder = nn.ModuleList(
-            [nn.Linear(d_in, d_out) for d_in, d_out in zip(self.encoder_dims[:-1], self.encoder_dims[1:])]
-        )
-        self.decoder = nn.ModuleList(
-            [nn.Linear(d_in, d_out) for d_in, d_out in zip(self.decoder_dims[:-1], self.decoder_dims[1:])]
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.activation = torch.nn.ReLU()
-
-        for layer in self.encoder:
-            self.weight_init(layer)
-
-        for layer in self.decoder:
-            self.weight_init(layer)
-
-    def encode(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Encode"""
-        hidden = sf.normalize(batch, p=2, dim=1)
-        hidden = self.dropout(hidden)
-
-        for layer in self.encoder[:-1]:
-            hidden = layer(hidden)
-            hidden = self.activation(hidden)
-
-        hidden = self.encoder[-1](hidden)
-        mu_latent = hidden[:, : self.latent_dim]
-        logvar_latent = hidden[:, self.latent_dim :]
-        return mu_latent, logvar_latent
-
-    def reparameterize(self, mu_latent: torch.Tensor, logvar_latent: torch.Tensor) -> torch.Tensor:
-        """Reparametrization trick"""
-
-        if self.training:
-            std = torch.exp(0.5 * logvar_latent)
-            eps = torch.randn_like(std)
-            return eps * std + mu_latent
-        return mu_latent
-
-    def decode(self, z_latent: torch.Tensor) -> torch.Tensor:
-        """Decode"""
-        hidden = z_latent
-        for layer in self.decoder[:-1]:
-            hidden = layer(hidden)
-            hidden = self.activation(hidden)
-        return self.decoder[-1](hidden)
-
-    def forward(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        :param batch: user batch
-        :return: output, expectation and logarithm of variation
-        """
-        mu_latent, logvar_latent = self.encode(batch)
-        z_latent = self.reparameterize(mu_latent, logvar_latent)
-        return self.decode(z_latent), mu_latent, logvar_latent
-
-    @staticmethod
-    def weight_init(layer: nn.Module):
-        """
-        Xavier initialization
-
-        :param layer: layer of a model
-        """
-        if isinstance(layer, nn.Linear):
-            nn.init.xavier_normal_(layer.weight.data)
-            layer.bias.data.normal_(0.0, 0.001)
-
-
-class MultVAE(TorchRecommender):
-    """`Variational Autoencoders for Collaborative Filtering
-    <https://arxiv.org/pdf/1802.05814.pdf>`_"""
-
-    num_workers: int = 0
-    batch_size_users: int = 5000
-    patience: int = 10
-    n_saved: int = 2
-    valid_split_size: float = 0.1
-    seed: int = 42
-    can_predict_cold_users = True
-    train_user_batch: csr_matrix
-    valid_user_batch: csr_matrix
-    _search_space = {
-        "learning_rate": {"type": "loguniform", "args": [0.0001, 0.5]},
-        "epochs": {"type": "int", "args": [100, 100]},
-        "latent_dim": {"type": "int", "args": [200, 200]},
-        "hidden_dim": {"type": "int", "args": [600, 600]},
-        "dropout": {"type": "uniform", "args": [0, 0.5]},
-        "anneal": {"type": "uniform", "args": [0.2, 1]},
-        "l2_reg": {"type": "loguniform", "args": [1e-9, 5]},
-        "factor": {"type": "uniform", "args": [0.2, 0.2]},
-        "patience": {"type": "int", "args": [3, 3]},
-    }
-
-    def __init__(
-        self,
-        learning_rate: float = 0.01,
-        epochs: int = 100,
-        latent_dim: int = 200,
-        hidden_dim: int = 600,
-        dropout: float = 0.3,
-        anneal: float = 0.1,
-        l2_reg: float = 0,
-        factor: float = 0.2,
-        patience: int = 3,
-    ):
-        """
-        :param learning_rate: learning rate
-        :param epochs: number of epochs to train model
-        :param latent_dim: latent dimension size for user vectors
-        :param hidden_dim: hidden dimension size for encoder and decoder
-        :param dropout: dropout coefficient
-        :param anneal: anneal coefficient [0,1]
-        :param l2_reg: l2 regularization term
-        :param factor: ReduceLROnPlateau reducing factor. new_lr = lr * factor
-        :param patience: number of non-improved epochs before reducing lr
-        """
-        super().__init__()
-        self.learning_rate = learning_rate
-        self.epochs = epochs
-        self.latent_dim = latent_dim
-        self.hidden_dim = hidden_dim
-        self.dropout = dropout
-        self.anneal = anneal
-        self.l2_reg = l2_reg
-        self.factor = factor
-        self.patience = patience
-
-    @property
-    def _init_args(self):
-        return {
-            "learning_rate": self.learning_rate,
-            "epochs": self.epochs,
-            "latent_dim": self.latent_dim,
-            "hidden_dim": self.hidden_dim,
-            "dropout": self.dropout,
-            "anneal": self.anneal,
-            "l2_reg": self.l2_reg,
-            "factor": self.factor,
-            "patience": self.patience,
-        }
-
-    def _get_data_loader(
-        self, data: PandasDataFrame, shuffle: bool = True
-    ) -> Tuple[csr_matrix, DataLoader, np.ndarray]:
-        """get data loader and matrix with data"""
-        users_count = data["user_idx"].value_counts().count()
-        user_idx = data["user_idx"].astype("category").cat
-        user_batch = csr_matrix(
-            (
-                np.ones(len(data["user_idx"])),
-                ([user_idx.codes.values, data["item_idx"].values]),
-            ),
-            shape=(users_count, self._item_dim),
-        )
-        data_loader = DataLoader(
-            TensorDataset(torch.arange(users_count).long()),
-            batch_size=self.batch_size_users,
-            shuffle=shuffle,
-            num_workers=self.num_workers,
-        )
-
-        return user_batch, data_loader, user_idx.categories.values
-
-    def _fit(
-        self,
-        log: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-    ) -> None:
-        self.logger.debug("Creating batch")
-        data = log.select("user_idx", "item_idx").toPandas()
-        splitter = GroupShuffleSplit(n_splits=1, test_size=self.valid_split_size, random_state=self.seed)
-        train_idx, valid_idx = next(splitter.split(data, groups=data["user_idx"]))
-        train_data, valid_data = data.iloc[train_idx], data.iloc[valid_idx]
-
-        self.train_user_batch, train_data_loader, _ = self._get_data_loader(train_data)
-        self.valid_user_batch, valid_data_loader, _ = self._get_data_loader(valid_data, False)
-
-        self.logger.debug("Training VAE")
-        self.model = VAE(
-            item_count=self._item_dim,
-            latent_dim=self.latent_dim,
-            hidden_dim=self.hidden_dim,
-            dropout=self.dropout,
-        ).to(self.device)
-        optimizer = Adam(
-            self.model.parameters(),
-            lr=self.learning_rate,
-            weight_decay=self.l2_reg / self.batch_size_users,
-        )
-        lr_scheduler = ReduceLROnPlateau(optimizer, factor=self.factor, patience=self.patience)
-
-        self.train(
-            train_data_loader,
-            valid_data_loader,
-            optimizer,
-            lr_scheduler,
-            self.epochs,
-            "multvae",
-        )
-
-    def _loss(self, y_pred, y_true, mu_latent, logvar_latent):
-        log_softmax_var = sf.log_softmax(y_pred, dim=1)
-        bce = -(log_softmax_var * y_true).sum(dim=1).mean()
-        kld = (
-            -0.5
-            * torch.sum(
-                1 + logvar_latent - mu_latent.pow(2) - logvar_latent.exp(),
-                dim=1,
-            ).mean()
-        )
-        return bce + self.anneal * kld
-
-    def _batch_pass(self, batch, model):
-        full_batch = self.train_user_batch if model.training else self.valid_user_batch
-        user_batch = torch.FloatTensor(full_batch[batch[0]].toarray()).to(self.device)
-        pred_user_batch, latent_mu, latent_logvar = self.model.forward(user_batch)
-        return {
-            "y_pred": pred_user_batch,
-            "y_true": user_batch,
-            "mu_latent": latent_mu,
-            "logvar_latent": latent_logvar,
-        }
-
-    @staticmethod
-    def _predict_pairs_inner(
-        model: nn.Module,
-        user_idx: int,
-        items_np_history: np.ndarray,
-        items_np_to_pred: np.ndarray,
-        item_count: int,
-        cnt: Optional[int] = None,
-    ) -> SparkDataFrame:
-        model.eval()
-        with torch.no_grad():
-            user_batch = torch.zeros((1, item_count))
-            user_batch[0, items_np_history] = 1
-            user_recs = sf.softmax(model(user_batch)[0][0].detach(), dim=0)
-            if cnt is not None:
-                best_item_idx = (torch.argsort(user_recs[items_np_to_pred], descending=True)[:cnt]).numpy()
-                items_np_to_pred = items_np_to_pred[best_item_idx]
-            return PandasDataFrame(
-                {
-                    "user_idx": np.array(items_np_to_pred.shape[0] * [user_idx]),
-                    "item_idx": items_np_to_pred,
-                    "relevance": user_recs[items_np_to_pred],
-                }
-            )
-
-    @staticmethod
-    def _predict_by_user(
-        pandas_df: PandasDataFrame,
-        model: nn.Module,
-        items_np: np.ndarray,
-        k: int,
-        item_count: int,
-    ) -> PandasDataFrame:
-        return MultVAE._predict_pairs_inner(
-            model=model,
-            user_idx=pandas_df["user_idx"][0],
-            items_np_history=pandas_df["item_idx"].values,
-            items_np_to_pred=items_np,
-            item_count=item_count,
-            cnt=min(len(pandas_df) + k, len(items_np)),
-        )
-
-    @staticmethod
-    def _predict_by_user_pairs(
-        pandas_df: PandasDataFrame,
-        model: nn.Module,
-        item_count: int,
-    ) -> PandasDataFrame:
-        return MultVAE._predict_pairs_inner(
-            model=model,
-            user_idx=pandas_df["user_idx"][0],
-            items_np_history=np.array(pandas_df["item_idx_history"][0]),
-            items_np_to_pred=np.array(pandas_df["item_idx_to_pred"][0]),
-            item_count=item_count,
-            cnt=None,
-        )
-
-    def _load_model(self, path: str):
-        self.model = VAE(
-            item_count=self._item_dim,
-            latent_dim=self.latent_dim,
-            hidden_dim=self.hidden_dim,
-            dropout=self.dropout,
-        ).to(self.device)
-        self.model.load_state_dict(torch.load(path))
-        self.model.eval()
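
For reference, the objective the deleted `_loss` method computed is the annealed ELBO from the paper cited in the class docstring: a multinomial log-likelihood term plus a KL divergence between the diagonal Gaussian posterior and the standard normal prior, weighted by the `anneal` coefficient (the paper's beta). A minimal self-contained sketch of the same computation, assuming only `torch` (`multvae_loss` is an illustrative name, not an API of replay-rec):

    import torch
    import torch.nn.functional as F

    def multvae_loss(y_pred, y_true, mu, logvar, anneal=0.1):
        # Multinomial log-likelihood: log-softmax mass assigned to the
        # observed items, summed per user, averaged over the batch, negated.
        nll = -(F.log_softmax(y_pred, dim=1) * y_true).sum(dim=1).mean()
        # Closed-form KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior.
        kld = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum(dim=1).mean()
        # anneal is the beta weight on the KL term ("anneal coefficient [0,1]").
        return nll + anneal * kld

    # Toy shapes: 2 users, 5 items, 3 latent dimensions.
    loss = multvae_loss(torch.randn(2, 5), torch.randint(0, 2, (2, 5)).float(),
                        torch.zeros(2, 3), torch.zeros(2, 3))

With `anneal` below 1 the KL term is under-weighted relative to the true ELBO, which is the partial-regularization trade-off the original hyperparameter search space (`"anneal": [0.2, 1]`) explored.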