replay-rec 0.18.0rc0__py3-none-any.whl → 0.18.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- replay/__init__.py +1 -1
- replay/data/dataset.py +27 -1
- replay/data/dataset_utils/dataset_label_encoder.py +6 -3
- replay/data/nn/schema.py +37 -16
- replay/data/nn/sequence_tokenizer.py +313 -165
- replay/data/nn/torch_sequential_dataset.py +17 -8
- replay/data/nn/utils.py +14 -7
- replay/data/schema.py +10 -6
- replay/metrics/offline_metrics.py +2 -2
- replay/models/__init__.py +1 -0
- replay/models/base_rec.py +18 -21
- replay/models/lin_ucb.py +407 -0
- replay/models/nn/sequential/bert4rec/dataset.py +17 -4
- replay/models/nn/sequential/bert4rec/lightning.py +121 -54
- replay/models/nn/sequential/bert4rec/model.py +21 -0
- replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
- replay/models/nn/sequential/compiled/__init__.py +5 -0
- replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
- replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
- replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
- replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
- replay/models/nn/sequential/sasrec/dataset.py +17 -1
- replay/models/nn/sequential/sasrec/lightning.py +126 -50
- replay/models/nn/sequential/sasrec/model.py +3 -4
- replay/preprocessing/__init__.py +7 -1
- replay/preprocessing/discretizer.py +719 -0
- replay/preprocessing/label_encoder.py +384 -52
- replay/splitters/cold_user_random_splitter.py +1 -1
- replay/utils/__init__.py +1 -0
- replay/utils/common.py +7 -8
- replay/utils/session_handler.py +3 -4
- replay/utils/spark_utils.py +15 -1
- replay/utils/types.py +8 -0
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/METADATA +75 -70
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/RECORD +37 -84
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/WHEEL +1 -1
- replay/experimental/__init__.py +0 -0
- replay/experimental/metrics/__init__.py +0 -62
- replay/experimental/metrics/base_metric.py +0 -602
- replay/experimental/metrics/coverage.py +0 -97
- replay/experimental/metrics/experiment.py +0 -175
- replay/experimental/metrics/hitrate.py +0 -26
- replay/experimental/metrics/map.py +0 -30
- replay/experimental/metrics/mrr.py +0 -18
- replay/experimental/metrics/ncis_precision.py +0 -31
- replay/experimental/metrics/ndcg.py +0 -49
- replay/experimental/metrics/precision.py +0 -22
- replay/experimental/metrics/recall.py +0 -25
- replay/experimental/metrics/rocauc.py +0 -49
- replay/experimental/metrics/surprisal.py +0 -90
- replay/experimental/metrics/unexpectedness.py +0 -76
- replay/experimental/models/__init__.py +0 -10
- replay/experimental/models/admm_slim.py +0 -205
- replay/experimental/models/base_neighbour_rec.py +0 -204
- replay/experimental/models/base_rec.py +0 -1271
- replay/experimental/models/base_torch_rec.py +0 -234
- replay/experimental/models/cql.py +0 -454
- replay/experimental/models/ddpg.py +0 -923
- replay/experimental/models/dt4rec/__init__.py +0 -0
- replay/experimental/models/dt4rec/dt4rec.py +0 -189
- replay/experimental/models/dt4rec/gpt1.py +0 -401
- replay/experimental/models/dt4rec/trainer.py +0 -127
- replay/experimental/models/dt4rec/utils.py +0 -265
- replay/experimental/models/extensions/spark_custom_models/__init__.py +0 -0
- replay/experimental/models/extensions/spark_custom_models/als_extension.py +0 -792
- replay/experimental/models/implicit_wrap.py +0 -131
- replay/experimental/models/lightfm_wrap.py +0 -302
- replay/experimental/models/mult_vae.py +0 -332
- replay/experimental/models/neuromf.py +0 -406
- replay/experimental/models/scala_als.py +0 -296
- replay/experimental/nn/data/__init__.py +0 -1
- replay/experimental/nn/data/schema_builder.py +0 -55
- replay/experimental/preprocessing/__init__.py +0 -3
- replay/experimental/preprocessing/data_preparator.py +0 -839
- replay/experimental/preprocessing/padder.py +0 -229
- replay/experimental/preprocessing/sequence_generator.py +0 -208
- replay/experimental/scenarios/__init__.py +0 -1
- replay/experimental/scenarios/obp_wrapper/__init__.py +0 -8
- replay/experimental/scenarios/obp_wrapper/obp_optuna_objective.py +0 -74
- replay/experimental/scenarios/obp_wrapper/replay_offline.py +0 -248
- replay/experimental/scenarios/obp_wrapper/utils.py +0 -87
- replay/experimental/scenarios/two_stages/__init__.py +0 -0
- replay/experimental/scenarios/two_stages/reranker.py +0 -117
- replay/experimental/scenarios/two_stages/two_stages_scenario.py +0 -757
- replay/experimental/utils/__init__.py +0 -0
- replay/experimental/utils/logger.py +0 -24
- replay/experimental/utils/model_handler.py +0 -186
- replay/experimental/utils/session_handler.py +0 -44
- replay_rec-0.18.0rc0.dist-info/NOTICE +0 -41
- {replay_rec-0.18.0rc0.dist-info → replay_rec-0.18.1.dist-info}/LICENSE +0 -0
replay/experimental/models/mult_vae.py

@@ -1,332 +0,0 @@
-"""
-MultVAE implementation
-(Variational Autoencoders for Collaborative Filtering)
-"""
-
-from typing import Optional, Tuple
-
-import numpy as np
-import torch
-import torch.nn.functional as sf
-from scipy.sparse import csr_matrix
-from sklearn.model_selection import GroupShuffleSplit
-from torch import nn
-from torch.optim import Adam
-from torch.optim.lr_scheduler import ReduceLROnPlateau
-from torch.utils.data import DataLoader, TensorDataset
-
-from replay.experimental.models.base_torch_rec import TorchRecommender
-from replay.utils import PandasDataFrame, SparkDataFrame
-
-
-class VAE(nn.Module):
-    """Base variational autoencoder"""
-
-    def __init__(
-        self,
-        item_count: int,
-        latent_dim: int,
-        hidden_dim: int = 600,
-        dropout: float = 0.3,
-    ):
-        """
-        :param item_count: number of items
-        :param latent_dim: latent dimension size
-        :param hidden_dim: hidden dimension size for encoder and decoder
-        :param dropout: dropout coefficient
-        """
-        super().__init__()
-
-        self.latent_dim = latent_dim
-        self.encoder_dims = [item_count, hidden_dim, latent_dim * 2]
-        self.decoder_dims = [latent_dim, hidden_dim, item_count]
-
-        self.encoder = nn.ModuleList(
-            [nn.Linear(d_in, d_out) for d_in, d_out in zip(self.encoder_dims[:-1], self.encoder_dims[1:])]
-        )
-        self.decoder = nn.ModuleList(
-            [nn.Linear(d_in, d_out) for d_in, d_out in zip(self.decoder_dims[:-1], self.decoder_dims[1:])]
-        )
-        self.dropout = nn.Dropout(dropout)
-        self.activation = torch.nn.ReLU()
-
-        for layer in self.encoder:
-            self.weight_init(layer)
-
-        for layer in self.decoder:
-            self.weight_init(layer)
-
-    def encode(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Encode"""
-        hidden = sf.normalize(batch, p=2, dim=1)
-        hidden = self.dropout(hidden)
-
-        for layer in self.encoder[:-1]:
-            hidden = layer(hidden)
-            hidden = self.activation(hidden)
-
-        hidden = self.encoder[-1](hidden)
-        mu_latent = hidden[:, : self.latent_dim]
-        logvar_latent = hidden[:, self.latent_dim :]
-        return mu_latent, logvar_latent
-
-    def reparameterize(self, mu_latent: torch.Tensor, logvar_latent: torch.Tensor) -> torch.Tensor:
-        """Reparametrization trick"""
-
-        if self.training:
-            std = torch.exp(0.5 * logvar_latent)
-            eps = torch.randn_like(std)
-            return eps * std + mu_latent
-        return mu_latent
-
-    def decode(self, z_latent: torch.Tensor) -> torch.Tensor:
-        """Decode"""
-        hidden = z_latent
-        for layer in self.decoder[:-1]:
-            hidden = layer(hidden)
-            hidden = self.activation(hidden)
-        return self.decoder[-1](hidden)
-
-    def forward(self, batch: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        :param batch: user batch
-        :return: output, expectation and logarithm of variation
-        """
-        mu_latent, logvar_latent = self.encode(batch)
-        z_latent = self.reparameterize(mu_latent, logvar_latent)
-        return self.decode(z_latent), mu_latent, logvar_latent
-
-    @staticmethod
-    def weight_init(layer: nn.Module):
-        """
-        Xavier initialization
-
-        :param layer: layer of a model
-        """
-        if isinstance(layer, nn.Linear):
-            nn.init.xavier_normal_(layer.weight.data)
-            layer.bias.data.normal_(0.0, 0.001)
-
-
-class MultVAE(TorchRecommender):
-    """`Variational Autoencoders for Collaborative Filtering
-    <https://arxiv.org/pdf/1802.05814.pdf>`_"""
-
-    num_workers: int = 0
-    batch_size_users: int = 5000
-    patience: int = 10
-    n_saved: int = 2
-    valid_split_size: float = 0.1
-    seed: int = 42
-    can_predict_cold_users = True
-    train_user_batch: csr_matrix
-    valid_user_batch: csr_matrix
-    _search_space = {
-        "learning_rate": {"type": "loguniform", "args": [0.0001, 0.5]},
-        "epochs": {"type": "int", "args": [100, 100]},
-        "latent_dim": {"type": "int", "args": [200, 200]},
-        "hidden_dim": {"type": "int", "args": [600, 600]},
-        "dropout": {"type": "uniform", "args": [0, 0.5]},
-        "anneal": {"type": "uniform", "args": [0.2, 1]},
-        "l2_reg": {"type": "loguniform", "args": [1e-9, 5]},
-        "factor": {"type": "uniform", "args": [0.2, 0.2]},
-        "patience": {"type": "int", "args": [3, 3]},
-    }
-
-    def __init__(
-        self,
-        learning_rate: float = 0.01,
-        epochs: int = 100,
-        latent_dim: int = 200,
-        hidden_dim: int = 600,
-        dropout: float = 0.3,
-        anneal: float = 0.1,
-        l2_reg: float = 0,
-        factor: float = 0.2,
-        patience: int = 3,
-    ):
-        """
-        :param learning_rate: learning rate
-        :param epochs: number of epochs to train model
-        :param latent_dim: latent dimension size for user vectors
-        :param hidden_dim: hidden dimension size for encoder and decoder
-        :param dropout: dropout coefficient
-        :param anneal: anneal coefficient [0,1]
-        :param l2_reg: l2 regularization term
-        :param factor: ReduceLROnPlateau reducing factor. new_lr = lr * factor
-        :param patience: number of non-improved epochs before reducing lr
-        """
-        super().__init__()
-        self.learning_rate = learning_rate
-        self.epochs = epochs
-        self.latent_dim = latent_dim
-        self.hidden_dim = hidden_dim
-        self.dropout = dropout
-        self.anneal = anneal
-        self.l2_reg = l2_reg
-        self.factor = factor
-        self.patience = patience
-
-    @property
-    def _init_args(self):
-        return {
-            "learning_rate": self.learning_rate,
-            "epochs": self.epochs,
-            "latent_dim": self.latent_dim,
-            "hidden_dim": self.hidden_dim,
-            "dropout": self.dropout,
-            "anneal": self.anneal,
-            "l2_reg": self.l2_reg,
-            "factor": self.factor,
-            "patience": self.patience,
-        }
-
-    def _get_data_loader(
-        self, data: PandasDataFrame, shuffle: bool = True
-    ) -> Tuple[csr_matrix, DataLoader, np.ndarray]:
-        """get data loader and matrix with data"""
-        users_count = data["user_idx"].value_counts().count()
-        user_idx = data["user_idx"].astype("category").cat
-        user_batch = csr_matrix(
-            (
-                np.ones(len(data["user_idx"])),
-                ([user_idx.codes.values, data["item_idx"].values]),
-            ),
-            shape=(users_count, self._item_dim),
-        )
-        data_loader = DataLoader(
-            TensorDataset(torch.arange(users_count).long()),
-            batch_size=self.batch_size_users,
-            shuffle=shuffle,
-            num_workers=self.num_workers,
-        )
-
-        return user_batch, data_loader, user_idx.categories.values
-
-    def _fit(
-        self,
-        log: SparkDataFrame,
-        user_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-        item_features: Optional[SparkDataFrame] = None,  # noqa: ARG002
-    ) -> None:
-        self.logger.debug("Creating batch")
-        data = log.select("user_idx", "item_idx").toPandas()
-        splitter = GroupShuffleSplit(n_splits=1, test_size=self.valid_split_size, random_state=self.seed)
-        train_idx, valid_idx = next(splitter.split(data, groups=data["user_idx"]))
-        train_data, valid_data = data.iloc[train_idx], data.iloc[valid_idx]
-
-        self.train_user_batch, train_data_loader, _ = self._get_data_loader(train_data)
-        self.valid_user_batch, valid_data_loader, _ = self._get_data_loader(valid_data, False)
-
-        self.logger.debug("Training VAE")
-        self.model = VAE(
-            item_count=self._item_dim,
-            latent_dim=self.latent_dim,
-            hidden_dim=self.hidden_dim,
-            dropout=self.dropout,
-        ).to(self.device)
-        optimizer = Adam(
-            self.model.parameters(),
-            lr=self.learning_rate,
-            weight_decay=self.l2_reg / self.batch_size_users,
-        )
-        lr_scheduler = ReduceLROnPlateau(optimizer, factor=self.factor, patience=self.patience)
-
-        self.train(
-            train_data_loader,
-            valid_data_loader,
-            optimizer,
-            lr_scheduler,
-            self.epochs,
-            "multvae",
-        )
-
-    def _loss(self, y_pred, y_true, mu_latent, logvar_latent):
-        log_softmax_var = sf.log_softmax(y_pred, dim=1)
-        bce = -(log_softmax_var * y_true).sum(dim=1).mean()
-        kld = (
-            -0.5
-            * torch.sum(
-                1 + logvar_latent - mu_latent.pow(2) - logvar_latent.exp(),
-                dim=1,
-            ).mean()
-        )
-        return bce + self.anneal * kld
-
-    def _batch_pass(self, batch, model):
-        full_batch = self.train_user_batch if model.training else self.valid_user_batch
-        user_batch = torch.FloatTensor(full_batch[batch[0]].toarray()).to(self.device)
-        pred_user_batch, latent_mu, latent_logvar = self.model.forward(user_batch)
-        return {
-            "y_pred": pred_user_batch,
-            "y_true": user_batch,
-            "mu_latent": latent_mu,
-            "logvar_latent": latent_logvar,
-        }
-
-    @staticmethod
-    def _predict_pairs_inner(
-        model: nn.Module,
-        user_idx: int,
-        items_np_history: np.ndarray,
-        items_np_to_pred: np.ndarray,
-        item_count: int,
-        cnt: Optional[int] = None,
-    ) -> SparkDataFrame:
-        model.eval()
-        with torch.no_grad():
-            user_batch = torch.zeros((1, item_count))
-            user_batch[0, items_np_history] = 1
-            user_recs = sf.softmax(model(user_batch)[0][0].detach(), dim=0)
-            if cnt is not None:
-                best_item_idx = (torch.argsort(user_recs[items_np_to_pred], descending=True)[:cnt]).numpy()
-                items_np_to_pred = items_np_to_pred[best_item_idx]
-            return PandasDataFrame(
-                {
-                    "user_idx": np.array(items_np_to_pred.shape[0] * [user_idx]),
-                    "item_idx": items_np_to_pred,
-                    "relevance": user_recs[items_np_to_pred],
-                }
-            )
-
-    @staticmethod
-    def _predict_by_user(
-        pandas_df: PandasDataFrame,
-        model: nn.Module,
-        items_np: np.ndarray,
-        k: int,
-        item_count: int,
-    ) -> PandasDataFrame:
-        return MultVAE._predict_pairs_inner(
-            model=model,
-            user_idx=pandas_df["user_idx"][0],
-            items_np_history=pandas_df["item_idx"].values,
-            items_np_to_pred=items_np,
-            item_count=item_count,
-            cnt=min(len(pandas_df) + k, len(items_np)),
-        )
-
-    @staticmethod
-    def _predict_by_user_pairs(
-        pandas_df: PandasDataFrame,
-        model: nn.Module,
-        item_count: int,
-    ) -> PandasDataFrame:
-        return MultVAE._predict_pairs_inner(
-            model=model,
-            user_idx=pandas_df["user_idx"][0],
-            items_np_history=np.array(pandas_df["item_idx_history"][0]),
-            items_np_to_pred=np.array(pandas_df["item_idx_to_pred"][0]),
-            item_count=item_count,
-            cnt=None,
-        )
-
-    def _load_model(self, path: str):
-        self.model = VAE(
-            item_count=self._item_dim,
-            latent_dim=self.latent_dim,
-            hidden_dim=self.hidden_dim,
-            dropout=self.dropout,
-        ).to(self.device)
-        self.model.load_state_dict(torch.load(path))
-        self.model.eval()