nextrec-0.4.1-py3-none-any.whl → nextrec-0.4.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +250 -112
  7. nextrec/basic/loggers.py +63 -44
  8. nextrec/basic/metrics.py +270 -120
  9. nextrec/basic/model.py +1084 -402
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +492 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +51 -45
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +273 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +103 -38
  23. nextrec/models/match/dssm.py +82 -68
  24. nextrec/models/match/dssm_v2.py +72 -57
  25. nextrec/models/match/mind.py +175 -107
  26. nextrec/models/match/sdm.py +104 -87
  27. nextrec/models/match/youtube_dnn.py +73 -59
  28. nextrec/models/multi_task/esmm.py +69 -46
  29. nextrec/models/multi_task/mmoe.py +91 -53
  30. nextrec/models/multi_task/ple.py +117 -58
  31. nextrec/models/multi_task/poso.py +163 -55
  32. nextrec/models/multi_task/share_bottom.py +63 -36
  33. nextrec/models/ranking/afm.py +80 -45
  34. nextrec/models/ranking/autoint.py +74 -57
  35. nextrec/models/ranking/dcn.py +110 -48
  36. nextrec/models/ranking/dcn_v2.py +265 -45
  37. nextrec/models/ranking/deepfm.py +39 -24
  38. nextrec/models/ranking/dien.py +335 -146
  39. nextrec/models/ranking/din.py +158 -92
  40. nextrec/models/ranking/fibinet.py +134 -52
  41. nextrec/models/ranking/fm.py +68 -26
  42. nextrec/models/ranking/masknet.py +95 -33
  43. nextrec/models/ranking/pnn.py +128 -58
  44. nextrec/models/ranking/widedeep.py +40 -28
  45. nextrec/models/ranking/xdeepfm.py +67 -40
  46. nextrec/utils/__init__.py +59 -34
  47. nextrec/utils/config.py +496 -0
  48. nextrec/utils/device.py +30 -20
  49. nextrec/utils/distributed.py +36 -9
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +33 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/model.py +22 -0
  55. nextrec/utils/optimizer.py +25 -9
  56. nextrec/utils/synthetic_data.py +283 -165
  57. nextrec/utils/tensor.py +24 -13
  58. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/METADATA +53 -24
  59. nextrec-0.4.3.dist-info/RECORD +69 -0
  60. nextrec-0.4.3.dist-info/entry_points.txt +2 -0
  61. nextrec-0.4.1.dist-info/RECORD +0 -66
  62. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/WHEEL +0 -0
  63. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,49 @@
  """
  Date: create on 09/11/2025
- Author:
- Yang Zhou,zyaztec@gmail.com
+ Author: Yang Zhou, zyaztec@gmail.com
+ Checkpoint: edit on 09/12/2025
  Reference:
- [1] Zhou G, Mou N, Fan Y, et al. Deep interest evolution network for click-through rate prediction[C]
- //Proceedings of the AAAI conference on artificial intelligence. 2019, 33(01): 5941-5948.
- (https://arxiv.org/abs/1809.03672)
+ [1] Zhou G, Mou N, Fan Y, et al. Deep interest evolution network for click-through
+ rate prediction[C] // Proceedings of the AAAI conference on artificial intelligence.
+ 2019, 33(01): 5941-5948. (https://arxiv.org/abs/1809.03672)
+
+ DIEN is a CTR prediction model that explicitly models how user interests evolve
+ over time. It introduces a two-stage pipeline:
+ (1) Interest Extraction: a GRU encodes raw behavior sequences into interest states
+ (2) Interest Evolution: an attention-aware GRU (AUGRU) updates interests by
+     focusing on behaviors most related to the target item
+ An auxiliary loss on next-click prediction guides the GRU to learn finer-grained
+ interest transitions and alleviates vanishing signals in long sequences.
+
+ Processing flow:
+ - Behavior embeddings -> DynamicGRU -> interest trajectory
+ - Target-aware attention scores highlight behaviors relevant to the candidate
+ - AUGRU modulates GRU updates with attention to emphasize impactful behaviors
+ - Final evolved interest, candidate embedding, and context features -> MLP -> CTR
+
+ Key advantages:
+ - Captures temporal evolution of user interests instead of a static summary
+ - Target-aware attention steers the evolution toward the candidate item
+ - AUGRU gates mitigate noise from irrelevant historical behaviors
+ - Auxiliary loss provides additional supervision for sequential dynamics
+
+ DIEN 是一个 CTR 预估模型,用于显式建模用户兴趣的时间演化。核心包含两阶段:
+ (1) 兴趣抽取:通过 GRU 将原始行为序列编码为兴趣状态轨迹
+ (2) 兴趣演化:利用目标感知的注意力门控 GRU(AUGRU),强调与候选目标相关的行为,
+     引导兴趣随时间更新
+ 同时引入针对下一个行为点击的辅助损失,缓解长序列信号衰减并强化兴趣转移学习。
+
+ 流程概览:
+ - 行为 embedding 输入 DynamicGRU,得到兴趣轨迹
+ - 目标相关的注意力得分突出关键行为
+ - AUGRU 用注意力调制更新,抑制无关历史噪声
+ - 最终演化兴趣 + 候选 embedding + 其他上下文特征,经 MLP 输出 CTR
+
+ 主要优点:
+ - 建模兴趣随时间的演化,而非静态聚合
+ - 目标感知注意力将兴趣演化对齐到候选物品
+ - AUGRU 门控削弱无关行为的干扰
+ - 辅助损失为序列动态提供额外监督信号
  """

  import torch
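
For readability, the two-stage flow described in the docstring above can be summarized by the update rules that the code in the following hunks implements (a reading aid, not text from the package; r_t, z_t, n_t follow the inline comments later in this diff, e_t is the behavior embedding at step t and e_a the candidate item embedding):

    h_t = \mathrm{GRU}(e_t, h_{t-1})                         % interest extraction (DynamicGRU)
    a_t = \mathrm{Attention}(h_t, e_a)                       % target-aware attention score
    z'_t = a_t \cdot z_t                                     % attention-modulated update gate
    h'_t = (1 - z'_t) \odot h'_{t-1} + z'_t \odot n_t        % interest evolution (AUGRU)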
@@ -13,37 +51,44 @@ import torch.nn as nn
  import torch.nn.functional as F

  from nextrec.basic.model import BaseModel
- from nextrec.basic.layers import EmbeddingLayer, MLP, AttentionPoolingLayer, PredictionLayer
+ from nextrec.basic.layers import (
+     EmbeddingLayer,
+     MLP,
+     AttentionPoolingLayer,
+     PredictionLayer,
+ )
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature

+
  class AUGRU(nn.Module):
      """Attention-aware GRU update gate used in DIEN (Zhou et al., 2019)."""
+
      """
      Attention-based GRU for DIEN
      Uses attention scores to weight the update of hidden states
      """
-
+
      def __init__(self, input_size, hidden_size, bias=True):
          super().__init__()
          self.input_size = input_size
          self.hidden_size = hidden_size
-
+
          self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
          self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
          if bias:
              self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
              self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
          else:
-             self.register_parameter('bias_ih', None)
-             self.register_parameter('bias_hh', None)
-
+             self.register_parameter("bias_ih", None)
+             self.register_parameter("bias_hh", None)
+
          self.reset_parameters()
-
+
      def reset_parameters(self):
          std = 1.0 / (self.hidden_size) ** 0.5
          for weight in self.parameters():
              weight.data.uniform_(-std, std)
-
+
      def forward(self, x, att_scores):
          """
          Args:
@@ -56,37 +101,43 @@ class AUGRU(nn.Module):
          batch_size, seq_len, _ = x.shape
          h = torch.zeros(batch_size, self.hidden_size, device=x.device)
          outputs = []
+
          for t in range(seq_len):
-             x_t = x[:, t, :]  # [batch_size, input_size]
-             att_t = att_scores[:, t, :]  # [batch_size, 1]
-
+             x_t = x[:, t, :]  # [B, input_size]
+             att_t = att_scores[:, t, :]  # [B, 1]
+
              gi = F.linear(x_t, self.weight_ih, self.bias_ih)
              gh = F.linear(h, self.weight_hh, self.bias_hh)
              i_r, i_i, i_n = gi.chunk(3, 1)
              h_r, h_i, h_n = gh.chunk(3, 1)
-
-             resetgate = torch.sigmoid(i_r + h_r)
-             inputgate = torch.sigmoid(i_i + h_i)
-             newgate = torch.tanh(i_n + resetgate * h_n)
-             # Use attention score to control update
-             h = (1 - att_t) * h + att_t * newgate
+
+             resetgate = torch.sigmoid(i_r + h_r)  # r_t
+             updategate = torch.sigmoid(i_i + h_i)  # z_t
+             newgate = torch.tanh(i_n + resetgate * h_n)  # n_t
+
+             # att_t: [B,1],broadcast to [B,H]
+             z_att = updategate * att_t
+
+             # h_t = (1 - z'_t) * h_{t-1} + z'_t * n_t
+             h = (1.0 - z_att) * h + z_att * newgate
              outputs.append(h.unsqueeze(1))
-         output = torch.cat(outputs, dim=1)
-
-         return output, h
+
+         output = torch.cat(outputs, dim=1)  # [B, L, H]
+         return output, h


  class DynamicGRU(nn.Module):
      """Dynamic GRU unit with auxiliary loss path from DIEN (Zhou et al., 2019)."""
+
      """
      GRU with dynamic routing for DIEN
      """
-
+
      def __init__(self, input_size, hidden_size, bias=True):
          super().__init__()
          self.input_size = input_size
          self.hidden_size = hidden_size
-
+
          # GRU parameters
          self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
          self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
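
A minimal shape check for the AUGRU.forward shown in the hunk above; this is a sketch, assuming nextrec 0.4.3 is installed and that AUGRU is importable from the module path of this file (nextrec.models.ranking.dien), with arbitrary toy sizes:

    import torch
    from nextrec.models.ranking.dien import AUGRU  # module path assumed from this file's location

    augru = AUGRU(input_size=16, hidden_size=16)
    x = torch.randn(2, 5, 16)        # [B, seq_len, input_size] interest states from the extraction GRU
    att = torch.rand(2, 5, 1)        # [B, seq_len, 1] attention weights in [0, 1]
    out, h_final = augru(x, att)
    print(out.shape, h_final.shape)  # torch.Size([2, 5, 16]) torch.Size([2, 16])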
@@ -94,49 +145,49 @@ class DynamicGRU(nn.Module):
              self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
              self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
          else:
-             self.register_parameter('bias_ih', None)
-             self.register_parameter('bias_hh', None)
-
+             self.register_parameter("bias_ih", None)
+             self.register_parameter("bias_hh", None)
+
          self.reset_parameters()
-
+
      def reset_parameters(self):
          std = 1.0 / (self.hidden_size) ** 0.5
          for weight in self.parameters():
              weight.data.uniform_(-std, std)
-
-     def forward(self, x, att_scores=None):
+
+     def forward(self, x):
          """
          Args:
              x: [batch_size, seq_len, input_size]
-             att_scores: [batch_size, seq_len] - attention scores for auxiliary loss
          Returns:
              output: [batch_size, seq_len, hidden_size]
              hidden: [batch_size, hidden_size] - final hidden state
          """
          batch_size, seq_len, _ = x.shape
-
+
          # Initialize hidden state
          h = torch.zeros(batch_size, self.hidden_size, device=x.device)
-
+
          outputs = []
          for t in range(seq_len):
              x_t = x[:, t, :]  # [batch_size, input_size]
-
+
              # GRU computation
              gi = F.linear(x_t, self.weight_ih, self.bias_ih)
              gh = F.linear(h, self.weight_hh, self.bias_hh)
              i_r, i_i, i_n = gi.chunk(3, 1)
              h_r, h_i, h_n = gh.chunk(3, 1)
-
+
              resetgate = torch.sigmoid(i_r + h_r)
-             inputgate = torch.sigmoid(i_i + h_i)
+             updategate = torch.sigmoid(i_i + h_i)
              newgate = torch.tanh(i_n + resetgate * h_n)
-             h = newgate + inputgate * (h - newgate)
-
+
+             # h_t = (1 - z_t) * h_{t-1} + z_t * n_t
+             h = (1.0 - updategate) * h + updategate * newgate
+
              outputs.append(h.unsqueeze(1))
-
+
          output = torch.cat(outputs, dim=1)  # [batch_size, seq_len, hidden_size]
-
          return output, h


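Both the removed and the added DynamicGRU update rules in the hunk above are standard GRU parameterizations; they differ only in whether the update gate z_t weights the previous state or the candidate state:

    \text{old: } h_t = n_t + z_t \odot (h_{t-1} - n_t) = (1 - z_t)\,n_t + z_t\,h_{t-1}    % convention used by torch.nn.GRU
    \text{new: } h_t = (1 - z_t) \odot h_{t-1} + z_t \odot n_t                            % convention of Cho et al. (2014)

The new form matches the comment written next to it and the attention-modulated update used in AUGRU above.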
@@ -148,29 +199,44 @@ class DIEN(BaseModel):
      @property
      def default_task(self):
          return "binary"
-
-     def __init__(self,
-                  dense_features: list[DenseFeature],
-                  sparse_features: list[SparseFeature],
-                  sequence_features: list[SequenceFeature],
-                  mlp_params: dict,
-                  gru_hidden_size: int = 64,
-                  attention_hidden_units: list[int] = [80, 40],
-                  attention_activation: str = 'sigmoid',
-                  use_negsampling: bool = False,
-                  target: list[str] = [],
-                  task: str | list[str] | None = None,
-                  optimizer: str = "adam",
-                  optimizer_params: dict = {},
-                  loss: str | nn.Module | None = "bce",
-                  loss_params: dict | list[dict] | None = None,
-                  device: str = 'cpu',
-                  embedding_l1_reg=1e-6,
-                  dense_l1_reg=1e-5,
-                  embedding_l2_reg=1e-5,
-                  dense_l2_reg=1e-4,
-                  **kwargs):
-
+
+     def __init__(
+         self,
+         dense_features: list[DenseFeature] | None = None,
+         sparse_features: list[SparseFeature] | None = None,
+         sequence_features: list[SequenceFeature] | None = None,
+         behavior_feature_name: str | None = None,
+         candidate_feature_name: str | None = None,
+         neg_behavior_feature_name: str | None = None,
+         mlp_params: dict | None = None,
+         gru_hidden_size: int = 64,
+         attention_hidden_units: list[int] | None = None,
+         attention_activation: str = "sigmoid",
+         use_negsampling: bool = False,
+         aux_loss_weight: float = 1.0,
+         target: list[str] | str | None = None,
+         task: str | list[str] | None = None,
+         optimizer: str = "adam",
+         optimizer_params: dict | None = None,
+         loss: str | nn.Module | None = "bce",
+         loss_params: dict | list[dict] | None = None,
+         device: str = "cpu",
+         embedding_l1_reg=1e-6,
+         dense_l1_reg=1e-5,
+         embedding_l2_reg=1e-5,
+         dense_l2_reg=1e-4,
+         **kwargs,
+     ):
+
+         dense_features = dense_features or []
+         sparse_features = sparse_features or []
+         sequence_features = sequence_features or []
+         mlp_params = mlp_params or {}
+         attention_hidden_units = attention_hidden_units or [80, 40]
+         optimizer_params = optimizer_params or {}
+         if loss is None:
+             loss = "bce"
+
          super(DIEN, self).__init__(
              dense_features=dense_features,
              sparse_features=sparse_features,
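
The new signature above also replaces mutable defaults ([80, 40], {}, []) with None plus in-body fallbacks (attention_hidden_units or [80, 40], and so on). A plausible motivation is Python's shared-mutable-default pitfall, illustrated below with hypothetical helpers that are not part of the package:

    def bad(units=[80, 40]):          # one list object is shared by every call
        units.append(1)
        return units

    print(bad())   # [80, 40, 1]
    print(bad())   # [80, 40, 1, 1]  <- state leaks between calls

    def good(units=None):             # pattern used by DIEN.__init__ in 0.4.3
        units = units or [80, 40]     # a fresh list on every call
        units.append(1)
        return units

    print(good())  # [80, 40, 1]
    print(good())  # [80, 40, 1]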
@@ -182,138 +248,261 @@ class DIEN(BaseModel):
              dense_l1_reg=dense_l1_reg,
              embedding_l2_reg=embedding_l2_reg,
              dense_l2_reg=dense_l2_reg,
-             **kwargs
+             **kwargs,
          )

          self.loss = loss
-         if self.loss is None:
-             self.loss = "bce"
-
          self.use_negsampling = use_negsampling
-
-         # Features classification
+         self.aux_loss_weight = float(aux_loss_weight)
+         self.auxiliary_cache = None
+
          if len(sequence_features) == 0:
-             raise ValueError("DIEN requires at least one sequence feature for user behavior history")
-
-         self.behavior_feature = sequence_features[0]  # User behavior sequence
-         self.candidate_feature = sparse_features[-1] if sparse_features else None  # Candidate item
-
-         self.other_sparse_features = sparse_features[:-1] if self.candidate_feature else sparse_features
-         self.dense_features_list = dense_features
+             raise ValueError(
+                 "DIEN requires at least one sequence feature for user behavior history"
+             )
+
+         if behavior_feature_name is None:
+             raise ValueError(
+                 "DIEN requires at least one sequence feature as behavior item feature"
+             )
+
+         if candidate_feature_name is None:
+             raise ValueError(
+                 "DIEN requires at least one sparse_feature as candidate item feature"
+             )
+
+         self.behavior_feature = [
+             f for f in sequence_features if f.name == behavior_feature_name
+         ][0]
+         self.candidate_feature = [
+             f for f in sparse_features if f.name == candidate_feature_name
+         ][0]
+
+         self.other_sparse_features = (
+             sparse_features[:-1] if self.candidate_feature else sparse_features
+         )
+
+         self.neg_behavior_feature = None

          # Embedding layer
          self.embedding = EmbeddingLayer(features=self.all_features)
-
+
          behavior_emb_dim = self.behavior_feature.embedding_dim
+
+         # projection candidate feature to match GRU hidden size if needed
          self.candidate_proj = None
-         if self.candidate_feature is not None and self.candidate_feature.embedding_dim != gru_hidden_size:
-             self.candidate_proj = nn.Linear(self.candidate_feature.embedding_dim, gru_hidden_size)
-
-         # Interest Extractor Layer (GRU)
+         if (
+             self.candidate_feature is not None
+             and self.candidate_feature.embedding_dim != gru_hidden_size
+         ):
+             self.candidate_proj = nn.Linear(
+                 self.candidate_feature.embedding_dim, gru_hidden_size
+             )
+
+         # gru for interest extraction
          self.interest_extractor = DynamicGRU(
-             input_size=behavior_emb_dim,
-             hidden_size=gru_hidden_size
+             input_size=behavior_emb_dim, hidden_size=gru_hidden_size
          )
-
-         # Attention layer for computing attention scores
+
          self.attention_layer = AttentionPoolingLayer(
              embedding_dim=gru_hidden_size,
              hidden_units=attention_hidden_units,
              activation=attention_activation,
-             use_softmax=False  # We'll use scores directly for AUGRU
+             use_softmax=False,
          )
-
+
          # Interest Evolution Layer (AUGRU)
-         self.interest_evolution = AUGRU(input_size=gru_hidden_size, hidden_size=gru_hidden_size)
-
-         # Calculate MLP input dimension
+         self.interest_evolution = AUGRU(
+             input_size=gru_hidden_size, hidden_size=gru_hidden_size
+         )
+
+         # build auxiliary loss net if provided neg sampling and neg_behavior_feature_name
+         # auxiliary loss uses the interest states to predict the next behavior in the sequence
+         # that's the second task of DIEN
+         if self.use_negsampling:
+             neg_candidates = [
+                 f for f in sequence_features if f.name == neg_behavior_feature_name
+             ]
+             if len(neg_candidates) == 0:
+                 raise ValueError(
+                     f"use_negsampling=True requires a negative sequence feature named '{neg_behavior_feature_name}'"
+                 )
+             self.neg_behavior_feature = neg_candidates[0]
+             self.auxiliary_net = nn.Sequential(
+                 nn.Linear(gru_hidden_size + behavior_emb_dim, gru_hidden_size),
+                 nn.PReLU(),
+                 nn.Linear(gru_hidden_size, 1),
+             )
+         else:
+             self.auxiliary_net = None
+
          mlp_input_dim = 0
          if self.candidate_feature:
              mlp_input_dim += self.candidate_feature.embedding_dim
          mlp_input_dim += gru_hidden_size  # final interest state
          mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
-         mlp_input_dim += sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
-         # MLP for final prediction
+         mlp_input_dim += sum(
+             [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
+         )
+
          self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
          self.prediction_layer = PredictionLayer(task_type=self.task)
-         # Register regularization weights
-         self.register_regularization_weights(embedding_attr='embedding', include_modules=['interest_extractor', 'interest_evolution', 'attention_layer', 'mlp', 'candidate_proj'])
-         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)
+
+         self.register_regularization_weights(
+             embedding_attr="embedding",
+             include_modules=[
+                 "interest_extractor",
+                 "interest_evolution",
+                 "attention_layer",
+                 "mlp",
+                 "candidate_proj",
+                 "auxiliary_net",
+             ],
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss,
+             loss_params=loss_params,
+         )

      def forward(self, x):
-         # Get candidate item embedding
+         self.auxiliary_cache = None
          if self.candidate_feature:
-             candidate_emb = self.embedding.embed_dict[self.candidate_feature.embedding_name](x[self.candidate_feature.name].long())  # [B, emb_dim]
+             candidate_emb = self.embedding.embed_dict[
+                 self.candidate_feature.embedding_name
+             ](
+                 x[self.candidate_feature.name].long()
+             )  # [B, emb_dim]
          else:
              raise ValueError("DIEN requires a candidate item feature")
-
-         # Get behavior sequence embedding
+
          behavior_seq = x[self.behavior_feature.name].long()  # [B, seq_len]
-         behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](behavior_seq)  # [B, seq_len, emb_dim]
-
-         # Create mask for padding
+         behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
+             behavior_seq
+         )  # [B, seq_len, emb_dim]
+
          if self.behavior_feature.padding_idx is not None:
-             mask = (behavior_seq != self.behavior_feature.padding_idx).unsqueeze(-1).float()
+             mask = (behavior_seq != self.behavior_feature.padding_idx).unsqueeze(-1)
          else:
-             mask = (behavior_seq != 0).unsqueeze(-1).float()
-
-         # Step 1: Interest Extractor (GRU)
-         interest_states, _ = self.interest_extractor(behavior_emb)  # [B, seq_len, hidden_size]
-
-         # Step 2: Compute attention scores for each time step
+             mask = (behavior_seq != 0).unsqueeze(-1)
+         mask = mask.float()  # [B, seq_len, 1]
+
+         interest_states, _ = self.interest_extractor(
+             behavior_emb
+         )  # [B, seq_len, hidden_size]
+
          batch_size, seq_len, hidden_size = interest_states.shape
-
-         # Project candidate to hidden_size if necessary (defined in __init__)
+
          if self.candidate_proj is not None:
              candidate_for_attention = self.candidate_proj(candidate_emb)
          else:
-             candidate_for_attention = candidate_emb
-
-         # Compute attention scores for AUGRU
-         attention_scores = []
+             candidate_for_attention = candidate_emb  # [B, hidden_size]
+         att_scores_list = []
          for t in range(seq_len):
-             score = self.attention_layer.attention_net(
-                 torch.cat([
+             # [B, 4H]
+             concat_feat = torch.cat(
+                 [
                      candidate_for_attention,
                      interest_states[:, t, :],
                      candidate_for_attention - interest_states[:, t, :],
-                     candidate_for_attention * interest_states[:, t, :]
-                 ], dim=-1)
-             )  # [B, 1]
-             attention_scores.append(score)
-
-         attention_scores = torch.cat(attention_scores, dim=1).unsqueeze(-1)  # [B, seq_len, 1]
-         attention_scores = torch.sigmoid(attention_scores)  # Normalize to [0, 1]
-
-         # Apply mask to attention scores
-         attention_scores = attention_scores * mask
-
-         # Step 3: Interest Evolution (AUGRU)
+                     candidate_for_attention * interest_states[:, t, :],
+                 ],
+                 dim=-1,
+             )
+             score_t = self.attention_layer.attention_net(concat_feat)  # [B, 1]
+             att_scores_list.append(score_t)
+
+         # [B, seq_len, 1]
+         att_scores = torch.cat(att_scores_list, dim=1)
+
+         scores_flat = att_scores.squeeze(-1)  # [B, seq_len]
+         mask_flat = mask.squeeze(-1)  # [B, seq_len]
+
+         scores_flat = scores_flat.masked_fill(mask_flat == 0, -1e9)
+         att_weights = torch.softmax(scores_flat, dim=1)  # [B, seq_len]
+         att_weights = att_weights.unsqueeze(-1)  # [B, seq_len, 1]
+
+         att_weights = att_weights * mask
+
+         # 6. Interest Evolution(AUGRU)
          final_states, final_interest = self.interest_evolution(
-             interest_states,
-             attention_scores
+             interest_states, att_weights
          )  # final_interest: [B, hidden_size]
-
-         # Get other features
+
+         if self.use_negsampling and self.training:
+             if self.neg_behavior_feature is None:
+                 raise ValueError(
+                     "Negative behavior feature is not configured while use_negsampling=True"
+                 )
+             neg_seq = x[self.neg_behavior_feature.name].long()
+             neg_behavior_emb = self.embedding.embed_dict[
+                 self.neg_behavior_feature.embedding_name
+             ](neg_seq)
+             self.auxiliary_cache = {
+                 "interest_states": interest_states,
+                 "behavior_emb": behavior_emb,
+                 "neg_behavior_emb": neg_behavior_emb,
+                 "mask": mask,
+             }
+
          other_embeddings = []
          other_embeddings.append(candidate_emb)
          other_embeddings.append(final_interest)
-
-         # Other sparse features
+
          for feat in self.other_sparse_features:
-             feat_emb = self.embedding.embed_dict[feat.embedding_name](x[feat.name].long())
+             feat_emb = self.embedding.embed_dict[feat.embedding_name](
+                 x[feat.name].long()
+             )
              other_embeddings.append(feat_emb)
-
-         # Dense features
-         for feat in self.dense_features_list:
+
+         for feat in self.dense_features:
              val = x[feat.name].float()
              if val.dim() == 1:
                  val = val.unsqueeze(1)
              other_embeddings.append(val)
-
-         # Concatenate all features
+
          concat_input = torch.cat(other_embeddings, dim=-1)  # [B, total_dim]
-
-         # MLP prediction
+
          y = self.mlp(concat_input)  # [B, 1]
          return self.prediction_layer(y)
+
+     def compute_auxiliary_loss(self):
+         if not (self.training and self.use_negsampling and self.auxiliary_net):
+             return torch.tensor(0.0, device=self.device)
+         if self.auxiliary_cache is None:
+             return torch.tensor(0.0, device=self.device)
+
+         interest_states = self.auxiliary_cache["interest_states"]
+         behavior_emb = self.auxiliary_cache["behavior_emb"]
+         neg_behavior_emb = self.auxiliary_cache["neg_behavior_emb"]
+         mask = self.auxiliary_cache["mask"]
+
+         interest_states = interest_states[:, :-1, :]
+         pos_seq = behavior_emb[:, 1:, :]
+         neg_seq = neg_behavior_emb[:, 1:, :]
+         aux_mask = mask[:, 1:, :].squeeze(-1)
+
+         if aux_mask.sum() == 0:
+             return torch.tensor(0.0, device=self.device)
+
+         pos_input = torch.cat([interest_states, pos_seq], dim=-1)
+         neg_input = torch.cat([interest_states, neg_seq], dim=-1)
+         pos_logits = self.auxiliary_net(pos_input).squeeze(-1)
+         neg_logits = self.auxiliary_net(neg_input).squeeze(-1)
+
+         pos_loss = F.binary_cross_entropy_with_logits(
+             pos_logits, torch.ones_like(pos_logits), reduction="none"
+         )
+         neg_loss = F.binary_cross_entropy_with_logits(
+             neg_logits, torch.zeros_like(neg_logits), reduction="none"
+         )
+         aux_loss = (pos_loss + neg_loss) * aux_mask
+         aux_loss = aux_loss.sum() / torch.clamp(aux_mask.sum(), min=1.0)
+         return aux_loss
+
+     def compute_loss(self, y_pred, y_true):
+         main_loss = super().compute_loss(y_pred, y_true)
+         aux_loss = self.compute_auxiliary_loss()
+         return main_loss + self.aux_loss_weight * aux_loss
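
The compute_auxiliary_loss added above can be read in isolation as the following self-contained sketch (plain torch; the shapes B, L, H, E for batch, sequence length, GRU hidden size, and behavior embedding dim are arbitrary, and auxiliary_loss_sketch plus the toy net are hypothetical names mirroring the nn.Sequential built in __init__):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def auxiliary_loss_sketch(interest_states, pos_emb, neg_emb, mask, auxiliary_net):
        # Interest state at step t scores the real behavior at t+1 against a sampled
        # negative behavior; padded positions are masked out of the average.
        states = interest_states[:, :-1, :]
        pos, neg = pos_emb[:, 1:, :], neg_emb[:, 1:, :]
        aux_mask = mask[:, 1:, :].squeeze(-1)  # [B, L-1]
        pos_logits = auxiliary_net(torch.cat([states, pos], dim=-1)).squeeze(-1)
        neg_logits = auxiliary_net(torch.cat([states, neg], dim=-1)).squeeze(-1)
        loss = (
            F.binary_cross_entropy_with_logits(pos_logits, torch.ones_like(pos_logits), reduction="none")
            + F.binary_cross_entropy_with_logits(neg_logits, torch.zeros_like(neg_logits), reduction="none")
        ) * aux_mask
        return loss.sum() / torch.clamp(aux_mask.sum(), min=1.0)

    B, L, H, E = 2, 6, 64, 32
    net = nn.Sequential(nn.Linear(H + E, H), nn.PReLU(), nn.Linear(H, 1))
    print(auxiliary_loss_sketch(
        torch.randn(B, L, H), torch.randn(B, L, E), torch.randn(B, L, E), torch.ones(B, L, 1), net,
    ))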