nextrec 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +250 -112
- nextrec/basic/loggers.py +63 -44
- nextrec/basic/metrics.py +270 -120
- nextrec/basic/model.py +1084 -402
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +492 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +273 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +69 -46
- nextrec/models/multi_task/mmoe.py +91 -53
- nextrec/models/multi_task/ple.py +117 -58
- nextrec/models/multi_task/poso.py +163 -55
- nextrec/models/multi_task/share_bottom.py +63 -36
- nextrec/models/ranking/afm.py +80 -45
- nextrec/models/ranking/autoint.py +74 -57
- nextrec/models/ranking/dcn.py +110 -48
- nextrec/models/ranking/dcn_v2.py +265 -45
- nextrec/models/ranking/deepfm.py +39 -24
- nextrec/models/ranking/dien.py +335 -146
- nextrec/models/ranking/din.py +158 -92
- nextrec/models/ranking/fibinet.py +134 -52
- nextrec/models/ranking/fm.py +68 -26
- nextrec/models/ranking/masknet.py +95 -33
- nextrec/models/ranking/pnn.py +128 -58
- nextrec/models/ranking/widedeep.py +40 -28
- nextrec/models/ranking/xdeepfm.py +67 -40
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +496 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +33 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/model.py +22 -0
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/METADATA +53 -24
- nextrec-0.4.3.dist-info/RECORD +69 -0
- nextrec-0.4.3.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/dien.py
CHANGED
|
@@ -1,11 +1,49 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
Author:
|
|
4
|
-
|
|
3
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
4
|
+
Checkpoint: edit on 09/12/2025
|
|
5
5
|
Reference:
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
[1] Zhou G, Mou N, Fan Y, et al. Deep interest evolution network for click-through
|
|
7
|
+
rate prediction[C] // Proceedings of the AAAI conference on artificial intelligence.
|
|
8
|
+
2019, 33(01): 5941-5948. (https://arxiv.org/abs/1809.03672)
|
|
9
|
+
|
|
10
|
+
DIEN is a CTR prediction model that explicitly models how user interests evolve
|
|
11
|
+
over time. It introduces a two-stage pipeline:
|
|
12
|
+
(1) Interest Extraction: a GRU encodes raw behavior sequences into interest states
|
|
13
|
+
(2) Interest Evolution: an attention-aware GRU (AUGRU) updates interests by
|
|
14
|
+
focusing on behaviors most related to the target item
|
|
15
|
+
An auxiliary loss on next-click prediction guides the GRU to learn finer-grained
|
|
16
|
+
interest transitions and alleviates vanishing signals in long sequences.
|
|
17
|
+
|
|
18
|
+
Processing flow:
|
|
19
|
+
- Behavior embeddings -> DynamicGRU -> interest trajectory
|
|
20
|
+
- Target-aware attention scores highlight behaviors relevant to the candidate
|
|
21
|
+
- AUGRU modulates GRU updates with attention to emphasize impactful behaviors
|
|
22
|
+
- Final evolved interest, candidate embedding, and context features -> MLP -> CTR
|
|
23
|
+
|
|
24
|
+
Key advantages:
|
|
25
|
+
- Captures temporal evolution of user interests instead of a static summary
|
|
26
|
+
- Target-aware attention steers the evolution toward the candidate item
|
|
27
|
+
- AUGRU gates mitigate noise from irrelevant historical behaviors
|
|
28
|
+
- Auxiliary loss provides additional supervision for sequential dynamics
|
|
29
|
+
|
|
30
|
+
DIEN 是一个 CTR 预估模型,用于显式建模用户兴趣的时间演化。核心包含两阶段:
|
|
31
|
+
(1) 兴趣抽取:通过 GRU 将原始行为序列编码为兴趣状态轨迹
|
|
32
|
+
(2) 兴趣演化:利用目标感知的注意力门控 GRU(AUGRU),强调与候选目标相关的行为,
|
|
33
|
+
引导兴趣随时间更新
|
|
34
|
+
同时引入针对下一个行为点击的辅助损失,缓解长序列信号衰减并强化兴趣转移学习。
|
|
35
|
+
|
|
36
|
+
流程概览:
|
|
37
|
+
- 行为 embedding 输入 DynamicGRU,得到兴趣轨迹
|
|
38
|
+
- 目标相关的注意力得分突出关键行为
|
|
39
|
+
- AUGRU 用注意力调制更新,抑制无关历史噪声
|
|
40
|
+
- 最终演化兴趣 + 候选 embedding + 其他上下文特征,经 MLP 输出 CTR
|
|
41
|
+
|
|
42
|
+
主要优点:
|
|
43
|
+
- 建模兴趣随时间的演化,而非静态聚合
|
|
44
|
+
- 目标感知注意力将兴趣演化对齐到候选物品
|
|
45
|
+
- AUGRU 门控削弱无关行为的干扰
|
|
46
|
+
- 辅助损失为序列动态提供额外监督信号
|
|
9
47
|
"""
|
|
10
48
|
|
|
11
49
|
import torch
|
|
@@ -13,37 +51,44 @@ import torch.nn as nn
|
|
|
13
51
|
import torch.nn.functional as F
|
|
14
52
|
|
|
15
53
|
from nextrec.basic.model import BaseModel
|
|
16
|
-
from nextrec.basic.layers import
|
|
54
|
+
from nextrec.basic.layers import (
|
|
55
|
+
EmbeddingLayer,
|
|
56
|
+
MLP,
|
|
57
|
+
AttentionPoolingLayer,
|
|
58
|
+
PredictionLayer,
|
|
59
|
+
)
|
|
17
60
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
18
61
|
|
|
62
|
+
|
|
19
63
|
class AUGRU(nn.Module):
    """Attention-aware GRU (AUGRU) used in DIEN (Zhou et al., 2019).

    A step-by-step GRU whose update gate is multiplied by a per-step
    attention score before the hidden state is blended, so steps with a low
    score leave the hidden state almost unchanged.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        """
        Args:
            input_size: size of the last dimension of the input sequence.
            hidden_size: size of the hidden state.
            bias: when False, both bias terms are registered as None.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Rows are stacked as [reset | update | candidate], matching the
        # chunk(3) split in forward(). The randn values are placeholders:
        # reset_parameters() overwrites them right below.
        self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
        if bias:
            self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
            self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
        else:
            self.register_parameter("bias_ih", None)
            self.register_parameter("bias_hh", None)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        bound = 1.0 / (self.hidden_size) ** 0.5
        # In-place init under no_grad instead of going through `.data`.
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-bound, bound)

    def forward(self, x, att_scores):
        """
        Args:
            x: [batch_size, seq_len, input_size]
            att_scores: [batch_size, seq_len, 1]; a [batch_size, seq_len]
                tensor is also accepted and gets a trailing axis added.
        Returns:
            output: [batch_size, seq_len, hidden_size] - hidden state after each step
            hidden: [batch_size, hidden_size] - final hidden state
        """
        batch_size, seq_len, _ = x.shape
        if att_scores.dim() == 2:
            att_scores = att_scores.unsqueeze(-1)

        # Follow x's dtype/device so the module also works after .double()/.half().
        h = x.new_zeros(batch_size, self.hidden_size)
        if seq_len == 0:
            # Nothing to unroll; stacking an empty list would raise.
            return x.new_zeros(batch_size, 0, self.hidden_size), h

        # The input projection does not depend on h: compute it once for all steps.
        gi_all = F.linear(x, self.weight_ih, self.bias_ih)  # [B, L, 3H]

        outputs = []
        for t in range(seq_len):
            att_t = att_scores[:, t, :]  # [B, 1]

            i_r, i_z, i_n = gi_all[:, t, :].chunk(3, dim=1)
            h_r, h_z, h_n = F.linear(h, self.weight_hh, self.bias_hh).chunk(3, dim=1)

            resetgate = torch.sigmoid(i_r + h_r)  # r_t
            updategate = torch.sigmoid(i_z + h_z)  # z_t
            newgate = torch.tanh(i_n + resetgate * h_n)  # n_t

            # att_t [B, 1] broadcasts over the hidden dimension.
            z_att = updategate * att_t

            # h_t = (1 - z'_t) * h_{t-1} + z'_t * n_t
            h = (1.0 - z_att) * h + z_att * newgate
            outputs.append(h)

        output = torch.stack(outputs, dim=1)  # [B, L, H]
        return output, h
|
|
77
127
|
|
|
78
128
|
|
|
79
129
|
class DynamicGRU(nn.Module):
    """Plain GRU unrolled step by step, used by DIEN (Zhou et al., 2019).

    The hidden state is blended as ``h_t = (1 - z_t) * h_{t-1} + z_t * n_t``.
    Note this is the complement of the convention documented for
    ``torch.nn.GRU`` (``h_t = (1 - z_t) * n_t + z_t * h_{t-1}``).
    """

    def __init__(self, input_size, hidden_size, bias=True):
        """
        Args:
            input_size: size of the last dimension of the input sequence.
            hidden_size: size of the hidden state.
            bias: when False, both bias terms are registered as None.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # GRU parameters, rows stacked as [reset | update | candidate] to match
        # the chunk(3) split in forward(). The randn values are placeholders:
        # reset_parameters() overwrites them right below.
        self.weight_ih = nn.Parameter(torch.randn(3 * hidden_size, input_size))
        self.weight_hh = nn.Parameter(torch.randn(3 * hidden_size, hidden_size))
        if bias:
            self.bias_ih = nn.Parameter(torch.randn(3 * hidden_size))
            self.bias_hh = nn.Parameter(torch.randn(3 * hidden_size))
        else:
            self.register_parameter("bias_ih", None)
            self.register_parameter("bias_hh", None)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        bound = 1.0 / (self.hidden_size) ** 0.5
        # In-place init under no_grad instead of going through `.data`.
        with torch.no_grad():
            for param in self.parameters():
                param.uniform_(-bound, bound)

    def forward(self, x):
        """
        Args:
            x: [batch_size, seq_len, input_size]
        Returns:
            output: [batch_size, seq_len, hidden_size]
            hidden: [batch_size, hidden_size] - final hidden state
        """
        batch_size, seq_len, _ = x.shape

        # Initial hidden state follows x's dtype/device so the module also
        # works after .double()/.half().
        h = x.new_zeros(batch_size, self.hidden_size)
        if seq_len == 0:
            # Nothing to unroll; stacking an empty list would raise.
            return x.new_zeros(batch_size, 0, self.hidden_size), h

        # The input projection does not depend on h: compute it once for all steps.
        gi_all = F.linear(x, self.weight_ih, self.bias_ih)  # [B, L, 3H]

        outputs = []
        for t in range(seq_len):
            i_r, i_z, i_n = gi_all[:, t, :].chunk(3, dim=1)
            h_r, h_z, h_n = F.linear(h, self.weight_hh, self.bias_hh).chunk(3, dim=1)

            resetgate = torch.sigmoid(i_r + h_r)
            updategate = torch.sigmoid(i_z + h_z)
            newgate = torch.tanh(i_n + resetgate * h_n)

            # h_t = (1 - z_t) * h_{t-1} + z_t * n_t
            h = (1.0 - updategate) * h + updategate * newgate
            outputs.append(h)

        output = torch.stack(outputs, dim=1)  # [batch_size, seq_len, hidden_size]
        return output, h
|
|
141
192
|
|
|
142
193
|
|
|
@@ -148,29 +199,44 @@ class DIEN(BaseModel):
|
|
|
148
199
|
@property
def default_task(self):
    """Return the default task name for this model, which is ``"binary"``."""
    task_name = "binary"
    return task_name
|
|
151
|
-
|
|
152
|
-
def __init__(
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
202
|
+
|
|
203
|
+
def __init__(
|
|
204
|
+
self,
|
|
205
|
+
dense_features: list[DenseFeature] | None = None,
|
|
206
|
+
sparse_features: list[SparseFeature] | None = None,
|
|
207
|
+
sequence_features: list[SequenceFeature] | None = None,
|
|
208
|
+
behavior_feature_name: str | None = None,
|
|
209
|
+
candidate_feature_name: str | None = None,
|
|
210
|
+
neg_behavior_feature_name: str | None = None,
|
|
211
|
+
mlp_params: dict | None = None,
|
|
212
|
+
gru_hidden_size: int = 64,
|
|
213
|
+
attention_hidden_units: list[int] | None = None,
|
|
214
|
+
attention_activation: str = "sigmoid",
|
|
215
|
+
use_negsampling: bool = False,
|
|
216
|
+
aux_loss_weight: float = 1.0,
|
|
217
|
+
target: list[str] | str | None = None,
|
|
218
|
+
task: str | list[str] | None = None,
|
|
219
|
+
optimizer: str = "adam",
|
|
220
|
+
optimizer_params: dict | None = None,
|
|
221
|
+
loss: str | nn.Module | None = "bce",
|
|
222
|
+
loss_params: dict | list[dict] | None = None,
|
|
223
|
+
device: str = "cpu",
|
|
224
|
+
embedding_l1_reg=1e-6,
|
|
225
|
+
dense_l1_reg=1e-5,
|
|
226
|
+
embedding_l2_reg=1e-5,
|
|
227
|
+
dense_l2_reg=1e-4,
|
|
228
|
+
**kwargs,
|
|
229
|
+
):
|
|
230
|
+
|
|
231
|
+
dense_features = dense_features or []
|
|
232
|
+
sparse_features = sparse_features or []
|
|
233
|
+
sequence_features = sequence_features or []
|
|
234
|
+
mlp_params = mlp_params or {}
|
|
235
|
+
attention_hidden_units = attention_hidden_units or [80, 40]
|
|
236
|
+
optimizer_params = optimizer_params or {}
|
|
237
|
+
if loss is None:
|
|
238
|
+
loss = "bce"
|
|
239
|
+
|
|
174
240
|
super(DIEN, self).__init__(
|
|
175
241
|
dense_features=dense_features,
|
|
176
242
|
sparse_features=sparse_features,
|
|
@@ -182,138 +248,261 @@ class DIEN(BaseModel):
|
|
|
182
248
|
dense_l1_reg=dense_l1_reg,
|
|
183
249
|
embedding_l2_reg=embedding_l2_reg,
|
|
184
250
|
dense_l2_reg=dense_l2_reg,
|
|
185
|
-
**kwargs
|
|
251
|
+
**kwargs,
|
|
186
252
|
)
|
|
187
253
|
|
|
188
254
|
self.loss = loss
|
|
189
|
-
if self.loss is None:
|
|
190
|
-
self.loss = "bce"
|
|
191
|
-
|
|
192
255
|
self.use_negsampling = use_negsampling
|
|
193
|
-
|
|
194
|
-
|
|
256
|
+
self.aux_loss_weight = float(aux_loss_weight)
|
|
257
|
+
self.auxiliary_cache = None
|
|
258
|
+
|
|
195
259
|
if len(sequence_features) == 0:
|
|
196
|
-
raise ValueError(
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
260
|
+
raise ValueError(
|
|
261
|
+
"DIEN requires at least one sequence feature for user behavior history"
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
if behavior_feature_name is None:
|
|
265
|
+
raise ValueError(
|
|
266
|
+
"DIEN requires at least one sequence feature as behavior item feature"
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
if candidate_feature_name is None:
|
|
270
|
+
raise ValueError(
|
|
271
|
+
"DIEN requires at least one sparse_feature as candidate item feature"
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
self.behavior_feature = [
|
|
275
|
+
f for f in sequence_features if f.name == behavior_feature_name
|
|
276
|
+
][0]
|
|
277
|
+
self.candidate_feature = [
|
|
278
|
+
f for f in sparse_features if f.name == candidate_feature_name
|
|
279
|
+
][0]
|
|
280
|
+
|
|
281
|
+
self.other_sparse_features = (
|
|
282
|
+
sparse_features[:-1] if self.candidate_feature else sparse_features
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
self.neg_behavior_feature = None
|
|
203
286
|
|
|
204
287
|
# Embedding layer
|
|
205
288
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
206
|
-
|
|
289
|
+
|
|
207
290
|
behavior_emb_dim = self.behavior_feature.embedding_dim
|
|
291
|
+
|
|
292
|
+
# projection candidate feature to match GRU hidden size if needed
|
|
208
293
|
self.candidate_proj = None
|
|
209
|
-
if
|
|
210
|
-
self.
|
|
211
|
-
|
|
212
|
-
|
|
294
|
+
if (
|
|
295
|
+
self.candidate_feature is not None
|
|
296
|
+
and self.candidate_feature.embedding_dim != gru_hidden_size
|
|
297
|
+
):
|
|
298
|
+
self.candidate_proj = nn.Linear(
|
|
299
|
+
self.candidate_feature.embedding_dim, gru_hidden_size
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
# gru for interest extraction
|
|
213
303
|
self.interest_extractor = DynamicGRU(
|
|
214
|
-
input_size=behavior_emb_dim,
|
|
215
|
-
hidden_size=gru_hidden_size
|
|
304
|
+
input_size=behavior_emb_dim, hidden_size=gru_hidden_size
|
|
216
305
|
)
|
|
217
|
-
|
|
218
|
-
# Attention layer for computing attention scores
|
|
306
|
+
|
|
219
307
|
self.attention_layer = AttentionPoolingLayer(
|
|
220
308
|
embedding_dim=gru_hidden_size,
|
|
221
309
|
hidden_units=attention_hidden_units,
|
|
222
310
|
activation=attention_activation,
|
|
223
|
-
use_softmax=False
|
|
311
|
+
use_softmax=False,
|
|
224
312
|
)
|
|
225
|
-
|
|
313
|
+
|
|
226
314
|
# Interest Evolution Layer (AUGRU)
|
|
227
|
-
self.interest_evolution = AUGRU(
|
|
228
|
-
|
|
229
|
-
|
|
315
|
+
self.interest_evolution = AUGRU(
|
|
316
|
+
input_size=gru_hidden_size, hidden_size=gru_hidden_size
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
# build auxiliary loss net if provided neg sampling and neg_behavior_feature_name
|
|
320
|
+
# auxiliary loss uses the interest states to predict the next behavior in the sequence
|
|
321
|
+
# that's the second task of DIEN
|
|
322
|
+
if self.use_negsampling:
|
|
323
|
+
neg_candidates = [
|
|
324
|
+
f for f in sequence_features if f.name == neg_behavior_feature_name
|
|
325
|
+
]
|
|
326
|
+
if len(neg_candidates) == 0:
|
|
327
|
+
raise ValueError(
|
|
328
|
+
f"use_negsampling=True requires a negative sequence feature named '{neg_behavior_feature_name}'"
|
|
329
|
+
)
|
|
330
|
+
self.neg_behavior_feature = neg_candidates[0]
|
|
331
|
+
self.auxiliary_net = nn.Sequential(
|
|
332
|
+
nn.Linear(gru_hidden_size + behavior_emb_dim, gru_hidden_size),
|
|
333
|
+
nn.PReLU(),
|
|
334
|
+
nn.Linear(gru_hidden_size, 1),
|
|
335
|
+
)
|
|
336
|
+
else:
|
|
337
|
+
self.auxiliary_net = None
|
|
338
|
+
|
|
230
339
|
mlp_input_dim = 0
|
|
231
340
|
if self.candidate_feature:
|
|
232
341
|
mlp_input_dim += self.candidate_feature.embedding_dim
|
|
233
342
|
mlp_input_dim += gru_hidden_size # final interest state
|
|
234
343
|
mlp_input_dim += sum([f.embedding_dim for f in self.other_sparse_features])
|
|
235
|
-
mlp_input_dim += sum(
|
|
236
|
-
|
|
344
|
+
mlp_input_dim += sum(
|
|
345
|
+
[getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
|
|
346
|
+
)
|
|
347
|
+
|
|
237
348
|
self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
|
|
238
349
|
self.prediction_layer = PredictionLayer(task_type=self.task)
|
|
239
|
-
|
|
240
|
-
self.register_regularization_weights(
|
|
241
|
-
|
|
350
|
+
|
|
351
|
+
self.register_regularization_weights(
|
|
352
|
+
embedding_attr="embedding",
|
|
353
|
+
include_modules=[
|
|
354
|
+
"interest_extractor",
|
|
355
|
+
"interest_evolution",
|
|
356
|
+
"attention_layer",
|
|
357
|
+
"mlp",
|
|
358
|
+
"candidate_proj",
|
|
359
|
+
"auxiliary_net",
|
|
360
|
+
],
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
self.compile(
|
|
364
|
+
optimizer=optimizer,
|
|
365
|
+
optimizer_params=optimizer_params,
|
|
366
|
+
loss=loss,
|
|
367
|
+
loss_params=loss_params,
|
|
368
|
+
)
|
|
242
369
|
|
|
243
370
|
def forward(self, x):
    """Run the DIEN forward pass.

    Steps: embed the candidate item and the behavior sequence, extract
    interest states with ``self.interest_extractor``, score every step
    against the candidate, evolve the interests with
    ``self.interest_evolution`` and feed the result through the MLP.

    Args:
        x: mapping indexed by feature name (``x[feature.name]``) that yields
            one tensor per feature.
    Returns:
        The output of ``self.prediction_layer`` applied to the MLP output.
    Raises:
        ValueError: if no candidate feature is configured, or if negative
            sampling is enabled in training mode without a negative
            behavior feature.
    """
    # Drop auxiliary tensors cached by a previous forward pass.
    self.auxiliary_cache = None

    if not self.candidate_feature:
        raise ValueError("DIEN requires a candidate item feature")
    candidate_emb = self.embedding.embed_dict[self.candidate_feature.embedding_name](
        x[self.candidate_feature.name].long()
    )  # expected [B, emb_dim]

    behavior_seq = x[self.behavior_feature.name].long()  # [B, seq_len]
    behavior_emb = self.embedding.embed_dict[self.behavior_feature.embedding_name](
        behavior_seq
    )  # [B, seq_len, emb_dim]

    # Padding positions are those equal to padding_idx (0 when none is configured).
    padding_idx = self.behavior_feature.padding_idx
    if padding_idx is None:
        padding_idx = 0
    valid = behavior_seq != padding_idx  # [B, seq_len], bool
    mask = valid.unsqueeze(-1).float()  # [B, seq_len, 1]

    interest_states, _ = self.interest_extractor(
        behavior_emb
    )  # [B, seq_len, hidden_size]
    batch_size, seq_len, _ = interest_states.shape

    # Project the candidate to the GRU hidden size when a projection exists.
    if self.candidate_proj is not None:
        candidate_for_attention = self.candidate_proj(candidate_emb)
    else:
        candidate_for_attention = candidate_emb

    step_scores = []
    for t in range(seq_len):
        state_t = interest_states[:, t, :]
        # [B, 4H]
        concat_feat = torch.cat(
            [
                candidate_for_attention,
                state_t,
                candidate_for_attention - state_t,
                candidate_for_attention * state_t,
            ],
            dim=-1,
        )
        score_t = self.attention_layer.attention_net(concat_feat)
        # Pin every step to [B, 1] so the concatenation below is always
        # [B, seq_len]. The previous cat + squeeze(-1) dropped the sequence
        # axis whenever seq_len == 1.
        step_scores.append(score_t.reshape(batch_size, 1))

    scores_flat = torch.cat(step_scores, dim=1)  # [B, seq_len]

    # Masked softmax over the sequence axis; padded steps get weight 0.
    scores_flat = scores_flat.masked_fill(~valid, -1e9)
    att_weights = torch.softmax(scores_flat, dim=1).unsqueeze(-1)  # [B, seq_len, 1]
    att_weights = att_weights * mask

    # Interest evolution (AUGRU)
    final_states, final_interest = self.interest_evolution(
        interest_states, att_weights
    )  # final_interest: [B, hidden_size]

    if self.use_negsampling and self.training:
        if self.neg_behavior_feature is None:
            raise ValueError(
                "Negative behavior feature is not configured while use_negsampling=True"
            )
        neg_seq = x[self.neg_behavior_feature.name].long()
        neg_behavior_emb = self.embedding.embed_dict[
            self.neg_behavior_feature.embedding_name
        ](neg_seq)
        # Kept for compute_auxiliary_loss(), which reads these four keys.
        self.auxiliary_cache = {
            "interest_states": interest_states,
            "behavior_emb": behavior_emb,
            "neg_behavior_emb": neg_behavior_emb,
            "mask": mask,
        }

    # MLP input: candidate, evolved interest, other sparse embeddings, dense values.
    other_embeddings = [candidate_emb, final_interest]

    for feat in self.other_sparse_features:
        feat_emb = self.embedding.embed_dict[feat.embedding_name](
            x[feat.name].long()
        )
        other_embeddings.append(feat_emb)

    for feat in self.dense_features:
        val = x[feat.name].float()
        if val.dim() == 1:
            val = val.unsqueeze(1)
        other_embeddings.append(val)

    concat_input = torch.cat(other_embeddings, dim=-1)  # [B, total_dim]

    y = self.mlp(concat_input)  # [B, 1]
    return self.prediction_layer(y)
|
|
470
|
+
|
|
471
|
+
def compute_auxiliary_loss(self):
    """Compute DIEN's auxiliary next-behavior loss from the cached forward tensors.

    The interest state at step t is paired with the positive and the negative
    behavior embedding at step t + 1; ``self.auxiliary_net`` scores both pairs
    and binary cross-entropy pushes positives towards 1 and negatives
    towards 0. The loss is averaged over the positions whose step t + 1 is
    not padding.

    Returns:
        A scalar tensor. It is a constant 0.0 on ``self.device`` when the
        model is not training, negative sampling is off, no auxiliary net or
        cache exists, or every target position is masked out.
    Raises:
        ValueError: if the cached negative and positive behavior embeddings
            differ in shape.
    """
    # Explicit None check: truthiness of an nn.Sequential goes through __len__.
    enabled = (
        self.training and self.use_negsampling and self.auxiliary_net is not None
    )
    if not enabled or self.auxiliary_cache is None:
        return torch.tensor(0.0, device=self.device)

    cache = self.auxiliary_cache
    behavior_emb = cache["behavior_emb"]
    neg_behavior_emb = cache["neg_behavior_emb"]
    if neg_behavior_emb.shape != behavior_emb.shape:
        # Both are concatenated with the same states and fed to the same net,
        # so any mismatch would otherwise fail later with an opaque torch error.
        raise ValueError(
            "Negative behavior embeddings must match behavior embeddings in shape, "
            f"got {tuple(neg_behavior_emb.shape)} vs {tuple(behavior_emb.shape)}"
        )

    # State at step t predicts the behavior at step t + 1.
    interest_states = cache["interest_states"][:, :-1, :]
    pos_seq = behavior_emb[:, 1:, :]
    neg_seq = neg_behavior_emb[:, 1:, :]
    aux_mask = cache["mask"][:, 1:, :].squeeze(-1)

    if aux_mask.sum() == 0:
        return torch.tensor(0.0, device=self.device)

    pos_logits = self.auxiliary_net(
        torch.cat([interest_states, pos_seq], dim=-1)
    ).squeeze(-1)
    neg_logits = self.auxiliary_net(
        torch.cat([interest_states, neg_seq], dim=-1)
    ).squeeze(-1)

    pos_loss = F.binary_cross_entropy_with_logits(
        pos_logits, torch.ones_like(pos_logits), reduction="none"
    )
    neg_loss = F.binary_cross_entropy_with_logits(
        neg_logits, torch.zeros_like(neg_logits), reduction="none"
    )

    masked_loss = (pos_loss + neg_loss) * aux_mask
    return masked_loss.sum() / torch.clamp(aux_mask.sum(), min=1.0)
|
|
504
|
+
|
|
505
|
+
def compute_loss(self, y_pred, y_true):
    """Return the parent class's loss plus the weighted auxiliary loss.

    Args:
        y_pred: predictions, passed through to the parent ``compute_loss``.
        y_true: labels, passed through to the parent ``compute_loss``.
    Returns:
        ``main_loss + self.aux_loss_weight * auxiliary_loss``.
    """
    total = super().compute_loss(y_pred, y_true)
    total = total + self.aux_loss_weight * self.compute_auxiliary_loss()
    return total
|