nextrec 0.3.6-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +244 -113
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1373 -443
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +42 -24
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +303 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +106 -40
- nextrec/models/match/dssm.py +82 -69
- nextrec/models/match/dssm_v2.py +72 -58
- nextrec/models/match/mind.py +175 -108
- nextrec/models/match/sdm.py +104 -88
- nextrec/models/match/youtube_dnn.py +73 -60
- nextrec/models/multi_task/esmm.py +53 -39
- nextrec/models/multi_task/mmoe.py +70 -47
- nextrec/models/multi_task/ple.py +107 -50
- nextrec/models/multi_task/poso.py +121 -41
- nextrec/models/multi_task/share_bottom.py +54 -38
- nextrec/models/ranking/afm.py +172 -45
- nextrec/models/ranking/autoint.py +84 -61
- nextrec/models/ranking/dcn.py +59 -42
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +36 -26
- nextrec/models/ranking/dien.py +158 -102
- nextrec/models/ranking/din.py +88 -60
- nextrec/models/ranking/fibinet.py +55 -35
- nextrec/models/ranking/fm.py +32 -26
- nextrec/models/ranking/masknet.py +95 -34
- nextrec/models/ranking/pnn.py +34 -31
- nextrec/models/ranking/widedeep.py +37 -29
- nextrec/models/ranking/xdeepfm.py +63 -41
- nextrec/utils/__init__.py +61 -32
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +52 -12
- nextrec/utils/distributed.py +141 -0
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +531 -0
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.3.6.dist-info/RECORD +0 -64
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/afm.py
CHANGED

```diff
@@ -1,17 +1,46 @@
 """
 Date: create on 09/11/2025
+Checkpoint: edit on 06/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
 Reference:
+[1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
+feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
+
+Attentional Factorization Machine (AFM) builds on FM by learning an importance
+weight for every second-order interaction instead of treating all pairs equally.
+It retains FM's linear (first-order) component for sparsity-friendly modeling,
+while using an attention network to reweight the element-wise product of field
+embeddings before aggregation.
+
+In each forward pass:
+(1) Embed each field and compute pairwise element-wise products v_i ⊙ v_j
+(2) Pass interactions through an attention MLP (ReLU + projection) to score them
+(3) Softmax-normalize scores to obtain interaction weights
+(4) Weighted sum of interactions -> linear projection -> add FM first-order term
+
+Key Advantages:
+- Learns which feature pairs contribute most via attention weights
+- Keeps FM efficiency and interpretability by preserving first-order terms
+- Softmax-normalized reweighting reduces noise from uninformative interactions
+
+AFM adds attention on top of FM's second-order interactions, learning an
+importance weight for every feature pair while keeping FM's first-order term,
+so it stays sparsity-friendly and interpretable. The workflow:
+(1) Embed each field and compute the element-wise product v_i ⊙ v_j of every feature pair
+(2) Score each interaction with an attention MLP (ReLU + linear projection)
+(3) Softmax-normalize the scores into interaction weights
+(4) Sum the weighted interactions, project linearly, and add the first-order term
+
+Key advantages:
+- Attention makes explicit which feature pairs matter most
+- Retains FM's efficiency and interpretability
+- Softmax normalization dampens the influence of noisy interactions
 """
 
 import torch
 import torch.nn as nn
 
 from nextrec.basic.model import BaseModel
-from nextrec.basic.layers import EmbeddingLayer,
+from nextrec.basic.layers import EmbeddingLayer, PredictionLayer, InputMask
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
 
@@ -21,69 +50,113 @@ class AFM(BaseModel):
         return "AFM"
 
     @property
-    def
+    def default_task(self):
         return "binary"
-
-    def __init__(
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | list = [],
+        sparse_features: list[SparseFeature] | list = [],
+        sequence_features: list[SequenceFeature] | list = [],
+        attention_dim: int = 32,
+        attention_dropout: float = 0.0,
+        target: list[str] | list = [],
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict = {},
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
         super(AFM, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
         )
 
+        if target is None:
+            target = []
+        if optimizer_params is None:
+            optimizer_params = {}
+        if loss is None:
+            loss = "bce"
+
         self.fm_features = sparse_features + sequence_features
         if len(self.fm_features) < 2:
-            raise ValueError(
+            raise ValueError(
+                "AFM requires at least two sparse/sequence features to build pairwise interactions."
+            )
 
-        #
+        # make sure all embedding dimensions are the same for FM features
        self.embedding_dim = self.fm_features[0].embedding_dim
         if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
-            raise ValueError(
+            raise ValueError(
+                "All FM features must share the same embedding_dim for AFM."
+            )
+
+        self.embedding = EmbeddingLayer(
+            features=self.fm_features
+        )  # [Batch, Field, Dim]
+
+        # First-order terms: dense linear + one-hot embeddings
+        self.dense_features = list(dense_features)
+        dense_input_dim = sum([f.input_dim for f in self.dense_features])
+        self.linear_dense = (
+            nn.Linear(dense_input_dim, 1, bias=True) if dense_input_dim > 0 else None
+        )
 
+        # First-order term: sparse/sequence features one-hot
+        # **INFO**: the source paper does not use sequence features in its experiments,
+        # but we implement them here for completeness. If you want to follow the paper
+        # strictly, remove sequence features from fm_features.
+        self.first_order_embeddings = nn.ModuleDict()
+        for feature in self.fm_features:
+            if (
+                feature.embedding_name in self.first_order_embeddings
+            ):  # shared embedding
+                continue
+            emb = nn.Embedding(
+                num_embeddings=feature.vocab_size,
+                embedding_dim=1,
+                padding_idx=feature.padding_idx,
+            )  # equal to one-hot encoding weight
+            # nn.init.zeros_(emb.weight)
+            self.first_order_embeddings[feature.embedding_name] = emb
 
         self.attention_linear = nn.Linear(self.embedding_dim, attention_dim)
         self.attention_p = nn.Linear(attention_dim, 1, bias=False)
         self.attention_dropout = nn.Dropout(attention_dropout)
         self.output_projection = nn.Linear(self.embedding_dim, 1, bias=False)
-        self.prediction_layer = PredictionLayer(task_type=self.
+        self.prediction_layer = PredictionLayer(task_type=self.default_task)
+        self.input_mask = InputMask()
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=[
+                "linear_dense",
+                "attention_linear",
+                "attention_p",
+                "output_projection",
+            ],
+        )
+        # add first-order embeddings to the embedding regularization list
+        self.embedding_params.extend(
+            emb.weight for emb in self.first_order_embeddings.values()
         )
 
         self.compile(
@@ -94,11 +167,65 @@ class AFM(BaseModel):
         )
 
     def forward(self, x):
-        field_emb = self.embedding(
+        field_emb = self.embedding(
+            x=x, features=self.fm_features, squeeze_dim=False
+        )  # [B, F, D]
+        batch_size = field_emb.size(0)
+        y_linear = torch.zeros(batch_size, 1, device=field_emb.device)
+
+        # First-order dense part
+        if self.linear_dense is not None:
+            dense_inputs = [
+                x[f.name].float().view(batch_size, -1) for f in self.dense_features
+            ]
+            dense_stack = torch.cat(dense_inputs, dim=1) if dense_inputs else None
+            if dense_stack is not None:
+                y_linear = y_linear + self.linear_dense(dense_stack)
+
+        # First-order sparse/sequence part
+        first_order_terms = []
+        for feature in self.fm_features:
+            emb = self.first_order_embeddings[feature.embedding_name]
+            if isinstance(feature, SparseFeature):
+                term = emb(x[feature.name].long())  # [B, 1]
+            else:  # SequenceFeature
+                seq_input = x[feature.name].long()  # [B, L]
+                if feature.max_len is not None and seq_input.size(1) > feature.max_len:
+                    seq_input = seq_input[:, -feature.max_len :]
+                mask = self.input_mask(x, feature, seq_input).squeeze(1)  # [B, L]
+                seq_weight = emb(seq_input).squeeze(-1)  # [B, L]
+                term = (seq_weight * mask).sum(dim=1, keepdim=True)  # [B, 1]
+            first_order_terms.append(term)
+        if first_order_terms:
+            y_linear = y_linear + torch.sum(
+                torch.cat(first_order_terms, dim=1), dim=1, keepdim=True
+            )
 
         interactions = []
+        feature_values = []
+        for feature in self.fm_features:
+            value = x.get(f"{feature.name}_value")
+            if value is not None:
+                value = value.float()
+                if value.dim() == 1:
+                    value = value.unsqueeze(-1)
+            else:
+                if isinstance(feature, SequenceFeature):
+                    seq_input = x[feature.name].long()
+                    if (
+                        feature.max_len is not None
+                        and seq_input.size(1) > feature.max_len
+                    ):
+                        seq_input = seq_input[:, -feature.max_len :]
+                    value = self.input_mask(x, feature, seq_input).sum(dim=2)  # [B, 1]
+                else:
+                    value = torch.ones(batch_size, 1, device=field_emb.device)
+            feature_values.append(value)
+        feature_values_tensor = torch.cat(feature_values, dim=1).unsqueeze(
+            -1
+        )  # [B, F, 1]
+        field_emb = field_emb * feature_values_tensor
+
         num_fields = field_emb.shape[1]
         for i in range(num_fields - 1):
             vi = field_emb[:, i, :]
@@ -107,7 +234,7 @@ class AFM(BaseModel):
             interactions.append(vi * vj)
 
         pair_tensor = torch.stack(interactions, dim=1)  # [B, num_pairs, D]
-        attention_scores = torch.
+        attention_scores = torch.relu(self.attention_linear(pair_tensor))
         attention_scores = self.attention_p(attention_scores)  # [B, num_pairs, 1]
         attention_weights = torch.softmax(attention_scores, dim=1)
 
```
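Taken together, the new second-order path is: pairwise element-wise products → attention MLP (ReLU + projection) → softmax over pairs → weighted sum → output projection. A minimal standalone sketch of that path, with fresh `nn.Linear` modules standing in for the model's `attention_linear`, `attention_p`, and `output_projection`; the `triu_indices` vectorization replaces the explicit `i < j` loop above, and the dropout placement is an assumption, since the diff cuts off before the weighted sum:

```python
import torch
import torch.nn as nn

def afm_pairwise_attention(
    field_emb: torch.Tensor,        # [B, F, D], one embedding row per field
    attention_linear: nn.Linear,    # D -> attention_dim
    attention_p: nn.Linear,         # attention_dim -> 1
    attention_dropout: nn.Dropout,
    output_projection: nn.Linear,   # D -> 1
) -> torch.Tensor:
    """Attention-weighted sum of all pairwise products (the AFM second-order term)."""
    B, F, D = field_emb.shape
    rows, cols = torch.triu_indices(F, F, offset=1)              # all i < j index pairs
    pair_tensor = field_emb[:, rows, :] * field_emb[:, cols, :]  # [B, P, D], P = F*(F-1)/2
    scores = attention_p(torch.relu(attention_linear(pair_tensor)))  # [B, P, 1]
    weights = torch.softmax(scores, dim=1)                       # normalize over pairs
    # Dropout placement is an assumption; the tail of forward() is not shown in the diff.
    pooled = attention_dropout((weights * pair_tensor).sum(dim=1))   # [B, D]
    return output_projection(pooled)                             # [B, 1] second-order logit

# Shape check: 4 fields, 8-dim embeddings, attention_dim=32
emb = torch.randn(2, 4, 8)
logit = afm_pairwise_attention(
    emb, nn.Linear(8, 32), nn.Linear(32, 1, bias=False),
    nn.Dropout(0.0), nn.Linear(8, 1, bias=False),
)
print(logit.shape)  # torch.Size([2, 1])
```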
nextrec/models/ranking/autoint.py
CHANGED

```diff
@@ -3,8 +3,8 @@ Date: create on 09/11/2025
 Checkpoint: edit on 24/11/2025
 Author: Yang Zhou, zyaztec@gmail.com
 Reference:
-[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
-self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
 on information and knowledge management. 2019: 1161-1170.
 (https://arxiv.org/abs/1810.11921)
 
@@ -68,91 +68,106 @@ class AutoInt(BaseModel):
         return "AutoInt"
 
     @property
-    def
+    def default_task(self):
         return "binary"
-
-    def __init__(
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        att_layer_num: int = 3,
+        att_embedding_dim: int = 8,
+        att_head_num: int = 2,
+        att_dropout: float = 0.0,
+        att_use_residual: bool = True,
+        target: list[str] | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
         super(AutoInt, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
         )
 
+        if target is None:
+            target = []
+        if optimizer_params is None:
+            optimizer_params = {}
+        if loss is None:
+            loss = "bce"
+
         self.att_layer_num = att_layer_num
         self.att_embedding_dim = att_embedding_dim
+
         # Use sparse and sequence features for interaction
+        # **INFO**: this differs from the original paper: we also include dense features.
+        # If you want to follow the paper strictly, set dense_features=[]
+        # or modify the code accordingly.
+        self.interaction_features = dense_features + sparse_features + sequence_features
+
         # All features for embedding
         self.all_features = dense_features + sparse_features + sequence_features
 
         # Embedding layer
         self.embedding = EmbeddingLayer(features=self.all_features)
+
         # Project embeddings to attention embedding dimension
         num_fields = len(self.interaction_features)
+
         # If embeddings have different dimensions, project them to att_embedding_dim
-        self.need_projection = not all(
+        self.need_projection = not all(
+            f.embedding_dim == att_embedding_dim for f in self.interaction_features
+        )
         self.projection_layers = None
         if self.need_projection:
-            self.projection_layers = nn.ModuleList(
+            self.projection_layers = nn.ModuleList(
+                [
+                    nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
+                    for f in self.interaction_features
+                ]
+            )
+
         # Multi-head self-attention layers
-        self.attention_layers = nn.ModuleList(
+        self.attention_layers = nn.ModuleList(
+            [
+                MultiHeadSelfAttention(
+                    embedding_dim=att_embedding_dim,
+                    num_heads=att_head_num,
+                    dropout=att_dropout,
+                    use_residual=att_use_residual,
+                )
+                for _ in range(att_layer_num)
+            ]
+        )
+
         # Final prediction layer
         self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
-        self.prediction_layer = PredictionLayer(task_type=self.
+        self.prediction_layer = PredictionLayer(task_type=self.default_task)
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=["projection_layers", "attention_layers", "fc"],
         )
 
         self.compile(
@@ -166,21 +181,29 @@ class AutoInt(BaseModel):
         # Get embeddings field-by-field so mixed dimensions can be projected safely
         field_embeddings = []
         if len(self.interaction_features) == 0:
-            raise ValueError(
+            raise ValueError(
+                "AutoInt requires at least one sparse or sequence feature for interactions."
+            )
         for idx, feature in enumerate(self.interaction_features):
             feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
             feature_emb = feature_emb.squeeze(1)  # [B, embedding_dim]
             if self.need_projection and self.projection_layers is not None:
                 feature_emb = self.projection_layers[idx](feature_emb)
-            field_embeddings.append(
+            field_embeddings.append(
+                feature_emb.unsqueeze(1)
+            )  # [B, 1, att_embedding_dim or original_dim]
         embeddings = torch.cat(field_embeddings, dim=1)
 
         # Apply multi-head self-attention layers
         attention_output = embeddings
         for att_layer in self.attention_layers:
-            attention_output = att_layer(
+            attention_output = att_layer(
+                attention_output
+            )  # [B, num_fields, att_embedding_dim]
+
         # Flatten and predict
-        attention_output_flat = attention_output.flatten(
+        attention_output_flat = attention_output.flatten(
+            start_dim=1
+        )  # [B, num_fields * att_embedding_dim]
         y = self.fc(attention_output_flat)  # [B, 1]
         return self.prediction_layer(y)
```
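The constructor stacks att_layer_num copies of MultiHeadSelfAttention from nextrec.basic.layers, whose implementation is not shown in this diff. A hedged stand-in built on torch's nn.MultiheadAttention illustrates the interaction stack being wired up; the residual-then-ReLU placement follows the AutoInt paper and may differ from the library's own layer:

```python
import torch
import torch.nn as nn

class FieldSelfAttention(nn.Module):
    """One AutoInt-style interaction layer: self-attention across feature fields.

    Stand-in sketch for nextrec's MultiHeadSelfAttention (not shown in the diff);
    residual connection + ReLU placement is assumed from the AutoInt paper.
    """

    def __init__(self, embedding_dim: int, num_heads: int,
                 dropout: float = 0.0, use_residual: bool = True):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embedding_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.use_residual = use_residual

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, num_fields, embedding_dim]; every field attends to every other field
        out, _ = self.attn(x, x, x, need_weights=False)
        if self.use_residual:
            out = out + x
        return torch.relu(out)  # [B, num_fields, embedding_dim]

# Three stacked layers, then flatten and a single-logit head,
# mirroring fc = nn.Linear(num_fields * att_embedding_dim, 1) in the diff:
layers = nn.Sequential(*[FieldSelfAttention(8, 2) for _ in range(3)])
fields = torch.randn(2, 5, 8)   # 5 fields already projected to att_embedding_dim=8
logit = nn.Linear(5 * 8, 1)(layers(fields).flatten(start_dim=1))
print(logit.shape)  # torch.Size([2, 1])
```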
nextrec/models/ranking/dcn.py
CHANGED

```diff
@@ -15,24 +15,26 @@ from nextrec.basic.model import BaseModel
 from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
+
 class CrossNetwork(nn.Module):
     """Stacked Cross Layers from DCN (Wang et al., 2017)."""
 
     def __init__(self, input_dim, num_layers):
         super().__init__()
         self.num_layers = num_layers
-        self.w = torch.nn.ModuleList(
+        self.w = torch.nn.ModuleList(
+            [torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
+        )
+        self.b = torch.nn.ParameterList(
+            [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
+        )
 
     def forward(self, x):
-        """
-        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
-        """
         x0 = x
         for i in range(self.num_layers):
             xw = self.w[i](x)
             x = x0 * xw + self.b[i] + x
-        return x
+        return x  # [batch_size, input_dim]
 
 
 class DCN(BaseModel):
@@ -41,46 +43,48 @@ class DCN(BaseModel):
         return "DCN"
 
     @property
-    def
+    def default_task(self):
         return "binary"
-
-    def __init__(
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        cross_num: int = 3,
+        mlp_params: dict | None = None,
+        target: list[str] = [],
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict = {},
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
         super(DCN, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
        )
 
         self.loss = loss
         if self.loss is None:
             self.loss = "bce"
+
         # All features
         self.all_features = dense_features + sparse_features + sequence_features
 
@@ -88,30 +92,41 @@ class DCN(BaseModel):
         self.embedding = EmbeddingLayer(features=self.all_features)
 
         # Calculate input dimension
-        emb_dim_total = sum(
+        emb_dim_total = sum(
+            [
+                f.embedding_dim
+                for f in self.all_features
+                if not isinstance(f, DenseFeature)
+            ]
+        )
+        dense_input_dim = sum(
+            [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
+        )
         input_dim = emb_dim_total + dense_input_dim
 
         # Cross Network
         self.cross_network = CrossNetwork(input_dim=input_dim, num_layers=cross_num)
 
         # Deep Network (optional)
         if mlp_params is not None:
             self.use_dnn = True
             self.mlp = MLP(input_dim=input_dim, **mlp_params)
+            deep_dim = self.mlp.output_dim
             # Final layer combines cross and deep
-            self.final_layer = nn.Linear(
+            self.final_layer = nn.Linear(
+                input_dim + deep_dim, 1
+            )  # + deep_dim for MLP output
         else:
             self.use_dnn = False
             # Final layer only uses cross network output
             self.final_layer = nn.Linear(input_dim, 1)
 
-        self.prediction_layer = PredictionLayer(task_type=self.
+        self.prediction_layer = PredictionLayer(task_type=self.task)
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=["cross_network", "mlp", "final_layer"],
         )
 
         self.compile(
@@ -124,18 +139,20 @@
     def forward(self, x):
         # Get all embeddings and flatten
         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
 
         # Cross Network
         cross_output = self.cross_network(input_flat)  # [B, input_dim]
 
         if self.use_dnn:
             # Deep Network
             deep_output = self.mlp(input_flat)  # [B, deep_dim]
             # Concatenate cross and deep
-            combined = torch.cat(
+            combined = torch.cat(
+                [cross_output, deep_output], dim=-1
+            )  # [B, input_dim + deep_dim]
         else:
             combined = cross_output
 
         # Final prediction
         y = self.final_layer(combined)
         return self.prediction_layer(y)
```
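The reworked CrossNetwork is compact enough to restate standalone. Each layer computes x_{l+1} = x0 * (w_l · x_l) + b_l + x_l, where w_l · x_l is a per-sample scalar, so a layer costs only 2 * input_dim parameters while re-mixing the original input x0 at every depth. A self-contained version of the class from the diff, with a shape check:

```python
import torch
import torch.nn as nn

class CrossNetwork(nn.Module):
    """DCN cross layers: x_{l+1} = x0 * (w_l . x_l) + b_l + x_l.

    w_l . x_l is a scalar per sample, so each layer adds just input_dim weights
    plus an input_dim bias, yet mixes x0 back in at every step.
    """

    def __init__(self, input_dim: int, num_layers: int):
        super().__init__()
        self.w = nn.ModuleList(
            [nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
        )
        self.b = nn.ParameterList(
            [nn.Parameter(torch.zeros(input_dim)) for _ in range(num_layers)]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x0 = x                        # [B, input_dim]
        for w, b in zip(self.w, self.b):
            x = x0 * w(x) + b + x     # w(x): [B, 1], broadcasts across input_dim
        return x                      # [B, input_dim]

cross = CrossNetwork(input_dim=16, num_layers=3)
print(cross(torch.randn(4, 16)).shape)  # torch.Size([4, 16])
```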