nextrec-0.4.1-py3-none-any.whl → nextrec-0.4.3-py3-none-any.whl
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +250 -112
- nextrec/basic/loggers.py +63 -44
- nextrec/basic/metrics.py +270 -120
- nextrec/basic/model.py +1084 -402
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +492 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +273 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +69 -46
- nextrec/models/multi_task/mmoe.py +91 -53
- nextrec/models/multi_task/ple.py +117 -58
- nextrec/models/multi_task/poso.py +163 -55
- nextrec/models/multi_task/share_bottom.py +63 -36
- nextrec/models/ranking/afm.py +80 -45
- nextrec/models/ranking/autoint.py +74 -57
- nextrec/models/ranking/dcn.py +110 -48
- nextrec/models/ranking/dcn_v2.py +265 -45
- nextrec/models/ranking/deepfm.py +39 -24
- nextrec/models/ranking/dien.py +335 -146
- nextrec/models/ranking/din.py +158 -92
- nextrec/models/ranking/fibinet.py +134 -52
- nextrec/models/ranking/fm.py +68 -26
- nextrec/models/ranking/masknet.py +95 -33
- nextrec/models/ranking/pnn.py +128 -58
- nextrec/models/ranking/widedeep.py +40 -28
- nextrec/models/ranking/xdeepfm.py +67 -40
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +496 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +33 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/model.py +22 -0
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/METADATA +53 -24
- nextrec-0.4.3.dist-info/RECORD +69 -0
- nextrec-0.4.3.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/afm.py
CHANGED
@@ -1,7 +1,7 @@
 """
 Date: create on 09/11/2025
-Checkpoint: edit on
-Author: Yang Zhou,zyaztec@gmail.com
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
 Reference:
 [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
 feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
@@ -40,7 +40,7 @@ import torch
 import torch.nn as nn
 
 from nextrec.basic.model import BaseModel
-from nextrec.basic.layers import EmbeddingLayer,
+from nextrec.basic.layers import EmbeddingLayer, PredictionLayer, InputMask
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
 
@@ -52,25 +52,35 @@ class AFM(BaseModel):
     @property
     def default_task(self):
         return "binary"
-
-    def __init__(
-        [17 old lines not rendered in the diff view]
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        attention_dim: int = 32,
+        attention_dropout: float = 0.0,
+        target: list[str] | str | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
         super(AFM, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -82,31 +92,32 @@ class AFM(BaseModel):
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
         )
 
-        if target is None:
-            target = []
-        if optimizer_params is None:
-            optimizer_params = {}
-        if loss is None:
-            loss = "bce"
-
         self.fm_features = sparse_features + sequence_features
         if len(self.fm_features) < 2:
-            raise ValueError(
+            raise ValueError(
+                "AFM requires at least two sparse/sequence features to build pairwise interactions."
+            )
 
         # make sure all embedding dimension are the same for FM features
         self.embedding_dim = self.fm_features[0].embedding_dim
         if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
-            raise ValueError(
+            raise ValueError(
+                "All FM features must share the same embedding_dim for AFM."
+            )
 
-        self.embedding = EmbeddingLayer(
+        self.embedding = EmbeddingLayer(
+            features=self.fm_features
+        )  # [Batch, Field, Dim]
 
         # First-order terms: dense linear + one hot embeddings
         self.dense_features = list(dense_features)
         dense_input_dim = sum([f.input_dim for f in self.dense_features])
-        self.linear_dense =
+        self.linear_dense = (
+            nn.Linear(dense_input_dim, 1, bias=True) if dense_input_dim > 0 else None
+        )
 
         # First-order term: sparse/sequence features one-hot
         # **INFO**: source paper does not contain sequence features in experiments,
@@ -114,9 +125,15 @@ class AFM(BaseModel):
         # remove sequence features from fm_features.
         self.first_order_embeddings = nn.ModuleDict()
         for feature in self.fm_features:
-            if
+            if (
+                feature.embedding_name in self.first_order_embeddings
+            ):  # shared embedding
                 continue
-            emb = nn.Embedding(
+            emb = nn.Embedding(
+                num_embeddings=feature.vocab_size,
+                embedding_dim=1,
+                padding_idx=feature.padding_idx,
+            )  # equal to one-hot encoding weight
             # nn.init.zeros_(emb.weight)
             self.first_order_embeddings[feature.embedding_name] = emb
 
@@ -129,11 +146,18 @@ class AFM(BaseModel):
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=[
+                "linear_dense",
+                "attention_linear",
+                "attention_p",
+                "output_projection",
+            ],
         )
         # add first-order embeddings to embedding regularization list
-        self.embedding_params.extend(
+        self.embedding_params.extend(
+            emb.weight for emb in self.first_order_embeddings.values()
+        )
 
         self.compile(
             optimizer=optimizer,
@@ -143,13 +167,17 @@ class AFM(BaseModel):
         )
 
     def forward(self, x):
-        field_emb = self.embedding(
+        field_emb = self.embedding(
+            x=x, features=self.fm_features, squeeze_dim=False
+        )  # [B, F, D]
         batch_size = field_emb.size(0)
         y_linear = torch.zeros(batch_size, 1, device=field_emb.device)
 
         # First-order dense part
         if self.linear_dense is not None:
-            dense_inputs = [
+            dense_inputs = [
+                x[f.name].float().view(batch_size, -1) for f in self.dense_features
+            ]
             dense_stack = torch.cat(dense_inputs, dim=1) if dense_inputs else None
             if dense_stack is not None:
                 y_linear = y_linear + self.linear_dense(dense_stack)
@@ -161,7 +189,7 @@ class AFM(BaseModel):
             if isinstance(feature, SparseFeature):
                 term = emb(x[feature.name].long())  # [B, 1]
             else:  # SequenceFeature
-                seq_input = x[feature.name].long()
+                seq_input = x[feature.name].long()  # [B, 1]
                 if feature.max_len is not None and seq_input.size(1) > feature.max_len:
                     seq_input = seq_input[:, -feature.max_len :]
                 mask = self.input_mask(x, feature, seq_input).squeeze(1)  # [B, 1]
@@ -169,7 +197,9 @@ class AFM(BaseModel):
                 term = (seq_weight * mask).sum(dim=1, keepdim=True)  # [B, 1]
             first_order_terms.append(term)
         if first_order_terms:
-            y_linear = y_linear + torch.sum(
+            y_linear = y_linear + torch.sum(
+                torch.cat(first_order_terms, dim=1), dim=1, keepdim=True
+            )
 
         interactions = []
         feature_values = []
@@ -182,13 +212,18 @@ class AFM(BaseModel):
             else:
                 if isinstance(feature, SequenceFeature):
                     seq_input = x[feature.name].long()
-                    if
+                    if (
+                        feature.max_len is not None
+                        and seq_input.size(1) > feature.max_len
+                    ):
                         seq_input = seq_input[:, -feature.max_len :]
                     value = self.input_mask(x, feature, seq_input).sum(dim=2)  # [B, 1]
                 else:
                     value = torch.ones(batch_size, 1, device=field_emb.device)
             feature_values.append(value)
-        feature_values_tensor = torch.cat(feature_values, dim=1).unsqueeze(
+        feature_values_tensor = torch.cat(feature_values, dim=1).unsqueeze(
+            -1
+        )  # [B, F, 1]
         field_emb = field_emb * feature_values_tensor
 
         num_fields = field_emb.shape[1]
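Note: the hunks above touch AFM's first-order terms and field-value masking, while the attention pooling that consumes field_emb sits outside the changed lines. For orientation, here is a minimal self-contained sketch of that attention-pooled pairwise interaction. The wiring is inferred from the module names registered for regularization (attention_linear, attention_p, output_projection); it is an illustration under those assumptions, not the package's code.

import torch
import torch.nn as nn

B, F, D, attention_dim = 4, 5, 8, 32
field_emb = torch.randn(B, F, D)  # embeddings of F sparse/sequence fields

# All pairwise element-wise products v_i * v_j for i < j: [B, P, D], P = F*(F-1)/2
i, j = torch.triu_indices(F, F, offset=1)
pairwise = field_emb[:, i, :] * field_emb[:, j, :]

# Attention network scores each pair; softmax normalizes across the P pairs
attention_linear = nn.Sequential(nn.Linear(D, attention_dim), nn.ReLU())  # assumed wiring
attention_p = nn.Linear(attention_dim, 1, bias=False)                     # assumed wiring
weights = torch.softmax(attention_p(attention_linear(pairwise)), dim=1)   # [B, P, 1]

# Weighted sum over pairs, then project to the second-order logit
output_projection = nn.Linear(D, 1, bias=False)                           # assumed wiring
second_order_logit = output_projection((weights * pairwise).sum(dim=1))   # [B, 1]
print(second_order_logit.shape)  # torch.Size([4, 1])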
nextrec/models/ranking/autoint.py
CHANGED

@@ -1,10 +1,10 @@
 """
 Date: create on 09/11/2025
-Checkpoint: edit on
-Author: Yang Zhou,zyaztec@gmail.com
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
 Reference:
-[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
-self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
 on information and knowledge management. 2019: 1161-1170.
 (https://arxiv.org/abs/1810.11921)
 
@@ -70,29 +70,31 @@ class AutoInt(BaseModel):
     @property
     def default_task(self):
         return "binary"
-
-    def __init__(
-        [21 old lines not rendered in the diff view]
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        att_layer_num: int = 3,
+        att_embedding_dim: int = 8,
+        att_head_num: int = 2,
+        att_dropout: float = 0.0,
+        att_use_residual: bool = True,
+        target: list[str] | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
         super(AutoInt, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -104,7 +106,7 @@ class AutoInt(BaseModel):
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
         )
 
         if target is None:
@@ -113,52 +115,59 @@ class AutoInt(BaseModel):
             optimizer_params = {}
         if loss is None:
             loss = "bce"
-
+
         self.att_layer_num = att_layer_num
         self.att_embedding_dim = att_embedding_dim
-
+
         # Use sparse and sequence features for interaction
         # **INFO**: this is different from the original paper, we also include dense features
         # if you want to follow the paper strictly, set dense_features=[]
         # or modify the code accordingly
-        self.interaction_features = dense_features + sparse_features + sequence_features
-
+        self.interaction_features = dense_features + sparse_features + sequence_features
+
         # All features for embedding
         self.all_features = dense_features + sparse_features + sequence_features
 
         # Embedding layer
         self.embedding = EmbeddingLayer(features=self.all_features)
-
+
         # Project embeddings to attention embedding dimension
         num_fields = len(self.interaction_features)
-
+
         # If embeddings have different dimensions, project them to att_embedding_dim
-        self.need_projection = not all(
+        self.need_projection = not all(
+            f.embedding_dim == att_embedding_dim for f in self.interaction_features
+        )
         self.projection_layers = None
         if self.need_projection:
-            self.projection_layers = nn.ModuleList(
-            [4 old lines not rendered in the diff view]
+            self.projection_layers = nn.ModuleList(
+                [
+                    nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
+                    for f in self.interaction_features
+                ]
+            )
+
         # Multi-head self-attention layers
-        self.attention_layers = nn.ModuleList(
-            [8 old lines not rendered in the diff view]
+        self.attention_layers = nn.ModuleList(
+            [
+                MultiHeadSelfAttention(
+                    embedding_dim=att_embedding_dim,
+                    num_heads=att_head_num,
+                    dropout=att_dropout,
+                    use_residual=att_use_residual,
+                )
+                for _ in range(att_layer_num)
+            ]
+        )
+
         # Final prediction layer
         self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
         self.prediction_layer = PredictionLayer(task_type=self.default_task)
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=["projection_layers", "attention_layers", "fc"],
         )
 
         self.compile(
@@ -172,21 +181,29 @@ class AutoInt(BaseModel):
         # Get embeddings field-by-field so mixed dimensions can be projected safely
         field_embeddings = []
         if len(self.interaction_features) == 0:
-            raise ValueError(
+            raise ValueError(
+                "AutoInt requires at least one sparse or sequence feature for interactions."
+            )
         for idx, feature in enumerate(self.interaction_features):
             feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
             feature_emb = feature_emb.squeeze(1)  # [B, embedding_dim]
             if self.need_projection and self.projection_layers is not None:
                 feature_emb = self.projection_layers[idx](feature_emb)
-            field_embeddings.append(
+            field_embeddings.append(
+                feature_emb.unsqueeze(1)
+            )  # [B, 1, att_embedding_dim or original_dim]
         embeddings = torch.cat(field_embeddings, dim=1)
-
+
         # Apply multi-head self-attention layers
         attention_output = embeddings
         for att_layer in self.attention_layers:
-            attention_output = att_layer(
-            [1 old line not rendered in the diff view]
+            attention_output = att_layer(
+                attention_output
+            )  # [B, num_fields, att_embedding_dim]
+
         # Flatten and predict
-        attention_output_flat = attention_output.flatten(
+        attention_output_flat = attention_output.flatten(
+            start_dim=1
+        )  # [B, num_fields * att_embedding_dim]
         y = self.fc(attention_output_flat)  # [B, 1]
         return self.prediction_layer(y)
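Note: MultiHeadSelfAttention's internals are not part of this diff. As a rough stand-in, the stack instantiated above applies att_layer_num rounds of self-attention across the field axis; a sketch using torch.nn.MultiheadAttention, with the residual-plus-ReLU step assumed from att_use_residual and the AutoInt paper rather than taken from the package:

import torch
import torch.nn as nn

B, num_fields, att_embedding_dim = 4, 6, 8
att_layer_num, att_head_num = 3, 2
embeddings = torch.randn(B, num_fields, att_embedding_dim)  # fields after projection

attention_layers = nn.ModuleList(
    [
        nn.MultiheadAttention(att_embedding_dim, att_head_num, batch_first=True)
        for _ in range(att_layer_num)
    ]
)

out = embeddings
for attn in attention_layers:
    residual = out                   # skip path, assumed from att_use_residual=True
    out, _ = attn(out, out, out)     # self-attention over the field axis
    out = torch.relu(out + residual)

fc = nn.Linear(num_fields * att_embedding_dim, 1)
logit = fc(out.flatten(start_dim=1))  # [B, 1], matching AutoInt.forward's final step
print(logit.shape)  # torch.Size([4, 1])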
nextrec/models/ranking/dcn.py
CHANGED
@@ -1,11 +1,53 @@
 """
 Date: create on 09/11/2025
-[2 old lines not rendered in the diff view]
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
 Reference:
-[3 old lines not rendered in the diff view]
+[1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]
+//Proceedings of the ADKDD'17. 2017: 1-7.
+(https://arxiv.org/abs/1708.05123)
+
+Deep & Cross Network (DCN) mixes explicit polynomial feature crosses with a deep
+MLP branch to capture both low-order and high-order interactions for CTR-style
+tasks. Cross Layers repeatedly apply x_{l+1} = x0 * (w_l^T x_l) + b_l + x_l,
+which expands feature crosses with linear parameter growth, while the deep branch
+learns nonlinear patterns on the same shared embeddings. The final prediction
+concatenates (or solely uses) cross outputs before a linear head, offering a
+balanced trade-off between interpretability and expressiveness.
+
+Workflow:
+(1) Embed sparse/sequence features and concatenate with dense inputs
+(2) Cross Network builds explicit polynomial interactions via residual crosses
+(3) Optional MLP models implicit high-order nonlinear relationships
+(4) Cross output (and deep output if enabled) are fused for the final logit
+(5) Prediction layer maps logits to binary CTR scores
+
+Key Advantages:
+- Explicit, low-cost cross features with O(L * d) parameters
+- Residual cross formulation stabilizes optimization
+- Optional deep tower increases capacity without losing interpretability
+- Shared embeddings reduce redundant parameters and preprocessing
+- Strong, simple baseline for ad/recommendation ranking tasks
+
+DCN(Deep & Cross Network)通过 Cross 层显式生成多项式特征交互,同时可选 Deep
+分支学习高阶非线性关系,两者共享 embedding。Cross 层按
+x_{l+1} = x0 * (w_l^T x_l) + b_l + x_l 递推,参数线性增长且具解释性;
+Deep 分支提升表达能力;最终将 Cross(及 Deep)结果送入线性层与预测层,形成兼具
+效率与效果的 CTR/CVR 预估模型。
+
+流程:
+(1) 对稀疏/序列特征做 embedding,并与稠密特征拼接
+(2) Cross 层以残差形式显式构造多阶交叉特征
+(3) 可选 MLP 学习隐式高阶非线性交互
+(4) 将 Cross(及 Deep)输出融合后接线性头得到 logit
+(5) 预测层输出二分类 CTR 分数
+
+主要优点:
+- 显式交叉特征、参数线性增长、易解释
+- 残差式 Cross 提升训练稳定性
+- Deep 分支可灵活增强模型容量
+- 共享 embedding,减少冗余参数与预处理
+- CTR/CVR 排序任务的简洁强基线
 """
 
 import torch
@@ -15,21 +57,27 @@ from nextrec.basic.model import BaseModel
 from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
+
 class CrossNetwork(nn.Module):
     """Stacked Cross Layers from DCN (Wang et al., 2017)."""
 
     def __init__(self, input_dim, num_layers):
         super().__init__()
         self.num_layers = num_layers
-        self.w = torch.nn.ModuleList(
-        [1 old line not rendered in the diff view]
+        self.w = torch.nn.ModuleList(
+            [torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
+        )
+        self.b = torch.nn.ParameterList(
+            [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
+        )
 
     def forward(self, x):
         x0 = x
         for i in range(self.num_layers):
            xw = self.w[i](x)
            x = x0 * xw + self.b[i] + x
-        return x
+        return x  # [batch_size, input_dim]
+
 
 class DCN(BaseModel):
     @property
@@ -40,25 +88,34 @@ class DCN(BaseModel):
     def default_task(self):
         return "binary"
 
-    def __init__(
-        [18 old lines not rendered in the diff view]
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        cross_num: int = 3,
+        mlp_params: dict | None = None,
+        target: list[str] | str | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
         super(DCN, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -70,34 +127,37 @@ class DCN(BaseModel):
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            **kwargs
+            **kwargs,
         )
 
-        self.loss = loss
-        if self.loss is None:
-            self.loss = "bce"
-
-        # All features
-        self.all_features = dense_features + sparse_features + sequence_features
-
         # Embedding layer
         self.embedding = EmbeddingLayer(features=self.all_features)
 
         # Calculate input dimension
-        emb_dim_total = sum(
-        [1 old line not rendered in the diff view]
+        emb_dim_total = sum(
+            [
+                f.embedding_dim
+                for f in self.all_features
+                if not isinstance(f, DenseFeature)
+            ]
+        )
+        dense_input_dim = sum(
+            [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
+        )
         input_dim = emb_dim_total + dense_input_dim
-
-        # Cross Network
+
+        # Cross Network for explicit feature crosses
         self.cross_network = CrossNetwork(input_dim=input_dim, num_layers=cross_num)
-
-        # Deep Network
+
+        # Deep Network for implicit high-order interactions
         if mlp_params is not None:
             self.use_dnn = True
             self.mlp = MLP(input_dim=input_dim, **mlp_params)
             deep_dim = self.mlp.output_dim
             # Final layer combines cross and deep
-            self.final_layer = nn.Linear(
+            self.final_layer = nn.Linear(
+                input_dim + deep_dim, 1
+            )  # + deep_dim for MLP output
         else:
             self.use_dnn = False
             # Final layer only uses cross network output
@@ -107,8 +167,8 @@ class DCN(BaseModel):
 
         # Register regularization weights
         self.register_regularization_weights(
-            embedding_attr=
-            include_modules=[
+            embedding_attr="embedding",
+            include_modules=["cross_network", "mlp", "final_layer"],
         )
 
         self.compile(
@@ -121,18 +181,20 @@ class DCN(BaseModel):
     def forward(self, x):
         # Get all embeddings and flatten
         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
-
+
         # Cross Network
         cross_output = self.cross_network(input_flat)  # [B, input_dim]
-
+
         if self.use_dnn:
             # Deep Network
             deep_output = self.mlp(input_flat)  # [B, 1]
             # Concatenate cross and deep
-            combined = torch.cat(
+            combined = torch.cat(
+                [cross_output, deep_output], dim=-1
+            )  # [B, input_dim + 1]
         else:
             combined = cross_output
-
+
         # Final prediction
         y = self.final_layer(combined)
         return self.prediction_layer(y)