nextrec 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/layers.py +32 -8
- nextrec/basic/loggers.py +1 -1
- nextrec/basic/metrics.py +2 -1
- nextrec/basic/model.py +3 -3
- nextrec/cli.py +41 -47
- nextrec/data/dataloader.py +1 -1
- nextrec/models/multi_task/esmm.py +23 -16
- nextrec/models/multi_task/mmoe.py +36 -17
- nextrec/models/multi_task/ple.py +18 -12
- nextrec/models/multi_task/poso.py +68 -37
- nextrec/models/multi_task/share_bottom.py +16 -2
- nextrec/models/ranking/afm.py +14 -14
- nextrec/models/ranking/autoint.py +2 -2
- nextrec/models/ranking/dcn.py +61 -19
- nextrec/models/ranking/dcn_v2.py +224 -45
- nextrec/models/ranking/deepfm.py +14 -9
- nextrec/models/ranking/dien.py +215 -82
- nextrec/models/ranking/din.py +95 -57
- nextrec/models/ranking/fibinet.py +92 -30
- nextrec/models/ranking/fm.py +44 -8
- nextrec/models/ranking/masknet.py +7 -7
- nextrec/models/ranking/pnn.py +105 -38
- nextrec/models/ranking/widedeep.py +8 -4
- nextrec/models/ranking/xdeepfm.py +57 -10
- nextrec/utils/config.py +15 -3
- nextrec/utils/file.py +2 -1
- nextrec/utils/initializer.py +12 -16
- nextrec/utils/model.py +22 -0
- {nextrec-0.4.2.dist-info → nextrec-0.4.4.dist-info}/METADATA +57 -22
- {nextrec-0.4.2.dist-info → nextrec-0.4.4.dist-info}/RECORD +34 -34
- {nextrec-0.4.2.dist-info → nextrec-0.4.4.dist-info}/WHEEL +0 -0
- {nextrec-0.4.2.dist-info → nextrec-0.4.4.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.2.dist-info → nextrec-0.4.4.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/dcn_v2.py
CHANGED
@@ -1,15 +1,60 @@
 """
 Date: create on 09/11/2025
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] R. Wang et al. DCN V2: Improved Deep & Cross Network and Practical Lessons for
+        Web-scale Learning to Rank Systems. KDD 2021.
+        (https://arxiv.org/abs/2008.13535)
+
+DCN v2 enhances the original Deep & Cross Network by replacing the scalar cross
+weights with vector-wise (matrix) parameters and a Mixture-of-Low-Rank-Experts
+variant. The matrix cross (CrossNetV2) improves expressiveness with manageable
+parameter growth, while CrossNetMix decomposes the matrix into low-rank factors and
+gates across experts for stronger modeling at a similar cost. As in DCN, the cross
+tower explicitly builds polynomial feature interactions and can be paired with a
+deep MLP tower; their outputs are concatenated before a final linear head.
+
+Workflow:
+    (1) Embed sparse/sequence fields and concatenate with dense inputs
+    (2) Cross tower: choose matrix CrossNetV2 or low-rank CrossNetMix for explicit crosses
+    (3) Optional deep tower: MLP over the same flattened embeddings
+    (4) Fuse cross and deep outputs, then predict via a linear + prediction layer
+
+Key Advantages:
+    - Vector-wise/matrix cross weights capture richer interactions than DCN v1
+    - Low-rank MoE cross (CrossNetMix) boosts capacity without quadratic parameters
+    - Compatible with a deep tower for additional nonlinear modeling
+
+DCN v2 upgrades the scalar cross weights of the original DCN to vector/matrix
+parameters and introduces the low-rank mixture-of-experts CrossNetMix. The matrix
+cross (CrossNetV2) raises expressiveness while keeping parameters under control, and
+CrossNetMix further strengthens modeling capacity through low-rank decomposition and
+gating while staying parameter-efficient. As in DCN, the cross tower explicitly
+builds polynomial interactions and can be combined with an MLP deep tower in
+parallel or stacked form; the outputs are concatenated and passed to a linear head
+for prediction.
+
+Workflow:
+    (1) Embed sparse/sequence features and concatenate with dense features
+    (2) Cross tower: matrix CrossNetV2 or low-rank CrossNetMix builds explicit interactions
+    (3) Optional deep tower: an MLP over the same flattened input or the cross output
+    (4) Fuse cross and deep outputs; a linear layer and prediction layer produce the final score
+
+Key advantages:
+    - The matrix cross captures richer interactions than DCN v1
+    - The low-rank mixture of experts gives stronger modeling at a similar parameter count
+    - Compatible with parallel/stacked deep towers for flexible nonlinear modeling
 """
 
 import torch
 import torch.nn as nn
 
+from nextrec.basic.model import BaseModel
+from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
 
 class CrossNetV2(nn.Module):
     """Vector-wise cross network proposed in DCN V2 (Wang et al., 2021)."""
 
-    def __init__(self, input_dim, num_layers):
+    def __init__(self, input_dim: int, num_layers: int):
         super().__init__()
         self.num_layers = num_layers
         self.w = torch.nn.ModuleList(
@@ -22,7 +67,7 @@ class CrossNetV2(nn.Module):
             [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
         )
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         x0 = x
         for i in range(self.num_layers):
             x = x0 * self.w[i](x) + self.b[i] + x
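The context line closing the hunk above, x = x0 * self.w[i](x) + self.b[i] + x, is the vector-wise cross update from the DCN V2 paper. Written out (a standard formulation; W_l is the weight of the linear layer self.w[i] and b_l is self.b[i]):

    x_{l+1} = x_0 \odot (W_l x_l + b_l) + x_l

Here x_0 is the flattened input embedding, the Hadamard product adds one polynomial order per cross layer, and the trailing + x_l is a residual connection.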
@@ -32,94 +77,228 @@ class CrossNetV2(nn.Module):
 class CrossNetMix(nn.Module):
     """Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021)."""
 
-    def __init__(
-
+    def __init__(
+        self,
+        input_dim: int,
+        num_layers: int = 2,
+        low_rank: int = 32,
+        num_experts: int = 4,
+    ):
+        super().__init__()
         self.num_layers = num_layers
         self.num_experts = num_experts
 
-
-        self.u_list = torch.nn.ParameterList(
+        self.u_list = nn.ParameterList(
             [
                 nn.Parameter(
                     nn.init.xavier_normal_(
                         torch.empty(num_experts, input_dim, low_rank)
                     )
                 )
-                for
+                for _ in range(num_layers)
             ]
         )
-
-        self.v_list = torch.nn.ParameterList(
+        self.v_list = nn.ParameterList(
             [
                 nn.Parameter(
                     nn.init.xavier_normal_(
                         torch.empty(num_experts, input_dim, low_rank)
                     )
                 )
-                for
+                for _ in range(num_layers)
             ]
         )
-
-        self.c_list = torch.nn.ParameterList(
+        self.c_list = nn.ParameterList(
             [
                 nn.Parameter(
                     nn.init.xavier_normal_(torch.empty(num_experts, low_rank, low_rank))
                 )
-                for
+                for _ in range(num_layers)
             ]
         )
+
         self.gating = nn.ModuleList(
-            [nn.Linear(input_dim, 1, bias=False) for
+            [nn.Linear(input_dim, 1, bias=False) for _ in range(num_experts)]
         )
 
-        self.bias =
-        [
-            nn.Parameter(nn.init.zeros_(torch.empty(input_dim, 1)))
-            for i in range(self.num_layers)
-        ]
+        self.bias = nn.ParameterList(
+            [nn.Parameter(torch.zeros(input_dim, 1)) for _ in range(num_layers)]
         )
 
-    def forward(self, x):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x: (bs, in_features)
         x_0 = x.unsqueeze(2)  # (bs, in_features, 1)
         x_l = x_0
+
         for i in range(self.num_layers):
             output_of_experts = []
             gating_score_experts = []
+
+            gating_input = x_l.squeeze(2)  # (bs, in_features)
+
             for expert_id in range(self.num_experts):
-                #
-
-
+                # Gating
+                gating_score_experts.append(
+                    self.gating[expert_id](gating_input)
+                )  # (bs, 1)
 
-                #
-
-
-
-                )  # (bs, low_rank, 1)
+                # Low-rank cross: U C V^T x_l
+                V = self.v_list[i][expert_id]  # (in_features, low_rank)
+                C = self.c_list[i][expert_id]  # (low_rank, low_rank)
+                U = self.u_list[i][expert_id]  # (in_features, low_rank)
 
-                #
-                v_x =
-                v_x =
+                # (bs, 1, low_rank)
+                v_x = x_l.transpose(1, 2).matmul(V)  # x_l^T V
+                v_x = v_x.matmul(C)  # · C
                 v_x = torch.tanh(v_x)
 
-                #
-                uv_x =
-                    self.u_list[i][expert_id], v_x
-                )  # (bs, in_features, 1)
+                # (bs, in_features, 1)
+                uv_x = U.matmul(v_x.transpose(1, 2))
 
-
-                dot_ = x_0 *
+                # x_0 ⊙ (uv_x + b)
+                dot_ = x_0 * (uv_x + self.bias[i])  # (bs, in_features, 1)
 
-                output_of_experts.append(dot_.squeeze(2))
+                output_of_experts.append(dot_.squeeze(2))  # (bs, in_features)
 
-            # (3)
+            # (3) Mixture of experts
             output_of_experts = torch.stack(
-                output_of_experts, 2
+                output_of_experts, dim=2
             )  # (bs, in_features, num_experts)
             gating_score_experts = torch.stack(
-                gating_score_experts, 1
+                gating_score_experts, dim=1
            )  # (bs, num_experts, 1)
-
-
+            gating_score_experts = gating_score_experts.softmax(dim=1)
+
+            moe_out = torch.matmul(
+                output_of_experts, gating_score_experts
+            )  # (bs, in_features, 1)
+            x_l = moe_out + x_l  # residual
+
+        return x_l.squeeze(-1)  # (bs, in_features)
+
+
+class DCNv2(BaseModel):
+    @property
+    def model_name(self) -> str:
+        return "DCNv2"
+
+    @property
+    def default_task(self):
+        return "binary"
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        cross_num: int = 3,
+        cross_type: str = "matrix",
+        architecture: str = "parallel",
+        low_rank: int = 32,
+        num_experts: int = 4,
+        mlp_params: dict | None = None,
+        target: list[str] | str | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
+        super(DCNv2, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task or self.default_task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            **kwargs,
+        )
+
+        self.all_features = dense_features + sparse_features + sequence_features
+        self.embedding = EmbeddingLayer(features=self.all_features)
+        input_dim = self.embedding.input_dim
+
+        architecture = architecture.lower()
+        if architecture not in {"parallel", "stacked"}:
+            raise ValueError("architecture must be 'parallel' or 'stacked'.")
+        self.architecture = architecture
+
+        cross_type = cross_type.lower()
+        if cross_type == "matrix":
+            self.cross_network = CrossNetV2(input_dim=input_dim, num_layers=cross_num)
+        elif cross_type in {"mix", "low_rank"}:
+            self.cross_network = CrossNetMix(
+                input_dim=input_dim,
+                num_layers=cross_num,
+                low_rank=low_rank,
+                num_experts=num_experts,
+            )
+        else:
+            raise ValueError("Unsupported cross_type for DCNv2. Use 'matrix' or 'mix'.")
+
+        if mlp_params is not None:
+            self.use_dnn = True
+            dnn_params = dict(mlp_params)
+            dnn_params.setdefault("output_layer", False)
+            self.mlp = MLP(input_dim=input_dim, **dnn_params)
+            deep_dim = self.mlp.output_dim
+            final_input_dim = (
+                input_dim + deep_dim if architecture == "parallel" else deep_dim
+            )
+        else:
+            if architecture == "stacked":
+                raise ValueError(
+                    "Stacked architecture requires mlp_params (deep tower)."
+                )
+            self.use_dnn = False
+            self.mlp = None
+            final_input_dim = input_dim
+
+        self.final_layer = nn.Linear(final_input_dim, 1)
+        self.prediction_layer = PredictionLayer(task_type=self.default_task)
+
+        self.register_regularization_weights(
+            embedding_attr="embedding",
+            include_modules=["cross_network", "mlp", "final_layer"],
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss,
+            loss_params=loss_params,
+        )
+
+    def forward(self, x) -> torch.Tensor:
+        input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+        cross_out = self.cross_network(input_flat)
+
+        if self.use_dnn and self.mlp is not None:
+            if self.architecture == "parallel":
+                deep_out = self.mlp(input_flat)
+                combined = torch.cat([cross_out, deep_out], dim=-1)
+            else:  # stacked
+                deep_out = self.mlp(cross_out)
+                combined = deep_out
+        else:
+            combined = cross_out
 
-
-        return
+        logit = self.final_layer(combined)
+        return self.prediction_layer(logit)
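To make the shape comments in the new CrossNetMix.forward easier to follow, here is a minimal, self-contained PyTorch sketch of a single expert's low-rank cross step. It mirrors the tensor names used in the diff; the batch size, feature width, and rank below are illustrative values, not taken from nextrec.

import torch

bs, in_features, low_rank = 8, 16, 4
x_0 = torch.randn(bs, in_features, 1)   # flattened embeddings, unsqueezed to (bs, in_features, 1)
x_l = x_0.clone()

U = torch.randn(in_features, low_rank)  # stands in for self.u_list[i][expert_id]
V = torch.randn(in_features, low_rank)  # stands in for self.v_list[i][expert_id]
C = torch.randn(low_rank, low_rank)     # stands in for self.c_list[i][expert_id]
b = torch.zeros(in_features, 1)         # stands in for self.bias[i]

v_x = x_l.transpose(1, 2).matmul(V)     # x_l^T V               -> (bs, 1, low_rank)
v_x = torch.tanh(v_x.matmul(C))         # tanh((x_l^T V) C)     -> (bs, 1, low_rank)
uv_x = U.matmul(v_x.transpose(1, 2))    # project back with U   -> (bs, in_features, 1)
dot_ = x_0 * (uv_x + b)                 # x_0 elementwise (uv_x + bias) -> (bs, in_features, 1)

print(dot_.shape)                       # torch.Size([8, 16, 1])

In the full layer these per-expert outputs are stacked, weighted by the softmax of the gating scores, and added back to x_l as a residual, which is what the moe_out lines in the diff do.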
nextrec/models/ranking/deepfm.py
CHANGED
@@ -61,14 +61,14 @@ class DeepFM(BaseModel):
 
     def __init__(
         self,
-        dense_features: list[DenseFeature] |
-        sparse_features: list[SparseFeature] |
-        sequence_features: list[SequenceFeature] |
-        mlp_params: dict =
-        target: list[str] | str =
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        mlp_params: dict | None = None,
+        target: list[str] | str | None = None,
         task: str | list[str] | None = None,
         optimizer: str = "adam",
-        optimizer_params: dict =
+        optimizer_params: dict | None = None,
         loss: str | nn.Module | None = "bce",
         loss_params: dict | list[dict] | None = None,
         device: str = "cpu",
@@ -79,6 +79,14 @@ class DeepFM(BaseModel):
         **kwargs,
     ):
 
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        mlp_params = mlp_params or {}
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
         super(DeepFM, self).__init__(
             dense_features=dense_features,
             sparse_features=sparse_features,
@@ -94,9 +102,6 @@ class DeepFM(BaseModel):
         )
 
         self.loss = loss
-        if self.loss is None:
-            self.loss = "bce"
-
         self.fm_features = sparse_features + sequence_features
         self.deep_features = dense_features + sparse_features + sequence_features
         self.embedding = EmbeddingLayer(features=self.deep_features)