nextrec 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,15 +1,60 @@
  """
  Date: create on 09/11/2025
+ Checkpoint: edit on 09/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ Reference:
+ [1] R. Wang et al. DCN V2: Improved Deep & Cross Network and Practical Lessons for
+     Web-scale Learning to Rank Systems. KDD 2021.
+     (https://arxiv.org/abs/2008.13535)
+
+ DCN v2 enhances the original Deep & Cross Network by replacing the scalar cross
+ weights with vector-wise (matrix) parameters and a Mixture-of-Low-Rank-Experts
+ variant. The matrix cross (CrossNetV2) improves expressiveness with manageable
+ parameter growth, while CrossNetMix decomposes the matrix into low-rank factors and
+ gates across experts for stronger modeling at a similar cost. As in DCN, the cross
+ tower explicitly builds polynomial feature interactions and can be paired with a
+ deep MLP tower; their outputs are concatenated before a final linear head.
+
+ Workflow:
+ (1) Embed sparse/sequence fields and concatenate with dense inputs
+ (2) Cross tower: choose matrix CrossNetV2 or low-rank CrossNetMix for explicit crosses
+ (3) Optional deep tower: MLP over the same flattened embeddings
+ (4) Fuse cross and deep outputs, then predict via a linear + prediction layer
+
+ Key Advantages:
+ - Vector-wise/matrix cross weights capture richer interactions than DCN v1
+ - Low-rank MoE cross (CrossNetMix) boosts capacity without quadratic parameters
+ - Compatible with a deep tower for additional nonlinear modeling
+
+ DCN v2 builds on the original DCN by upgrading the scalar cross weights to vector/matrix
+ parameters and introducing CrossNetMix, a low-rank mixture-of-experts variant. The matrix
+ cross (CrossNetV2) improves expressiveness while keeping the parameter count under control,
+ and CrossNetMix further strengthens modeling capacity through low-rank decomposition and
+ gating while staying parameter-efficient. As in DCN, the cross tower explicitly builds
+ polynomial interactions and can be combined with an MLP deep tower, in parallel or stacked,
+ with the outputs concatenated and fed to a linear head for prediction.
+
+ Workflow:
+ (1) Embed sparse/sequence features and concatenate them with dense features
+ (2) Cross tower: matrix CrossNetV2 or low-rank CrossNetMix builds explicit interactions
+ (3) Optional deep tower: an MLP over the same flattened input or over the cross output
+ (4) Fuse the cross and deep outputs; a linear layer and prediction layer give the final score
+
+ Key advantages:
+ - The matrix cross captures richer interactions than DCN v1
+ - The low-rank mixture of experts adds modeling power at a similar parameter count
+ - Compatible with a parallel or stacked deep tower for flexible nonlinear modeling
  """

  import torch
  import torch.nn as nn

+ from nextrec.basic.model import BaseModel
+ from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+

  class CrossNetV2(nn.Module):
      """Vector-wise cross network proposed in DCN V2 (Wang et al., 2021)."""

-     def __init__(self, input_dim, num_layers):
+     def __init__(self, input_dim: int, num_layers: int):
          super().__init__()
          self.num_layers = num_layers
          self.w = torch.nn.ModuleList(
@@ -22,7 +67,7 @@ class CrossNetV2(nn.Module):
              [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
          )

-     def forward(self, x):
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
          x0 = x
          for i in range(self.num_layers):
              x = x0 * self.w[i](x) + self.b[i] + x
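
For reference, the line updated above is the DCN-v2 "matrix" cross step, x_{l+1} = x_0 * (W_l x_l + b_l) + x_l. A minimal standalone sketch (not part of the package), assuming each self.w[i] is a square, bias-free nn.Linear as the constructor suggests:

    import torch
    import torch.nn as nn

    bs, dim = 4, 8
    x0 = torch.randn(bs, dim)            # flattened embeddings, shape (bs, dim)
    w = nn.Linear(dim, dim, bias=False)  # one cross layer's weight
    b = torch.zeros(dim)                 # one cross layer's bias

    x = x0
    x = x0 * w(x) + b + x                # element-wise product keeps shape (bs, dim)
    print(x.shape)                       # torch.Size([4, 8])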
@@ -32,94 +77,228 @@ class CrossNetV2(nn.Module):
  class CrossNetMix(nn.Module):
      """Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021)."""

-     def __init__(self, input_dim, num_layers=2, low_rank=32, num_experts=4):
-         super(CrossNetMix, self).__init__()
+     def __init__(
+         self,
+         input_dim: int,
+         num_layers: int = 2,
+         low_rank: int = 32,
+         num_experts: int = 4,
+     ):
+         super().__init__()
          self.num_layers = num_layers
          self.num_experts = num_experts

-         # U: (input_dim, low_rank)
-         self.u_list = torch.nn.ParameterList(
+         self.u_list = nn.ParameterList(
              [
                  nn.Parameter(
                      nn.init.xavier_normal_(
                          torch.empty(num_experts, input_dim, low_rank)
                      )
                  )
-                 for i in range(self.num_layers)
+                 for _ in range(num_layers)
              ]
          )
-         # V: (input_dim, low_rank)
-         self.v_list = torch.nn.ParameterList(
+         self.v_list = nn.ParameterList(
              [
                  nn.Parameter(
                      nn.init.xavier_normal_(
                          torch.empty(num_experts, input_dim, low_rank)
                      )
                  )
-                 for i in range(self.num_layers)
+                 for _ in range(num_layers)
              ]
          )
-         # C: (low_rank, low_rank)
-         self.c_list = torch.nn.ParameterList(
+         self.c_list = nn.ParameterList(
              [
                  nn.Parameter(
                      nn.init.xavier_normal_(torch.empty(num_experts, low_rank, low_rank))
                  )
-                 for i in range(self.num_layers)
+                 for _ in range(num_layers)
              ]
          )
+
          self.gating = nn.ModuleList(
-             [nn.Linear(input_dim, 1, bias=False) for i in range(self.num_experts)]
+             [nn.Linear(input_dim, 1, bias=False) for _ in range(num_experts)]
          )

-         self.bias = torch.nn.ParameterList(
-             [
-                 nn.Parameter(nn.init.zeros_(torch.empty(input_dim, 1)))
-                 for i in range(self.num_layers)
-             ]
+         self.bias = nn.ParameterList(
+             [nn.Parameter(torch.zeros(input_dim, 1)) for _ in range(num_layers)]
          )

-     def forward(self, x):
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (bs, in_features)
          x_0 = x.unsqueeze(2)  # (bs, in_features, 1)
          x_l = x_0
+
          for i in range(self.num_layers):
              output_of_experts = []
              gating_score_experts = []
+
+             gating_input = x_l.squeeze(2)  # (bs, in_features)
+
              for expert_id in range(self.num_experts):
-                 # (1) G(x_l)
-                 # compute the gating score by x_l
-                 gating_score_experts.append(self.gating[expert_id](x_l.squeeze(2)))
+                 # Gating
+                 gating_score_experts.append(
+                     self.gating[expert_id](gating_input)
+                 )  # (bs, 1)

-                 # (2) E(x_l)
-                 # project the input x_l to $\mathbb{R}^{r}$
-                 v_x = torch.matmul(
-                     self.v_list[i][expert_id].t(), x_l
-                 )  # (bs, low_rank, 1)
+                 # Low-rank cross: U C V^T x_l
+                 V = self.v_list[i][expert_id]  # (in_features, low_rank)
+                 C = self.c_list[i][expert_id]  # (low_rank, low_rank)
+                 U = self.u_list[i][expert_id]  # (in_features, low_rank)

-                 # nonlinear activation in low rank space
-                 v_x = torch.tanh(v_x)
-                 v_x = torch.matmul(self.c_list[i][expert_id], v_x)
+                 # (bs, 1, low_rank)
+                 v_x = x_l.transpose(1, 2).matmul(V)  # x_l^T V
+                 v_x = v_x.matmul(C)  # · C
                  v_x = torch.tanh(v_x)

-                 # project back to $\mathbb{R}^{d}$
-                 uv_x = torch.matmul(
-                     self.u_list[i][expert_id], v_x
-                 )  # (bs, in_features, 1)
+                 # (bs, in_features, 1)
+                 uv_x = U.matmul(v_x.transpose(1, 2))

-                 dot_ = uv_x + self.bias[i]
-                 dot_ = x_0 * dot_  # Hadamard-product
+                 # x_0 ⊙ (uv_x + b)
+                 dot_ = x_0 * (uv_x + self.bias[i])  # (bs, in_features, 1)

-                 output_of_experts.append(dot_.squeeze(2))
+                 output_of_experts.append(dot_.squeeze(2))  # (bs, in_features)

-             # (3) mixture of low-rank experts
+             # (3) Mixture of experts
              output_of_experts = torch.stack(
-                 output_of_experts, 2
+                 output_of_experts, dim=2
              )  # (bs, in_features, num_experts)
              gating_score_experts = torch.stack(
-                 gating_score_experts, 1
+                 gating_score_experts, dim=1
              )  # (bs, num_experts, 1)
-             moe_out = torch.matmul(output_of_experts, gating_score_experts.softmax(1))
-             x_l = moe_out + x_l  # (bs, in_features, 1)
+             gating_score_experts = gating_score_experts.softmax(dim=1)
+
+             moe_out = torch.matmul(
+                 output_of_experts, gating_score_experts
+             )  # (bs, in_features, 1)
+             x_l = moe_out + x_l  # residual
+
+         return x_l.squeeze(-1)  # (bs, in_features)
+
+
+ class DCNv2(BaseModel):
+     @property
+     def model_name(self) -> str:
+         return "DCNv2"
+
+     @property
+     def default_task(self):
+         return "binary"
+
+     def __init__(
+         self,
+         dense_features: list[DenseFeature] | None = None,
+         sparse_features: list[SparseFeature] | None = None,
+         sequence_features: list[SequenceFeature] | None = None,
+         cross_num: int = 3,
+         cross_type: str = "matrix",
+         architecture: str = "parallel",
+         low_rank: int = 32,
+         num_experts: int = 4,
+         mlp_params: dict | None = None,
+         target: list[str] | str | None = None,
+         task: str | list[str] | None = None,
+         optimizer: str = "adam",
+         optimizer_params: dict | None = None,
+         loss: str | nn.Module | None = "bce",
+         loss_params: dict | list[dict] | None = None,
+         device: str = "cpu",
+         embedding_l1_reg=1e-6,
+         dense_l1_reg=1e-5,
+         embedding_l2_reg=1e-5,
+         dense_l2_reg=1e-4,
+         **kwargs,
+     ):
+         dense_features = dense_features or []
+         sparse_features = sparse_features or []
+         sequence_features = sequence_features or []
+         optimizer_params = optimizer_params or {}
+         if loss is None:
+             loss = "bce"
+
+         super(DCNv2, self).__init__(
+             dense_features=dense_features,
+             sparse_features=sparse_features,
+             sequence_features=sequence_features,
+             target=target,
+             task=task or self.default_task,
+             device=device,
+             embedding_l1_reg=embedding_l1_reg,
+             dense_l1_reg=dense_l1_reg,
+             embedding_l2_reg=embedding_l2_reg,
+             dense_l2_reg=dense_l2_reg,
+             **kwargs,
+         )
+
+         self.all_features = dense_features + sparse_features + sequence_features
+         self.embedding = EmbeddingLayer(features=self.all_features)
+         input_dim = self.embedding.input_dim
+
+         architecture = architecture.lower()
+         if architecture not in {"parallel", "stacked"}:
+             raise ValueError("architecture must be 'parallel' or 'stacked'.")
+         self.architecture = architecture
+
+         cross_type = cross_type.lower()
+         if cross_type == "matrix":
+             self.cross_network = CrossNetV2(input_dim=input_dim, num_layers=cross_num)
+         elif cross_type in {"mix", "low_rank"}:
+             self.cross_network = CrossNetMix(
+                 input_dim=input_dim,
+                 num_layers=cross_num,
+                 low_rank=low_rank,
+                 num_experts=num_experts,
+             )
+         else:
+             raise ValueError("Unsupported cross_type for DCNv2. Use 'matrix' or 'mix'.")
+
+         if mlp_params is not None:
+             self.use_dnn = True
+             dnn_params = dict(mlp_params)
+             dnn_params.setdefault("output_layer", False)
+             self.mlp = MLP(input_dim=input_dim, **dnn_params)
+             deep_dim = self.mlp.output_dim
+             final_input_dim = (
+                 input_dim + deep_dim if architecture == "parallel" else deep_dim
+             )
+         else:
+             if architecture == "stacked":
+                 raise ValueError(
+                     "Stacked architecture requires mlp_params (deep tower)."
+                 )
+             self.use_dnn = False
+             self.mlp = None
+             final_input_dim = input_dim
+
+         self.final_layer = nn.Linear(final_input_dim, 1)
+         self.prediction_layer = PredictionLayer(task_type=self.default_task)
+
+         self.register_regularization_weights(
+             embedding_attr="embedding",
+             include_modules=["cross_network", "mlp", "final_layer"],
+         )
+
+         self.compile(
+             optimizer=optimizer,
+             optimizer_params=optimizer_params,
+             loss=loss,
+             loss_params=loss_params,
+         )
+
+     def forward(self, x) -> torch.Tensor:
+         input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
+         cross_out = self.cross_network(input_flat)
+
+         if self.use_dnn and self.mlp is not None:
+             if self.architecture == "parallel":
+                 deep_out = self.mlp(input_flat)
+                 combined = torch.cat([cross_out, deep_out], dim=-1)
+             else:  # stacked
+                 deep_out = self.mlp(cross_out)
+                 combined = deep_out
+         else:
+             combined = cross_out

-         x_l = x_l.squeeze()  # (bs, in_features)
-         return x_l
+         logit = self.final_layer(combined)
+         return self.prediction_layer(logit)
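
The new DCNv2 class wires the cross tower and the optional MLP together exactly as the module docstring describes. Below is a hypothetical construction sketch based only on the signature added above; the DenseFeature/SparseFeature arguments, the import path, the mlp_params keys, and the dict-style input to forward are assumptions about nextrec's API rather than facts from this diff:

    import torch
    from nextrec.basic.features import DenseFeature, SparseFeature
    from nextrec.models.dcn_v2 import DCNv2  # hypothetical import path

    # Hypothetical feature definitions; the real constructor arguments may differ.
    dense = [DenseFeature(name="price")]
    sparse = [SparseFeature(name="item_id", vocab_size=10_000, embed_dim=16)]

    model = DCNv2(
        dense_features=dense,
        sparse_features=sparse,
        cross_num=3,
        cross_type="mix",                # low-rank mixture-of-experts cross tower
        architecture="parallel",         # cross and deep towers side by side
        mlp_params={"dims": [128, 64]},  # assumed MLP configuration keys
        target="label",
        task="binary",
    )

    # forward presumably takes a feature dict keyed by feature name.
    batch = {"price": torch.randn(32, 1), "item_id": torch.randint(0, 10_000, (32,))}
    scores = model(batch)                # probabilities from PredictionLayer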
@@ -61,14 +61,14 @@ class DeepFM(BaseModel):

      def __init__(
          self,
-         dense_features: list[DenseFeature] | list = [],
-         sparse_features: list[SparseFeature] | list = [],
-         sequence_features: list[SequenceFeature] | list = [],
-         mlp_params: dict = {},
-         target: list[str] | str = [],
+         dense_features: list[DenseFeature] | None = None,
+         sparse_features: list[SparseFeature] | None = None,
+         sequence_features: list[SequenceFeature] | None = None,
+         mlp_params: dict | None = None,
+         target: list[str] | str | None = None,
          task: str | list[str] | None = None,
          optimizer: str = "adam",
-         optimizer_params: dict = {},
+         optimizer_params: dict | None = None,
          loss: str | nn.Module | None = "bce",
          loss_params: dict | list[dict] | None = None,
          device: str = "cpu",
@@ -79,6 +79,14 @@ class DeepFM(BaseModel):
          **kwargs,
      ):

+         dense_features = dense_features or []
+         sparse_features = sparse_features or []
+         sequence_features = sequence_features or []
+         mlp_params = mlp_params or {}
+         optimizer_params = optimizer_params or {}
+         if loss is None:
+             loss = "bce"
+
          super(DeepFM, self).__init__(
              dense_features=dense_features,
              sparse_features=sparse_features,
@@ -94,9 +102,6 @@ class DeepFM(BaseModel):
          )

          self.loss = loss
-         if self.loss is None:
-             self.loss = "bce"
-
          self.fm_features = sparse_features + sequence_features
          self.deep_features = dense_features + sparse_features + sequence_features
          self.embedding = EmbeddingLayer(features=self.deep_features)
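
The DeepFM changes mirror the DCNv2 constructor above: mutable default arguments ([] and {}) are replaced with None plus an `or` fallback inside __init__. A minimal standalone illustration of the pitfall this avoids (not from the package):

    def bad(features=[]):          # a single list object shared by every call
        features.append("x")
        return features

    def good(features=None):       # a fresh list per call
        features = features or []
        features.append("x")
        return features

    print(bad())                   # ['x']
    print(bad())                   # ['x', 'x']  <- state leaked from the first call
    print(good())                  # ['x']
    print(good())                  # ['x']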