nextrec 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +220 -106
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1082 -400
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +272 -95
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +53 -37
- nextrec/models/multi_task/mmoe.py +64 -45
- nextrec/models/multi_task/ple.py +101 -48
- nextrec/models/multi_task/poso.py +113 -36
- nextrec/models/multi_task/share_bottom.py +48 -35
- nextrec/models/ranking/afm.py +72 -37
- nextrec/models/ranking/autoint.py +72 -55
- nextrec/models/ranking/dcn.py +55 -35
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +32 -22
- nextrec/models/ranking/dien.py +155 -99
- nextrec/models/ranking/din.py +85 -57
- nextrec/models/ranking/fibinet.py +52 -32
- nextrec/models/ranking/fm.py +29 -23
- nextrec/models/ranking/masknet.py +91 -29
- nextrec/models/ranking/pnn.py +31 -28
- nextrec/models/ranking/widedeep.py +34 -26
- nextrec/models/ranking/xdeepfm.py +60 -38
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/METADATA +4 -4
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,8 +3,8 @@ Date: create on 09/11/2025
|
|
|
3
3
|
Checkpoint: edit on 24/11/2025
|
|
4
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
|
|
7
|
-
self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
|
|
6
|
+
[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
|
|
7
|
+
self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
|
|
8
8
|
on information and knowledge management. 2019: 1161-1170.
|
|
9
9
|
(https://arxiv.org/abs/1810.11921)
|
|
10
10
|
|
|
@@ -70,29 +70,31 @@ class AutoInt(BaseModel):
|
|
|
70
70
|
@property
|
|
71
71
|
def default_task(self):
|
|
72
72
|
return "binary"
|
|
73
|
-
|
|
74
|
-
def __init__(
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
73
|
+
|
|
74
|
+
def __init__(
|
|
75
|
+
self,
|
|
76
|
+
dense_features: list[DenseFeature],
|
|
77
|
+
sparse_features: list[SparseFeature],
|
|
78
|
+
sequence_features: list[SequenceFeature],
|
|
79
|
+
att_layer_num: int = 3,
|
|
80
|
+
att_embedding_dim: int = 8,
|
|
81
|
+
att_head_num: int = 2,
|
|
82
|
+
att_dropout: float = 0.0,
|
|
83
|
+
att_use_residual: bool = True,
|
|
84
|
+
target: list[str] | None = None,
|
|
85
|
+
task: str | list[str] | None = None,
|
|
86
|
+
optimizer: str = "adam",
|
|
87
|
+
optimizer_params: dict | None = None,
|
|
88
|
+
loss: str | nn.Module | None = "bce",
|
|
89
|
+
loss_params: dict | list[dict] | None = None,
|
|
90
|
+
device: str = "cpu",
|
|
91
|
+
embedding_l1_reg=1e-6,
|
|
92
|
+
dense_l1_reg=1e-5,
|
|
93
|
+
embedding_l2_reg=1e-5,
|
|
94
|
+
dense_l2_reg=1e-4,
|
|
95
|
+
**kwargs,
|
|
96
|
+
):
|
|
97
|
+
|
|
96
98
|
super(AutoInt, self).__init__(
|
|
97
99
|
dense_features=dense_features,
|
|
98
100
|
sparse_features=sparse_features,
|
|
@@ -104,7 +106,7 @@ class AutoInt(BaseModel):
|
|
|
104
106
|
dense_l1_reg=dense_l1_reg,
|
|
105
107
|
embedding_l2_reg=embedding_l2_reg,
|
|
106
108
|
dense_l2_reg=dense_l2_reg,
|
|
107
|
-
**kwargs
|
|
109
|
+
**kwargs,
|
|
108
110
|
)
|
|
109
111
|
|
|
110
112
|
if target is None:
|
|
@@ -113,52 +115,59 @@ class AutoInt(BaseModel):
|
|
|
113
115
|
optimizer_params = {}
|
|
114
116
|
if loss is None:
|
|
115
117
|
loss = "bce"
|
|
116
|
-
|
|
118
|
+
|
|
117
119
|
self.att_layer_num = att_layer_num
|
|
118
120
|
self.att_embedding_dim = att_embedding_dim
|
|
119
|
-
|
|
121
|
+
|
|
120
122
|
# Use sparse and sequence features for interaction
|
|
121
123
|
# **INFO**: this is different from the original paper, we also include dense features
|
|
122
124
|
# if you want to follow the paper strictly, set dense_features=[]
|
|
123
125
|
# or modify the code accordingly
|
|
124
|
-
self.interaction_features = dense_features + sparse_features + sequence_features
|
|
125
|
-
|
|
126
|
+
self.interaction_features = dense_features + sparse_features + sequence_features
|
|
127
|
+
|
|
126
128
|
# All features for embedding
|
|
127
129
|
self.all_features = dense_features + sparse_features + sequence_features
|
|
128
130
|
|
|
129
131
|
# Embedding layer
|
|
130
132
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
131
|
-
|
|
133
|
+
|
|
132
134
|
# Project embeddings to attention embedding dimension
|
|
133
135
|
num_fields = len(self.interaction_features)
|
|
134
|
-
|
|
136
|
+
|
|
135
137
|
# If embeddings have different dimensions, project them to att_embedding_dim
|
|
136
|
-
self.need_projection = not all(
|
|
138
|
+
self.need_projection = not all(
|
|
139
|
+
f.embedding_dim == att_embedding_dim for f in self.interaction_features
|
|
140
|
+
)
|
|
137
141
|
self.projection_layers = None
|
|
138
142
|
if self.need_projection:
|
|
139
|
-
self.projection_layers = nn.ModuleList(
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
143
|
+
self.projection_layers = nn.ModuleList(
|
|
144
|
+
[
|
|
145
|
+
nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
|
|
146
|
+
for f in self.interaction_features
|
|
147
|
+
]
|
|
148
|
+
)
|
|
149
|
+
|
|
144
150
|
# Multi-head self-attention layers
|
|
145
|
-
self.attention_layers = nn.ModuleList(
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
151
|
+
self.attention_layers = nn.ModuleList(
|
|
152
|
+
[
|
|
153
|
+
MultiHeadSelfAttention(
|
|
154
|
+
embedding_dim=att_embedding_dim,
|
|
155
|
+
num_heads=att_head_num,
|
|
156
|
+
dropout=att_dropout,
|
|
157
|
+
use_residual=att_use_residual,
|
|
158
|
+
)
|
|
159
|
+
for _ in range(att_layer_num)
|
|
160
|
+
]
|
|
161
|
+
)
|
|
162
|
+
|
|
154
163
|
# Final prediction layer
|
|
155
164
|
self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
|
|
156
165
|
self.prediction_layer = PredictionLayer(task_type=self.default_task)
|
|
157
166
|
|
|
158
167
|
# Register regularization weights
|
|
159
168
|
self.register_regularization_weights(
|
|
160
|
-
embedding_attr=
|
|
161
|
-
include_modules=[
|
|
169
|
+
embedding_attr="embedding",
|
|
170
|
+
include_modules=["projection_layers", "attention_layers", "fc"],
|
|
162
171
|
)
|
|
163
172
|
|
|
164
173
|
self.compile(
|
|
@@ -172,21 +181,29 @@ class AutoInt(BaseModel):
|
|
|
172
181
|
# Get embeddings field-by-field so mixed dimensions can be projected safely
|
|
173
182
|
field_embeddings = []
|
|
174
183
|
if len(self.interaction_features) == 0:
|
|
175
|
-
raise ValueError(
|
|
184
|
+
raise ValueError(
|
|
185
|
+
"AutoInt requires at least one sparse or sequence feature for interactions."
|
|
186
|
+
)
|
|
176
187
|
for idx, feature in enumerate(self.interaction_features):
|
|
177
188
|
feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
|
|
178
189
|
feature_emb = feature_emb.squeeze(1) # [B, embedding_dim]
|
|
179
190
|
if self.need_projection and self.projection_layers is not None:
|
|
180
191
|
feature_emb = self.projection_layers[idx](feature_emb)
|
|
181
|
-
field_embeddings.append(
|
|
192
|
+
field_embeddings.append(
|
|
193
|
+
feature_emb.unsqueeze(1)
|
|
194
|
+
) # [B, 1, att_embedding_dim or original_dim]
|
|
182
195
|
embeddings = torch.cat(field_embeddings, dim=1)
|
|
183
|
-
|
|
196
|
+
|
|
184
197
|
# Apply multi-head self-attention layers
|
|
185
198
|
attention_output = embeddings
|
|
186
199
|
for att_layer in self.attention_layers:
|
|
187
|
-
attention_output = att_layer(
|
|
188
|
-
|
|
200
|
+
attention_output = att_layer(
|
|
201
|
+
attention_output
|
|
202
|
+
) # [B, num_fields, att_embedding_dim]
|
|
203
|
+
|
|
189
204
|
# Flatten and predict
|
|
190
|
-
attention_output_flat = attention_output.flatten(
|
|
205
|
+
attention_output_flat = attention_output.flatten(
|
|
206
|
+
start_dim=1
|
|
207
|
+
) # [B, num_fields * att_embedding_dim]
|
|
191
208
|
y = self.fc(attention_output_flat) # [B, 1]
|
|
192
209
|
return self.prediction_layer(y)
|
nextrec/models/ranking/dcn.py
CHANGED
|
@@ -15,21 +15,27 @@ from nextrec.basic.model import BaseModel
|
|
|
15
15
|
from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
|
|
16
16
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
17
17
|
|
|
18
|
+
|
|
18
19
|
class CrossNetwork(nn.Module):
|
|
19
20
|
"""Stacked Cross Layers from DCN (Wang et al., 2017)."""
|
|
20
21
|
|
|
21
22
|
def __init__(self, input_dim, num_layers):
|
|
22
23
|
super().__init__()
|
|
23
24
|
self.num_layers = num_layers
|
|
24
|
-
self.w = torch.nn.ModuleList(
|
|
25
|
-
|
|
25
|
+
self.w = torch.nn.ModuleList(
|
|
26
|
+
[torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
|
|
27
|
+
)
|
|
28
|
+
self.b = torch.nn.ParameterList(
|
|
29
|
+
[torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
|
|
30
|
+
)
|
|
26
31
|
|
|
27
32
|
def forward(self, x):
|
|
28
33
|
x0 = x
|
|
29
34
|
for i in range(self.num_layers):
|
|
30
35
|
xw = self.w[i](x)
|
|
31
36
|
x = x0 * xw + self.b[i] + x
|
|
32
|
-
return x
|
|
37
|
+
return x # [batch_size, input_dim]
|
|
38
|
+
|
|
33
39
|
|
|
34
40
|
class DCN(BaseModel):
|
|
35
41
|
@property
|
|
@@ -40,25 +46,27 @@ class DCN(BaseModel):
|
|
|
40
46
|
def default_task(self):
|
|
41
47
|
return "binary"
|
|
42
48
|
|
|
43
|
-
def __init__(
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
49
|
+
def __init__(
|
|
50
|
+
self,
|
|
51
|
+
dense_features: list[DenseFeature],
|
|
52
|
+
sparse_features: list[SparseFeature],
|
|
53
|
+
sequence_features: list[SequenceFeature],
|
|
54
|
+
cross_num: int = 3,
|
|
55
|
+
mlp_params: dict | None = None,
|
|
56
|
+
target: list[str] = [],
|
|
57
|
+
task: str | list[str] | None = None,
|
|
58
|
+
optimizer: str = "adam",
|
|
59
|
+
optimizer_params: dict = {},
|
|
60
|
+
loss: str | nn.Module | None = "bce",
|
|
61
|
+
loss_params: dict | list[dict] | None = None,
|
|
62
|
+
device: str = "cpu",
|
|
63
|
+
embedding_l1_reg=1e-6,
|
|
64
|
+
dense_l1_reg=1e-5,
|
|
65
|
+
embedding_l2_reg=1e-5,
|
|
66
|
+
dense_l2_reg=1e-4,
|
|
67
|
+
**kwargs,
|
|
68
|
+
):
|
|
69
|
+
|
|
62
70
|
super(DCN, self).__init__(
|
|
63
71
|
dense_features=dense_features,
|
|
64
72
|
sparse_features=sparse_features,
|
|
@@ -70,13 +78,13 @@ class DCN(BaseModel):
|
|
|
70
78
|
dense_l1_reg=dense_l1_reg,
|
|
71
79
|
embedding_l2_reg=embedding_l2_reg,
|
|
72
80
|
dense_l2_reg=dense_l2_reg,
|
|
73
|
-
**kwargs
|
|
81
|
+
**kwargs,
|
|
74
82
|
)
|
|
75
83
|
|
|
76
84
|
self.loss = loss
|
|
77
85
|
if self.loss is None:
|
|
78
86
|
self.loss = "bce"
|
|
79
|
-
|
|
87
|
+
|
|
80
88
|
# All features
|
|
81
89
|
self.all_features = dense_features + sparse_features + sequence_features
|
|
82
90
|
|
|
@@ -84,20 +92,30 @@ class DCN(BaseModel):
|
|
|
84
92
|
self.embedding = EmbeddingLayer(features=self.all_features)
|
|
85
93
|
|
|
86
94
|
# Calculate input dimension
|
|
87
|
-
emb_dim_total = sum(
|
|
88
|
-
|
|
95
|
+
emb_dim_total = sum(
|
|
96
|
+
[
|
|
97
|
+
f.embedding_dim
|
|
98
|
+
for f in self.all_features
|
|
99
|
+
if not isinstance(f, DenseFeature)
|
|
100
|
+
]
|
|
101
|
+
)
|
|
102
|
+
dense_input_dim = sum(
|
|
103
|
+
[getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
|
|
104
|
+
)
|
|
89
105
|
input_dim = emb_dim_total + dense_input_dim
|
|
90
|
-
|
|
106
|
+
|
|
91
107
|
# Cross Network
|
|
92
108
|
self.cross_network = CrossNetwork(input_dim=input_dim, num_layers=cross_num)
|
|
93
|
-
|
|
109
|
+
|
|
94
110
|
# Deep Network (optional)
|
|
95
111
|
if mlp_params is not None:
|
|
96
112
|
self.use_dnn = True
|
|
97
113
|
self.mlp = MLP(input_dim=input_dim, **mlp_params)
|
|
98
114
|
deep_dim = self.mlp.output_dim
|
|
99
115
|
# Final layer combines cross and deep
|
|
100
|
-
self.final_layer = nn.Linear(
|
|
116
|
+
self.final_layer = nn.Linear(
|
|
117
|
+
input_dim + deep_dim, 1
|
|
118
|
+
) # + deep_dim for MLP output
|
|
101
119
|
else:
|
|
102
120
|
self.use_dnn = False
|
|
103
121
|
# Final layer only uses cross network output
|
|
@@ -107,8 +125,8 @@ class DCN(BaseModel):
|
|
|
107
125
|
|
|
108
126
|
# Register regularization weights
|
|
109
127
|
self.register_regularization_weights(
|
|
110
|
-
embedding_attr=
|
|
111
|
-
include_modules=[
|
|
128
|
+
embedding_attr="embedding",
|
|
129
|
+
include_modules=["cross_network", "mlp", "final_layer"],
|
|
112
130
|
)
|
|
113
131
|
|
|
114
132
|
self.compile(
|
|
@@ -121,18 +139,20 @@ class DCN(BaseModel):
|
|
|
121
139
|
def forward(self, x):
|
|
122
140
|
# Get all embeddings and flatten
|
|
123
141
|
input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
|
|
124
|
-
|
|
142
|
+
|
|
125
143
|
# Cross Network
|
|
126
144
|
cross_output = self.cross_network(input_flat) # [B, input_dim]
|
|
127
|
-
|
|
145
|
+
|
|
128
146
|
if self.use_dnn:
|
|
129
147
|
# Deep Network
|
|
130
148
|
deep_output = self.mlp(input_flat) # [B, 1]
|
|
131
149
|
# Concatenate cross and deep
|
|
132
|
-
combined = torch.cat(
|
|
150
|
+
combined = torch.cat(
|
|
151
|
+
[cross_output, deep_output], dim=-1
|
|
152
|
+
) # [B, input_dim + 1]
|
|
133
153
|
else:
|
|
134
154
|
combined = cross_output
|
|
135
|
-
|
|
155
|
+
|
|
136
156
|
# Final prediction
|
|
137
157
|
y = self.final_layer(combined)
|
|
138
158
|
return self.prediction_layer(y)
|
nextrec/models/ranking/dcn_v2.py
CHANGED
|
@@ -5,25 +5,30 @@ Date: create on 09/11/2025
|
|
|
5
5
|
import torch
|
|
6
6
|
import torch.nn as nn
|
|
7
7
|
|
|
8
|
-
from nextrec.basic.model import BaseModel
|
|
9
|
-
from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
|
|
10
|
-
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
11
8
|
|
|
12
9
|
class CrossNetV2(nn.Module):
|
|
13
10
|
"""Vector-wise cross network proposed in DCN V2 (Wang et al., 2021)."""
|
|
11
|
+
|
|
14
12
|
def __init__(self, input_dim, num_layers):
|
|
15
13
|
super().__init__()
|
|
16
14
|
self.num_layers = num_layers
|
|
17
|
-
self.w = torch.nn.ModuleList(
|
|
18
|
-
|
|
19
|
-
|
|
15
|
+
self.w = torch.nn.ModuleList(
|
|
16
|
+
[
|
|
17
|
+
torch.nn.Linear(input_dim, input_dim, bias=False)
|
|
18
|
+
for _ in range(num_layers)
|
|
19
|
+
]
|
|
20
|
+
)
|
|
21
|
+
self.b = torch.nn.ParameterList(
|
|
22
|
+
[torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
|
|
23
|
+
)
|
|
20
24
|
|
|
21
25
|
def forward(self, x):
|
|
22
26
|
x0 = x
|
|
23
27
|
for i in range(self.num_layers):
|
|
24
|
-
x =x0*self.w[i](x) + self.b[i] + x
|
|
28
|
+
x = x0 * self.w[i](x) + self.b[i] + x
|
|
25
29
|
return x
|
|
26
|
-
|
|
30
|
+
|
|
31
|
+
|
|
27
32
|
class CrossNetMix(nn.Module):
|
|
28
33
|
"""Mixture of low-rank cross experts from DCN V2 (Wang et al., 2021)."""
|
|
29
34
|
|
|
@@ -33,18 +38,46 @@ class CrossNetMix(nn.Module):
|
|
|
33
38
|
self.num_experts = num_experts
|
|
34
39
|
|
|
35
40
|
# U: (input_dim, low_rank)
|
|
36
|
-
self.u_list = torch.nn.ParameterList(
|
|
37
|
-
|
|
41
|
+
self.u_list = torch.nn.ParameterList(
|
|
42
|
+
[
|
|
43
|
+
nn.Parameter(
|
|
44
|
+
nn.init.xavier_normal_(
|
|
45
|
+
torch.empty(num_experts, input_dim, low_rank)
|
|
46
|
+
)
|
|
47
|
+
)
|
|
48
|
+
for i in range(self.num_layers)
|
|
49
|
+
]
|
|
50
|
+
)
|
|
38
51
|
# V: (input_dim, low_rank)
|
|
39
|
-
self.v_list = torch.nn.ParameterList(
|
|
40
|
-
|
|
52
|
+
self.v_list = torch.nn.ParameterList(
|
|
53
|
+
[
|
|
54
|
+
nn.Parameter(
|
|
55
|
+
nn.init.xavier_normal_(
|
|
56
|
+
torch.empty(num_experts, input_dim, low_rank)
|
|
57
|
+
)
|
|
58
|
+
)
|
|
59
|
+
for i in range(self.num_layers)
|
|
60
|
+
]
|
|
61
|
+
)
|
|
41
62
|
# C: (low_rank, low_rank)
|
|
42
|
-
self.c_list = torch.nn.ParameterList(
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
63
|
+
self.c_list = torch.nn.ParameterList(
|
|
64
|
+
[
|
|
65
|
+
nn.Parameter(
|
|
66
|
+
nn.init.xavier_normal_(torch.empty(num_experts, low_rank, low_rank))
|
|
67
|
+
)
|
|
68
|
+
for i in range(self.num_layers)
|
|
69
|
+
]
|
|
70
|
+
)
|
|
71
|
+
self.gating = nn.ModuleList(
|
|
72
|
+
[nn.Linear(input_dim, 1, bias=False) for i in range(self.num_experts)]
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
self.bias = torch.nn.ParameterList(
|
|
76
|
+
[
|
|
77
|
+
nn.Parameter(nn.init.zeros_(torch.empty(input_dim, 1)))
|
|
78
|
+
for i in range(self.num_layers)
|
|
79
|
+
]
|
|
80
|
+
)
|
|
48
81
|
|
|
49
82
|
def forward(self, x):
|
|
50
83
|
x_0 = x.unsqueeze(2) # (bs, in_features, 1)
|
|
@@ -59,7 +92,9 @@ class CrossNetMix(nn.Module):
|
|
|
59
92
|
|
|
60
93
|
# (2) E(x_l)
|
|
61
94
|
# project the input x_l to $\mathbb{R}^{r}$
|
|
62
|
-
v_x = torch.matmul(
|
|
95
|
+
v_x = torch.matmul(
|
|
96
|
+
self.v_list[i][expert_id].t(), x_l
|
|
97
|
+
) # (bs, low_rank, 1)
|
|
63
98
|
|
|
64
99
|
# nonlinear activation in low rank space
|
|
65
100
|
v_x = torch.tanh(v_x)
|
|
@@ -67,7 +102,9 @@ class CrossNetMix(nn.Module):
|
|
|
67
102
|
v_x = torch.tanh(v_x)
|
|
68
103
|
|
|
69
104
|
# project back to $\mathbb{R}^{d}$
|
|
70
|
-
uv_x = torch.matmul(
|
|
105
|
+
uv_x = torch.matmul(
|
|
106
|
+
self.u_list[i][expert_id], v_x
|
|
107
|
+
) # (bs, in_features, 1)
|
|
71
108
|
|
|
72
109
|
dot_ = uv_x + self.bias[i]
|
|
73
110
|
dot_ = x_0 * dot_ # Hadamard-product
|
|
@@ -75,10 +112,14 @@ class CrossNetMix(nn.Module):
|
|
|
75
112
|
output_of_experts.append(dot_.squeeze(2))
|
|
76
113
|
|
|
77
114
|
# (3) mixture of low-rank experts
|
|
78
|
-
output_of_experts = torch.stack(
|
|
79
|
-
|
|
115
|
+
output_of_experts = torch.stack(
|
|
116
|
+
output_of_experts, 2
|
|
117
|
+
) # (bs, in_features, num_experts)
|
|
118
|
+
gating_score_experts = torch.stack(
|
|
119
|
+
gating_score_experts, 1
|
|
120
|
+
) # (bs, num_experts, 1)
|
|
80
121
|
moe_out = torch.matmul(output_of_experts, gating_score_experts.softmax(1))
|
|
81
122
|
x_l = moe_out + x_l # (bs, in_features, 1)
|
|
82
123
|
|
|
83
124
|
x_l = x_l.squeeze() # (bs, in_features)
|
|
84
|
-
return x_l
|
|
125
|
+
return x_l
|
nextrec/models/ranking/deepfm.py
CHANGED
|
@@ -43,13 +43,13 @@ embedding,无需手工构造交叉特征即可端到端训练,常用于 CTR/
|
|
|
43
43
|
- CTR/CVR 任务的常用强基线
|
|
44
44
|
"""
|
|
45
45
|
|
|
46
|
-
import torch
|
|
47
46
|
import torch.nn as nn
|
|
48
47
|
|
|
49
48
|
from nextrec.basic.model import BaseModel
|
|
50
49
|
from nextrec.basic.layers import FM, LR, EmbeddingLayer, MLP, PredictionLayer
|
|
51
50
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
|
|
52
51
|
|
|
52
|
+
|
|
53
53
|
class DeepFM(BaseModel):
|
|
54
54
|
@property
|
|
55
55
|
def model_name(self):
|
|
@@ -59,23 +59,26 @@ class DeepFM(BaseModel):
|
|
|
59
59
|
def default_task(self):
|
|
60
60
|
return "binary"
|
|
61
61
|
|
|
62
|
-
def __init__(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
62
|
+
def __init__(
|
|
63
|
+
self,
|
|
64
|
+
dense_features: list[DenseFeature] | list = [],
|
|
65
|
+
sparse_features: list[SparseFeature] | list = [],
|
|
66
|
+
sequence_features: list[SequenceFeature] | list = [],
|
|
67
|
+
mlp_params: dict = {},
|
|
68
|
+
target: list[str] | str = [],
|
|
69
|
+
task: str | list[str] | None = None,
|
|
70
|
+
optimizer: str = "adam",
|
|
71
|
+
optimizer_params: dict = {},
|
|
72
|
+
loss: str | nn.Module | None = "bce",
|
|
73
|
+
loss_params: dict | list[dict] | None = None,
|
|
74
|
+
device: str = "cpu",
|
|
75
|
+
embedding_l1_reg=1e-6,
|
|
76
|
+
dense_l1_reg=1e-5,
|
|
77
|
+
embedding_l2_reg=1e-5,
|
|
78
|
+
dense_l2_reg=1e-4,
|
|
79
|
+
**kwargs,
|
|
80
|
+
):
|
|
81
|
+
|
|
79
82
|
super(DeepFM, self).__init__(
|
|
80
83
|
dense_features=dense_features,
|
|
81
84
|
sparse_features=sparse_features,
|
|
@@ -87,13 +90,13 @@ class DeepFM(BaseModel):
|
|
|
87
90
|
dense_l1_reg=dense_l1_reg,
|
|
88
91
|
embedding_l2_reg=embedding_l2_reg,
|
|
89
92
|
dense_l2_reg=dense_l2_reg,
|
|
90
|
-
**kwargs
|
|
93
|
+
**kwargs,
|
|
91
94
|
)
|
|
92
95
|
|
|
93
96
|
self.loss = loss
|
|
94
97
|
if self.loss is None:
|
|
95
98
|
self.loss = "bce"
|
|
96
|
-
|
|
99
|
+
|
|
97
100
|
self.fm_features = sparse_features + sequence_features
|
|
98
101
|
self.deep_features = dense_features + sparse_features + sequence_features
|
|
99
102
|
self.embedding = EmbeddingLayer(features=self.deep_features)
|
|
@@ -107,8 +110,15 @@ class DeepFM(BaseModel):
|
|
|
107
110
|
self.prediction_layer = PredictionLayer(task_type=self.default_task)
|
|
108
111
|
|
|
109
112
|
# Register regularization weights
|
|
110
|
-
self.register_regularization_weights(
|
|
111
|
-
|
|
113
|
+
self.register_regularization_weights(
|
|
114
|
+
embedding_attr="embedding", include_modules=["linear", "mlp"]
|
|
115
|
+
)
|
|
116
|
+
self.compile(
|
|
117
|
+
optimizer=optimizer,
|
|
118
|
+
optimizer_params=optimizer_params,
|
|
119
|
+
loss=loss,
|
|
120
|
+
loss_params=loss_params,
|
|
121
|
+
)
|
|
112
122
|
|
|
113
123
|
def forward(self, x):
|
|
114
124
|
input_deep = self.embedding(x=x, features=self.deep_features, squeeze_dim=True)
|