nextrec 0.3.6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +244 -113
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1373 -443
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +42 -24
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +303 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +106 -40
  23. nextrec/models/match/dssm.py +82 -69
  24. nextrec/models/match/dssm_v2.py +72 -58
  25. nextrec/models/match/mind.py +175 -108
  26. nextrec/models/match/sdm.py +104 -88
  27. nextrec/models/match/youtube_dnn.py +73 -60
  28. nextrec/models/multi_task/esmm.py +53 -39
  29. nextrec/models/multi_task/mmoe.py +70 -47
  30. nextrec/models/multi_task/ple.py +107 -50
  31. nextrec/models/multi_task/poso.py +121 -41
  32. nextrec/models/multi_task/share_bottom.py +54 -38
  33. nextrec/models/ranking/afm.py +172 -45
  34. nextrec/models/ranking/autoint.py +84 -61
  35. nextrec/models/ranking/dcn.py +59 -42
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +36 -26
  38. nextrec/models/ranking/dien.py +158 -102
  39. nextrec/models/ranking/din.py +88 -60
  40. nextrec/models/ranking/fibinet.py +55 -35
  41. nextrec/models/ranking/fm.py +32 -26
  42. nextrec/models/ranking/masknet.py +95 -34
  43. nextrec/models/ranking/pnn.py +34 -31
  44. nextrec/models/ranking/widedeep.py +37 -29
  45. nextrec/models/ranking/xdeepfm.py +63 -41
  46. nextrec/utils/__init__.py +61 -32
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +52 -12
  49. nextrec/utils/distributed.py +141 -0
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +531 -0
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.3.6.dist-info/RECORD +0 -64
  61. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/afm.py
@@ -1,17 +1,46 @@
  """
  Date: create on 09/11/2025
- Author:
- Yang Zhou,zyaztec@gmail.com
+ Checkpoint: edit on 06/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
  Reference:
- [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
- feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
+ [1] Xiao J, Ye H, He X, et al. Attentional factorization machines: Learning the weight of
+ feature interactions via attention networks[C]//IJCAI. 2017: 3119-3125.
+
+ Attentional Factorization Machine (AFM) builds on FM by learning an importance
+ weight for every second-order interaction instead of treating all pairs equally.
+ It retains FM's linear (first-order) component for sparsity-friendly modeling,
+ while using an attention network to reweight the element-wise product of field
+ embeddings before aggregation.
+
+ In each forward pass:
+ (1) Embed each field and compute pairwise element-wise products v_i ⊙ v_j
+ (2) Pass interactions through an attention MLP (ReLU + projection) to score them
+ (3) Softmax-normalize scores to obtain interaction weights
+ (4) Weighted sum of interactions -> linear projection -> add FM first-order term
+
+ Key Advantages:
+ - Learns which feature pairs contribute most via attention weights
+ - Keeps FM efficiency and interpretability by preserving first-order terms
+ - Softmax-normalized reweighting reduces noise from uninformative interactions
+
+ AFM introduces attention over FM's second-order interactions, learning an
+ importance weight for each feature pair while keeping FM's first-order term,
+ which preserves sparsity-friendliness and interpretability. The procedure:
+ (1) Embed each field and compute the element-wise product v_i ⊙ v_j for every feature pair
+ (2) Score each interaction with an attention MLP (ReLU + linear projection)
+ (3) Softmax-normalize the interaction scores into weights
+ (4) Sum the weighted interactions, apply a linear projection, and add the first-order term
+
+ Key advantages:
+ - Attention makes explicit which feature pairs matter most
+ - Retains FM's efficiency and interpretability
+ - Softmax normalization dampens the influence of noisy interactions
  """
 
  import torch
  import torch.nn as nn
 
  from nextrec.basic.model import BaseModel
- from nextrec.basic.layers import EmbeddingLayer, LR, PredictionLayer
+ from nextrec.basic.layers import EmbeddingLayer, PredictionLayer, InputMask
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
 
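The four docstring steps compress into a few lines of tensor code. A minimal, self-contained sketch of the interaction path (plain torch only; dimensions and layer names here are illustrative, not the package's API):

    import torch
    import torch.nn as nn

    B, F, D = 4, 3, 8                          # batch, fields, embedding dim
    field_emb = torch.randn(B, F, D)           # (1) embedded fields

    # (1) element-wise products v_i * v_j for every field pair
    pairs = [field_emb[:, i] * field_emb[:, j]
             for i in range(F - 1) for j in range(i + 1, F)]
    pair_tensor = torch.stack(pairs, dim=1)    # [B, F*(F-1)/2, D]

    # (2) attention MLP: ReLU hidden layer, then a scalar score per pair
    attn_hidden = nn.Linear(D, 32)
    attn_score = nn.Linear(32, 1, bias=False)
    scores = attn_score(torch.relu(attn_hidden(pair_tensor)))  # [B, num_pairs, 1]

    # (3) softmax over the pair axis gives interaction weights
    weights = torch.softmax(scores, dim=1)

    # (4) weighted sum, project to a logit; the first-order term is added separately
    pooled = (weights * pair_tensor).sum(dim=1)          # [B, D]
    logit = nn.Linear(D, 1, bias=False)(pooled)          # [B, 1]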
@@ -21,69 +50,113 @@ class AFM(BaseModel):
          return "AFM"
 
      @property
-     def task_type(self):
+     def default_task(self):
          return "binary"
-
-     def __init__(self,
-                  dense_features: list[DenseFeature] | list = [],
-                  sparse_features: list[SparseFeature] | list = [],
-                  sequence_features: list[SequenceFeature] | list = [],
-                  attention_dim: int = 32,
-                  attention_dropout: float = 0.0,
-                  target: list[str] | list = [],
-                  optimizer: str = "adam",
-                  optimizer_params: dict = {},
-                  loss: str | nn.Module | None = "bce",
-                  loss_params: dict | list[dict] | None = None,
-                  device: str = 'cpu',
-                  embedding_l1_reg=1e-6,
-                  dense_l1_reg=1e-5,
-                  embedding_l2_reg=1e-5,
-                  dense_l2_reg=1e-4, **kwargs):
-
+
+     def __init__(
+         self,
+         dense_features: list[DenseFeature] | list = [],
+         sparse_features: list[SparseFeature] | list = [],
+         sequence_features: list[SequenceFeature] | list = [],
+         attention_dim: int = 32,
+         attention_dropout: float = 0.0,
+         target: list[str] | list = [],
+         task: str | list[str] | None = None,
+         optimizer: str = "adam",
+         optimizer_params: dict = {},
+         loss: str | nn.Module | None = "bce",
+         loss_params: dict | list[dict] | None = None,
+         device: str = "cpu",
+         embedding_l1_reg=1e-6,
+         dense_l1_reg=1e-5,
+         embedding_l2_reg=1e-5,
+         dense_l2_reg=1e-4,
+         **kwargs,
+     ):
+
          super(AFM, self).__init__(
              dense_features=dense_features,
              sparse_features=sparse_features,
              sequence_features=sequence_features,
              target=target,
-             task=self.task_type,
+             task=task or self.default_task,
              device=device,
              embedding_l1_reg=embedding_l1_reg,
              dense_l1_reg=dense_l1_reg,
              embedding_l2_reg=embedding_l2_reg,
              dense_l2_reg=dense_l2_reg,
-             early_stop_patience=20,
-             **kwargs
+             **kwargs,
          )
 
-         self.loss = loss
-         if self.loss is None:
-             self.loss = "bce"
-
+         if target is None:
+             target = []
+         if optimizer_params is None:
+             optimizer_params = {}
+         if loss is None:
+             loss = "bce"
+
          self.fm_features = sparse_features + sequence_features
          if len(self.fm_features) < 2:
-             raise ValueError("AFM requires at least two sparse/sequence features to build pairwise interactions.")
+             raise ValueError(
+                 "AFM requires at least two sparse/sequence features to build pairwise interactions."
+             )
 
-         # Assume uniform embedding dimension across FM fields
+         # make sure all embedding dimensions are the same for FM features
          self.embedding_dim = self.fm_features[0].embedding_dim
          if any(f.embedding_dim != self.embedding_dim for f in self.fm_features):
-             raise ValueError("All FM features must share the same embedding_dim for AFM.")
-
-         self.embedding = EmbeddingLayer(features=self.fm_features)
+             raise ValueError(
+                 "All FM features must share the same embedding_dim for AFM."
+             )
+
+         self.embedding = EmbeddingLayer(
+             features=self.fm_features
+         )  # [Batch, Field, Dim]
+
+         # First-order terms: dense linear + one-hot embeddings
+         self.dense_features = list(dense_features)
+         dense_input_dim = sum([f.input_dim for f in self.dense_features])
+         self.linear_dense = (
+             nn.Linear(dense_input_dim, 1, bias=True) if dense_input_dim > 0 else None
+         )
 
-         fm_input_dim = sum([f.embedding_dim for f in self.fm_features])
-         self.linear = LR(fm_input_dim)
+         # First-order term: sparse/sequence features one-hot
+         # **INFO**: the source paper does not include sequence features in its experiments,
+         # but we implement them here for completeness. If you want to follow the paper
+         # strictly, remove sequence features from fm_features.
+         self.first_order_embeddings = nn.ModuleDict()
+         for feature in self.fm_features:
+             if (
+                 feature.embedding_name in self.first_order_embeddings
+             ):  # shared embedding
+                 continue
+             emb = nn.Embedding(
+                 num_embeddings=feature.vocab_size,
+                 embedding_dim=1,
+                 padding_idx=feature.padding_idx,
+             )  # equivalent to a one-hot encoding weight
+             # nn.init.zeros_(emb.weight)
+             self.first_order_embeddings[feature.embedding_name] = emb
 
          self.attention_linear = nn.Linear(self.embedding_dim, attention_dim)
          self.attention_p = nn.Linear(attention_dim, 1, bias=False)
          self.attention_dropout = nn.Dropout(attention_dropout)
          self.output_projection = nn.Linear(self.embedding_dim, 1, bias=False)
-         self.prediction_layer = PredictionLayer(task_type=self.task_type)
+         self.prediction_layer = PredictionLayer(task_type=self.default_task)
+         self.input_mask = InputMask()
 
          # Register regularization weights
          self.register_regularization_weights(
-             embedding_attr='embedding',
-             include_modules=['linear', 'attention_linear', 'attention_p', 'output_projection']
+             embedding_attr="embedding",
+             include_modules=[
+                 "linear_dense",
+                 "attention_linear",
+                 "attention_p",
+                 "output_projection",
+             ],
+         )
+         # add first-order embeddings to the embedding regularization list
+         self.embedding_params.extend(
+             emb.weight for emb in self.first_order_embeddings.values()
          )
 
          self.compile(
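A note on the first-order path that replaces the old `LR` head: an `nn.Embedding` with `embedding_dim=1` is exactly a linear layer applied to the one-hot encoding of an id, which is what the "equivalent to a one-hot encoding weight" comment means. A tiny sketch (vocab size and ids made up for illustration):

    import torch
    import torch.nn as nn

    vocab_size = 10
    w = nn.Embedding(vocab_size, 1)        # one scalar weight per category
    ids = torch.tensor([2, 7])             # [B]

    by_lookup = w(ids)                                                 # [B, 1]
    by_onehot = torch.nn.functional.one_hot(ids, vocab_size).float() @ w.weight
    assert torch.allclose(by_lookup, by_onehot)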
@@ -94,11 +167,65 @@ class AFM(BaseModel):
          )
 
      def forward(self, x):
-         field_emb = self.embedding(x=x, features=self.fm_features, squeeze_dim=False)  # [B, F, D]
-         input_linear = field_emb.flatten(start_dim=1)
-         y_linear = self.linear(input_linear)
+         field_emb = self.embedding(
+             x=x, features=self.fm_features, squeeze_dim=False
+         )  # [B, F, D]
+         batch_size = field_emb.size(0)
+         y_linear = torch.zeros(batch_size, 1, device=field_emb.device)
+
+         # First-order dense part
+         if self.linear_dense is not None:
+             dense_inputs = [
+                 x[f.name].float().view(batch_size, -1) for f in self.dense_features
+             ]
+             dense_stack = torch.cat(dense_inputs, dim=1) if dense_inputs else None
+             if dense_stack is not None:
+                 y_linear = y_linear + self.linear_dense(dense_stack)
+
+         # First-order sparse/sequence part
+         first_order_terms = []
+         for feature in self.fm_features:
+             emb = self.first_order_embeddings[feature.embedding_name]
+             if isinstance(feature, SparseFeature):
+                 term = emb(x[feature.name].long())  # [B, 1]
+             else:  # SequenceFeature
+                 seq_input = x[feature.name].long()  # [B, L]
+                 if feature.max_len is not None and seq_input.size(1) > feature.max_len:
+                     seq_input = seq_input[:, -feature.max_len :]
+                 mask = self.input_mask(x, feature, seq_input).squeeze(1)  # [B, L]
+                 seq_weight = emb(seq_input).squeeze(-1)  # [B, L]
+                 term = (seq_weight * mask).sum(dim=1, keepdim=True)  # [B, 1]
+             first_order_terms.append(term)
+         if first_order_terms:
+             y_linear = y_linear + torch.sum(
+                 torch.cat(first_order_terms, dim=1), dim=1, keepdim=True
+             )
 
          interactions = []
+         feature_values = []
+         for feature in self.fm_features:
+             value = x.get(f"{feature.name}_value")
+             if value is not None:
+                 value = value.float()
+                 if value.dim() == 1:
+                     value = value.unsqueeze(-1)
+             else:
+                 if isinstance(feature, SequenceFeature):
+                     seq_input = x[feature.name].long()
+                     if (
+                         feature.max_len is not None
+                         and seq_input.size(1) > feature.max_len
+                     ):
+                         seq_input = seq_input[:, -feature.max_len :]
+                     value = self.input_mask(x, feature, seq_input).sum(dim=2)  # [B, 1]
+                 else:
+                     value = torch.ones(batch_size, 1, device=field_emb.device)
+             feature_values.append(value)
+         feature_values_tensor = torch.cat(feature_values, dim=1).unsqueeze(
+             -1
+         )  # [B, F, 1]
+         field_emb = field_emb * feature_values_tensor
+
          num_fields = field_emb.shape[1]
          for i in range(num_fields - 1):
              vi = field_emb[:, i, :]
107
234
  interactions.append(vi * vj)
108
235
 
109
236
  pair_tensor = torch.stack(interactions, dim=1) # [B, num_pairs, D]
110
- attention_scores = torch.tanh(self.attention_linear(pair_tensor))
237
+ attention_scores = torch.relu(self.attention_linear(pair_tensor))
111
238
  attention_scores = self.attention_p(attention_scores) # [B, num_pairs, 1]
112
239
  attention_weights = torch.softmax(attention_scores, dim=1)
113
240
 
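The tanh-to-relu switch matches the attention network as defined in the AFM paper, which scores each pairwise interaction with a ReLU hidden layer before softmax normalization:

    a'_{ij} = \mathbf{p}^{\top} \, \mathrm{ReLU}\big(\mathbf{W}(v_i \odot v_j) + \mathbf{b}\big),
    \qquad
    a_{ij} = \frac{\exp(a'_{ij})}{\sum_{(m,n)} \exp(a'_{mn})}

Here \mathbf{W} and \mathbf{p} correspond to `attention_linear` and `attention_p` in the code.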
nextrec/models/ranking/autoint.py
@@ -3,8 +3,8 @@ Date: create on 09/11/2025
  Checkpoint: edit on 24/11/2025
  Author: Yang Zhou, zyaztec@gmail.com
  Reference:
- [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
- self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+ [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+ self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
  on information and knowledge management. 2019: 1161-1170.
  (https://arxiv.org/abs/1810.11921)
 
@@ -68,91 +68,106 @@ class AutoInt(BaseModel):
          return "AutoInt"
 
      @property
-     def task_type(self):
+     def default_task(self):
          return "binary"
-
-     def __init__(self,
-                  dense_features: list[DenseFeature],
-                  sparse_features: list[SparseFeature],
-                  sequence_features: list[SequenceFeature],
-                  att_layer_num: int = 3,
-                  att_embedding_dim: int = 8,
-                  att_head_num: int = 2,
-                  att_dropout: float = 0.0,
-                  att_use_residual: bool = True,
-                  target: list[str] = [],
-                  optimizer: str = "adam",
-                  optimizer_params: dict = {},
-                  loss: str | nn.Module | None = "bce",
-                  loss_params: dict | list[dict] | None = None,
-                  device: str = 'cpu',
-                  embedding_l1_reg=1e-6,
-                  dense_l1_reg=1e-5,
-                  embedding_l2_reg=1e-5,
-                  dense_l2_reg=1e-4,
-                  **kwargs):
-
+
+     def __init__(
+         self,
+         dense_features: list[DenseFeature],
+         sparse_features: list[SparseFeature],
+         sequence_features: list[SequenceFeature],
+         att_layer_num: int = 3,
+         att_embedding_dim: int = 8,
+         att_head_num: int = 2,
+         att_dropout: float = 0.0,
+         att_use_residual: bool = True,
+         target: list[str] | None = None,
+         task: str | list[str] | None = None,
+         optimizer: str = "adam",
+         optimizer_params: dict | None = None,
+         loss: str | nn.Module | None = "bce",
+         loss_params: dict | list[dict] | None = None,
+         device: str = "cpu",
+         embedding_l1_reg=1e-6,
+         dense_l1_reg=1e-5,
+         embedding_l2_reg=1e-5,
+         dense_l2_reg=1e-4,
+         **kwargs,
+     ):
+
          super(AutoInt, self).__init__(
              dense_features=dense_features,
              sparse_features=sparse_features,
              sequence_features=sequence_features,
              target=target,
-             task=self.task_type,
+             task=task or self.default_task,
              device=device,
              embedding_l1_reg=embedding_l1_reg,
              dense_l1_reg=dense_l1_reg,
              embedding_l2_reg=embedding_l2_reg,
              dense_l2_reg=dense_l2_reg,
-             early_stop_patience=20,
-             **kwargs
+             **kwargs,
          )
 
-         self.loss = loss
-         if self.loss is None:
-             self.loss = "bce"
-
+         if target is None:
+             target = []
+         if optimizer_params is None:
+             optimizer_params = {}
+         if loss is None:
+             loss = "bce"
+
          self.att_layer_num = att_layer_num
          self.att_embedding_dim = att_embedding_dim
-
+
          # Use sparse and sequence features for interaction
-         self.interaction_features = dense_features + sparse_features + sequence_features
-
+         # **INFO**: this differs from the original paper in that we also include dense
+         # features. If you want to follow the paper strictly, set dense_features=[]
+         # or modify the code accordingly.
+         self.interaction_features = dense_features + sparse_features + sequence_features
+
          # All features for embedding
          self.all_features = dense_features + sparse_features + sequence_features
 
          # Embedding layer
          self.embedding = EmbeddingLayer(features=self.all_features)
-
+
          # Project embeddings to attention embedding dimension
          num_fields = len(self.interaction_features)
-
+
          # If embeddings have different dimensions, project them to att_embedding_dim
-         self.need_projection = not all(f.embedding_dim == att_embedding_dim for f in self.interaction_features)
+         self.need_projection = not all(
+             f.embedding_dim == att_embedding_dim for f in self.interaction_features
+         )
          self.projection_layers = None
          if self.need_projection:
-             self.projection_layers = nn.ModuleList([
-                 nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
-                 for f in self.interaction_features
-             ])
-
+             self.projection_layers = nn.ModuleList(
+                 [
+                     nn.Linear(f.embedding_dim, att_embedding_dim, bias=False)
+                     for f in self.interaction_features
+                 ]
+             )
+
          # Multi-head self-attention layers
-         self.attention_layers = nn.ModuleList([
-             MultiHeadSelfAttention(
-                 embedding_dim=att_embedding_dim,
-                 num_heads=att_head_num,
-                 dropout=att_dropout,
-                 use_residual=att_use_residual
-             ) for _ in range(att_layer_num)
-         ])
-
+         self.attention_layers = nn.ModuleList(
+             [
+                 MultiHeadSelfAttention(
+                     embedding_dim=att_embedding_dim,
+                     num_heads=att_head_num,
+                     dropout=att_dropout,
+                     use_residual=att_use_residual,
+                 )
+                 for _ in range(att_layer_num)
+             ]
+         )
+
          # Final prediction layer
          self.fc = nn.Linear(num_fields * att_embedding_dim, 1)
-         self.prediction_layer = PredictionLayer(task_type=self.task_type)
+         self.prediction_layer = PredictionLayer(task_type=self.default_task)
 
          # Register regularization weights
          self.register_regularization_weights(
-             embedding_attr='embedding',
-             include_modules=['projection_layers', 'attention_layers', 'fc']
+             embedding_attr="embedding",
+             include_modules=["projection_layers", "attention_layers", "fc"],
          )
 
          self.compile(
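The per-field projections exist because fields may carry different embedding sizes; each `nn.Linear` maps its field into the shared `att_embedding_dim` before the fields are stacked for attention. A condensed sketch (the field dimensions are made up):

    import torch
    import torch.nn as nn

    att_dim = 8
    field_dims = [16, 8, 4]               # illustrative per-field embedding sizes
    proj = nn.ModuleList(
        [nn.Linear(d, att_dim, bias=False) for d in field_dims]
    )

    B = 2
    fields = [torch.randn(B, d) for d in field_dims]
    stacked = torch.stack(
        [proj[i](f) for i, f in enumerate(fields)], dim=1
    )  # [B, num_fields, att_dim]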
@@ -166,21 +181,29 @@ class AutoInt(BaseModel):
          # Get embeddings field-by-field so mixed dimensions can be projected safely
          field_embeddings = []
          if len(self.interaction_features) == 0:
-             raise ValueError("AutoInt requires at least one sparse or sequence feature for interactions.")
+             raise ValueError(
+                 "AutoInt requires at least one sparse or sequence feature for interactions."
+             )
          for idx, feature in enumerate(self.interaction_features):
              feature_emb = self.embedding(x=x, features=[feature], squeeze_dim=False)
              feature_emb = feature_emb.squeeze(1)  # [B, embedding_dim]
              if self.need_projection and self.projection_layers is not None:
                  feature_emb = self.projection_layers[idx](feature_emb)
-             field_embeddings.append(feature_emb.unsqueeze(1))  # [B, 1, att_embedding_dim or original_dim]
+             field_embeddings.append(
+                 feature_emb.unsqueeze(1)
+             )  # [B, 1, att_embedding_dim or original_dim]
          embeddings = torch.cat(field_embeddings, dim=1)
-
+
          # Apply multi-head self-attention layers
          attention_output = embeddings
          for att_layer in self.attention_layers:
-             attention_output = att_layer(attention_output)  # [B, num_fields, att_embedding_dim]
-
+             attention_output = att_layer(
+                 attention_output
+             )  # [B, num_fields, att_embedding_dim]
+
          # Flatten and predict
-         attention_output_flat = attention_output.flatten(start_dim=1)  # [B, num_fields * att_embedding_dim]
+         attention_output_flat = attention_output.flatten(
+             start_dim=1
+         )  # [B, num_fields * att_embedding_dim]
          y = self.fc(attention_output_flat)  # [B, 1]
          return self.prediction_layer(y)
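End to end, the forward pass is: stack per-field embeddings, run stacked self-attention over the field axis, flatten, and score. A runnable sketch using torch's built-in `nn.MultiheadAttention` in place of the library's `MultiHeadSelfAttention` (a simplification: one attention module is reused across layers, and the residual is added by hand):

    import torch
    import torch.nn as nn

    B, F, D = 4, 6, 8                       # batch, fields, attention embedding dim
    x = torch.randn(B, F, D)                # stacked (projected) field embeddings

    attn = nn.MultiheadAttention(embed_dim=D, num_heads=2, batch_first=True)
    out = x
    for _ in range(3):                      # att_layer_num stacked layers
        attended, _ = attn(out, out, out)   # self-attention across fields
        out = attended + out                # residual, as with use_residual=True

    logit = nn.Linear(F * D, 1)(out.flatten(start_dim=1))  # [B, 1]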
nextrec/models/ranking/dcn.py
@@ -15,24 +15,26 @@ from nextrec.basic.model import BaseModel
  from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
+
  class CrossNetwork(nn.Module):
      """Stacked Cross Layers from DCN (Wang et al., 2017)."""
 
      def __init__(self, input_dim, num_layers):
          super().__init__()
          self.num_layers = num_layers
-         self.w = torch.nn.ModuleList([torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)])
-         self.b = torch.nn.ParameterList([torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)])
+         self.w = torch.nn.ModuleList(
+             [torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)]
+         )
+         self.b = torch.nn.ParameterList(
+             [torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)]
+         )
 
      def forward(self, x):
-         """
-         :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
-         """
          x0 = x
          for i in range(self.num_layers):
              xw = self.w[i](x)
              x = x0 * xw + self.b[i] + x
-         return x
+         return x  # [batch_size, input_dim]
 
 
  class DCN(BaseModel):
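For reference, each cross layer above computes

    x_{l+1} = x_0 \cdot \big(\mathbf{w}_l^{\top} x_l\big) + \mathbf{b}_l + x_l

where \mathbf{w}_l^{\top} x_l is a scalar per example (the `nn.Linear(input_dim, 1, bias=False)`), so every layer adds one more explicit interaction order at only O(input_dim) parameters, and the trailing x_l is a residual connection. This also explains why the removed docstring was wrong: the input is the flattened [batch_size, input_dim] tensor, not a [batch_size, num_fields, embed_dim] one.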
@@ -41,46 +43,48 @@ class DCN(BaseModel):
          return "DCN"
 
      @property
-     def task_type(self):
+     def default_task(self):
          return "binary"
-
-     def __init__(self,
-                  dense_features: list[DenseFeature],
-                  sparse_features: list[SparseFeature],
-                  sequence_features: list[SequenceFeature],
-                  cross_num: int = 3,
-                  mlp_params: dict | None = None,
-                  target: list[str] = [],
-                  optimizer: str = "adam",
-                  optimizer_params: dict = {},
-                  loss: str | nn.Module | None = "bce",
-                  loss_params: dict | list[dict] | None = None,
-                  device: str = 'cpu',
-                  embedding_l1_reg=1e-6,
-                  dense_l1_reg=1e-5,
-                  embedding_l2_reg=1e-5,
-                  dense_l2_reg=1e-4,
-                  **kwargs):
-
+
+     def __init__(
+         self,
+         dense_features: list[DenseFeature],
+         sparse_features: list[SparseFeature],
+         sequence_features: list[SequenceFeature],
+         cross_num: int = 3,
+         mlp_params: dict | None = None,
+         target: list[str] = [],
+         task: str | list[str] | None = None,
+         optimizer: str = "adam",
+         optimizer_params: dict = {},
+         loss: str | nn.Module | None = "bce",
+         loss_params: dict | list[dict] | None = None,
+         device: str = "cpu",
+         embedding_l1_reg=1e-6,
+         dense_l1_reg=1e-5,
+         embedding_l2_reg=1e-5,
+         dense_l2_reg=1e-4,
+         **kwargs,
+     ):
+
          super(DCN, self).__init__(
              dense_features=dense_features,
              sparse_features=sparse_features,
              sequence_features=sequence_features,
              target=target,
-             task=self.task_type,
+             task=task or self.default_task,
              device=device,
              embedding_l1_reg=embedding_l1_reg,
              dense_l1_reg=dense_l1_reg,
              embedding_l2_reg=embedding_l2_reg,
              dense_l2_reg=dense_l2_reg,
-             early_stop_patience=20,
-             **kwargs
+             **kwargs,
          )
 
          self.loss = loss
          if self.loss is None:
              self.loss = "bce"
-
+
          # All features
          self.all_features = dense_features + sparse_features + sequence_features
 
@@ -88,30 +92,41 @@ class DCN(BaseModel):
          self.embedding = EmbeddingLayer(features=self.all_features)
 
          # Calculate input dimension
-         emb_dim_total = sum([f.embedding_dim for f in self.all_features if not isinstance(f, DenseFeature)])
-         dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
+         emb_dim_total = sum(
+             [
+                 f.embedding_dim
+                 for f in self.all_features
+                 if not isinstance(f, DenseFeature)
+             ]
+         )
+         dense_input_dim = sum(
+             [getattr(f, "embedding_dim", 1) or 1 for f in dense_features]
+         )
          input_dim = emb_dim_total + dense_input_dim
-
+
          # Cross Network
          self.cross_network = CrossNetwork(input_dim=input_dim, num_layers=cross_num)
-
+
          # Deep Network (optional)
          if mlp_params is not None:
              self.use_dnn = True
              self.mlp = MLP(input_dim=input_dim, **mlp_params)
+             deep_dim = self.mlp.output_dim
              # Final layer combines cross and deep
-             self.final_layer = nn.Linear(input_dim + 1, 1)  # +1 for MLP output
+             self.final_layer = nn.Linear(
+                 input_dim + deep_dim, 1
+             )  # + deep_dim for MLP output
          else:
              self.use_dnn = False
              # Final layer only uses cross network output
              self.final_layer = nn.Linear(input_dim, 1)
 
-         self.prediction_layer = PredictionLayer(task_type=self.task_type)
+         self.prediction_layer = PredictionLayer(task_type=self.task)
 
          # Register regularization weights
          self.register_regularization_weights(
-             embedding_attr='embedding',
-             include_modules=['cross_network', 'mlp', 'final_layer']
+             embedding_attr="embedding",
+             include_modules=["cross_network", "mlp", "final_layer"],
          )
 
          self.compile(
@@ -124,18 +139,20 @@ class DCN(BaseModel):
      def forward(self, x):
          # Get all embeddings and flatten
          input_flat = self.embedding(x=x, features=self.all_features, squeeze_dim=True)
-
+
          # Cross Network
          cross_output = self.cross_network(input_flat)  # [B, input_dim]
-
+
          if self.use_dnn:
              # Deep Network
              deep_output = self.mlp(input_flat)  # [B, deep_dim]
              # Concatenate cross and deep
-             combined = torch.cat([cross_output, deep_output], dim=-1)  # [B, input_dim + 1]
+             combined = torch.cat(
+                 [cross_output, deep_output], dim=-1
+             )  # [B, input_dim + deep_dim]
          else:
              combined = cross_output
-
+
          # Final prediction
          y = self.final_layer(combined)
          return self.prediction_layer(y)
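With the `deep_dim` fix, the head wiring is easy to verify in isolation: the cross tower preserves `input_dim`, the MLP emits `deep_dim`, and the final linear consumes their concatenation. A sketch with made-up widths:

    import torch
    import torch.nn as nn

    B, input_dim, deep_dim = 4, 32, 16
    cross_output = torch.randn(B, input_dim)   # CrossNetwork keeps its input width
    deep_output = torch.randn(B, deep_dim)     # width set by the MLP's last layer

    final_layer = nn.Linear(input_dim + deep_dim, 1)
    logit = final_layer(torch.cat([cross_output, deep_output], dim=-1))  # [B, 1]

The old `nn.Linear(input_dim + 1, 1)` only worked when the MLP happened to end in a single unit; reading `self.mlp.output_dim` makes the head correct for any MLP configuration.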