nextrec 0.4.1-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff shows the changes between publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +220 -106
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1082 -400
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +272 -95
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +53 -37
- nextrec/models/multi_task/mmoe.py +64 -45
- nextrec/models/multi_task/ple.py +101 -48
- nextrec/models/multi_task/poso.py +113 -36
- nextrec/models/multi_task/share_bottom.py +48 -35
- nextrec/models/ranking/afm.py +72 -37
- nextrec/models/ranking/autoint.py +72 -55
- nextrec/models/ranking/dcn.py +55 -35
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +32 -22
- nextrec/models/ranking/dien.py +155 -99
- nextrec/models/ranking/din.py +85 -57
- nextrec/models/ranking/fibinet.py +52 -32
- nextrec/models/ranking/fm.py +29 -23
- nextrec/models/ranking/masknet.py +91 -29
- nextrec/models/ranking/pnn.py +31 -28
- nextrec/models/ranking/widedeep.py +34 -26
- nextrec/models/ranking/xdeepfm.py +60 -38
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/METADATA +4 -4
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/utils/synthetic_data.py
CHANGED
@@ -10,10 +10,8 @@ Author: Yang Zhou, zyaztec@gmail.com
 
 import numpy as np
 import pandas as pd
-from typing import Optional, Dict, List, Tuple
+from typing import Optional, Dict, List, Tuple
 
-if TYPE_CHECKING:
-    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
 def generate_ranking_data(
     n_samples: int = 10000,
@@ -27,38 +25,38 @@ def generate_ranking_data(
     embedding_dim: int = 16,
     seed: int = 42,
     custom_sparse_features: Optional[Dict[str, int]] = None,
-    use_simple_names: bool = True
+    use_simple_names: bool = True,
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for ranking tasks (CTR prediction)
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
     print(f"Generating {n_samples} synthetic ranking samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     for i in range(n_dense):
-        data[f
-
+        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)
+
     # Generate basic sparse features (always include user_id and item_id)
-    data[
-    data[
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+
     # Generate additional sparse features
     if custom_sparse_features:
         for feat_name, vocab_size in custom_sparse_features.items():
             data[feat_name] = np.random.randint(0, vocab_size, n_samples)
     else:
         for i in range(n_sparse - 2):
-            data[f
-
+            data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)
+
     # Generate sequence features (list of IDs)
     sequence_names = []
     sequence_vocabs = []
-
+
     for i in range(n_sequences):
         sequences = []
         for _ in range(n_samples):
@@ -68,77 +66,126 @@ def generate_ranking_data(
                 seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
                 seq_vocab = item_vocab_size
                 if custom_sparse_features:
-                    seq_name =
+                    seq_name = "hist_items"
                 else:
-                    seq_name =
+                    seq_name = "sequence_0"
             else:
                 # Other sequences use category vocabulary
-                if custom_sparse_features and
-                    seq_vocab = custom_sparse_features[
+                if custom_sparse_features and "category" in custom_sparse_features:
+                    seq_vocab = custom_sparse_features["category"]
                     seq = np.random.randint(0, seq_vocab, seq_len).tolist()
-                    seq_name =
+                    seq_name = "hist_categories" if i == 1 else f"sequence_{i}"
                 else:
                     seq_vocab = sparse_vocab_size
                     seq = np.random.randint(0, seq_vocab, seq_len).tolist()
-                    seq_name = f
-
+                    seq_name = f"sequence_{i}"
+
             # Padding
             seq = seq + [0] * (sequence_max_len - len(seq))
             sequences.append(seq)
-
+
         data[seq_name] = sequences
         sequence_names.append(seq_name)
         sequence_vocabs.append(seq_vocab)
-
-    if
+
+    if "gender" in data and "dense_0" in data:
         # Complex label generation with feature correlation
-        label_probs = 1 / (
-
-
-
-
-
+        label_probs = 1 / (
+            1
+            + np.exp(
+                -(
+                    data["dense_0"] * 0.3
+                    + data["dense_1"] * 0.2
+                    + (data["gender"] - 0.5) * 0.5
+                    + np.random.randn(n_samples) * 0.1
+                )
+            )
+        )
+        data["label"] = (label_probs > 0.5).astype(np.float32)
     else:
-        data[
-
+        data["label"] = np.random.randint(0, 2, n_samples).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
-    if
+    if "gender" in data:
         print(f"Positive rate: {data['label'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # Create feature definitions
     # Use input_dim for dense features to be compatible with both simple and complex scenarios
-    dense_features = [
-
+    dense_features = [
+        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
+    ]
+
     # Create sparse features
-    sparse_features = [
-
-
+    sparse_features = [
+        SparseFeature(
+            name="user_id",
+            embedding_name="user_emb",
+            vocab_size=user_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+        SparseFeature(
+            name="item_id",
+            embedding_name="item_emb",
+            vocab_size=item_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+    ]
+
     if custom_sparse_features:
         # Add custom sparse features with proper vocab sizes
         for feat_name, vocab_size in custom_sparse_features.items():
-            sparse_features.append(
+            sparse_features.append(
+                SparseFeature(
+                    name=feat_name,
+                    embedding_name=f"{feat_name}_emb",
+                    vocab_size=vocab_size,
+                    embedding_dim=embedding_dim,
+                )
+            )
     else:
         # Add generic sparse features
-        sparse_features.extend(
-
+        sparse_features.extend(
+            [
+                SparseFeature(
+                    name=f"sparse_{i}",
+                    embedding_name=f"sparse_{i}_emb",
+                    vocab_size=sparse_vocab_size,
+                    embedding_dim=embedding_dim,
+                )
+                for i in range(n_sparse - 2)
+            ]
+        )
+
     # Create sequence features
     sequence_features = []
     for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
         if i == 0:
             # First sequence shares embedding with item_id
-            embedding_name =
-        elif
+            embedding_name = "item_emb"
+        elif (
+            custom_sparse_features
+            and "category" in custom_sparse_features
+            and seq_name == "hist_categories"
+        ):
             # hist_categories shares embedding with category
-            embedding_name =
+            embedding_name = "category_emb"
         else:
             # Other sequences share with sparse_0
-            embedding_name =
-        sequence_features.append(
+            embedding_name = "sparse_0_emb"
+        sequence_features.append(
+            SequenceFeature(
+                name=seq_name,
+                vocab_size=seq_vocab,
+                max_len=sequence_max_len,
+                embedding_dim=embedding_dim,
+                padding_idx=0,
+                embedding_name=embedding_name,
+            )
+        )
     return df, dense_features, sparse_features, sequence_features
 
 
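For orientation, here is a minimal usage sketch of `generate_ranking_data` as its signature appears in the hunks above. This is illustrative only and not part of the diff; argument defaults that the hunks do not show (for example `n_dense` and `n_sparse`) are assumed to exist with reasonable values.

```python
# Hedged sketch, assuming nextrec 0.4.2 is installed and that the keyword
# arguments visible above (n_samples, custom_sparse_features, seed) are accepted.
from nextrec.utils.synthetic_data import generate_ranking_data

df, dense_features, sparse_features, sequence_features = generate_ranking_data(
    n_samples=1000,
    custom_sparse_features={"gender": 2, "category": 100},  # feature name -> vocab size
    seed=42,
)

print(df.shape)
print(df["label"].mean())  # positive rate, driven by the sigmoid in the hunk above
print(len(dense_features), len(sparse_features), len(sequence_features))
```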
@@ -154,29 +201,31 @@ def generate_match_data(
     sequence_max_len: int = 50,
     user_embedding_dim: int = 32,
     item_embedding_dim: int = 32,
-    seed: int = 42
+    seed: int = 42,
 ) -> Tuple[pd.DataFrame, List, List, List, List, List, List]:
     """
     Generate synthetic data for match/retrieval tasks
-
+
     Returns:
         tuple: (dataframe, user_dense_features, user_sparse_features, user_sequence_features,
                item_dense_features, item_sparse_features, item_sequence_features)
     """
     print(f"Generating {n_samples} synthetic match samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     # User features
-    data[
-    data[
-    data[
-    data[
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["user_age"] = np.random.randn(n_samples).astype(np.float32)
+    data["user_gender"] = np.random.randint(0, 2, n_samples)
+    data["user_city"] = np.random.randint(0, city_vocab_size, n_samples)
+
     for i in range(3):
-        data[f
-
+        data[f"user_feature_{i}"] = np.random.randint(
+            1, user_feature_vocab_size, n_samples
+        )
+
     # User behavior sequences
     user_hist_items = []
     user_hist_categories = []
@@ -185,80 +234,122 @@ def generate_match_data(
         hist_items = np.random.randint(1, item_vocab_size, seq_len).tolist()
         hist_items = hist_items + [0] * (sequence_max_len - len(hist_items))
         user_hist_items.append(hist_items)
-
+
         hist_cats = np.random.randint(1, category_vocab_size, seq_len).tolist()
         hist_cats = hist_cats + [0] * (sequence_max_len - len(hist_cats))
         user_hist_categories.append(hist_cats)
-
-    data[
-    data[
-
+
+    data["user_hist_items"] = user_hist_items
+    data["user_hist_categories"] = user_hist_categories
+
     # Item features
-    data[
-    data[
-    data[
-    data[
-
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+    data["item_price"] = np.random.randn(n_samples).astype(np.float32)
+    data["item_category"] = np.random.randint(1, category_vocab_size, n_samples)
+    data["item_brand"] = np.random.randint(1, brand_vocab_size, n_samples)
+
     for i in range(3):
-        data[f
-
+        data[f"item_feature_{i}"] = np.random.randint(
+            1, item_feature_vocab_size, n_samples
+        )
+
     # Generate labels with some correlation to features
-    label_probs = 1 / (
-
-
-
-
-
-
-
+    label_probs = 1 / (
+        1
+        + np.exp(
+            -(
+                data["user_age"] * 0.2
+                + (data["user_gender"] - 0.5) * 0.3
+                + data["item_price"] * 0.15
+                + np.random.randn(n_samples) * 0.5
+            )
+        )
+    )
+    data["label"] = (label_probs > 0.5).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
     print(f"Positive rate: {data['label'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # User dense features
-    user_dense_features = [DenseFeature(name=
-
+    user_dense_features = [DenseFeature(name="user_age", input_dim=1)]
+
     # User sparse features
     user_sparse_features = [
-        SparseFeature(
-
-
+        SparseFeature(
+            name="user_id", vocab_size=user_vocab_size, embedding_dim=user_embedding_dim
+        ),
+        SparseFeature(name="user_gender", vocab_size=2, embedding_dim=8),
+        SparseFeature(name="user_city", vocab_size=city_vocab_size, embedding_dim=16),
     ]
-    user_sparse_features.extend(
-
-
-
-
+    user_sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"user_feature_{i}",
+                vocab_size=user_feature_vocab_size,
+                embedding_dim=8,
+            )
+            for i in range(3)
+        ]
+    )
+
     # User sequence features
     user_sequence_features = [
-        SequenceFeature(
-
-
-
+        SequenceFeature(
+            name="user_hist_items",
+            vocab_size=item_vocab_size,
+            max_len=sequence_max_len,
+            embedding_dim=user_embedding_dim,
+            padding_idx=0,
+        ),
+        SequenceFeature(
+            name="user_hist_categories",
+            vocab_size=category_vocab_size,
+            max_len=sequence_max_len,
+            embedding_dim=16,
+            padding_idx=0,
+        ),
     ]
-
+
     # Item dense features
-    item_dense_features = [DenseFeature(name=
-
+    item_dense_features = [DenseFeature(name="item_price", input_dim=1)]
+
     # Item sparse features
     item_sparse_features = [
-        SparseFeature(
-
-
+        SparseFeature(
+            name="item_id", vocab_size=item_vocab_size, embedding_dim=item_embedding_dim
+        ),
+        SparseFeature(
+            name="item_category", vocab_size=category_vocab_size, embedding_dim=16
+        ),
+        SparseFeature(name="item_brand", vocab_size=brand_vocab_size, embedding_dim=16),
     ]
-    item_sparse_features.extend(
-
-
-
-
+    item_sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"item_feature_{i}",
+                vocab_size=item_feature_vocab_size,
+                embedding_dim=8,
+            )
+            for i in range(3)
+        ]
+    )
+
     # Item sequence features (empty for most match models)
     item_sequence_features = []
-
-    return (
-
+
+    return (
+        df,
+        user_dense_features,
+        user_sparse_features,
+        user_sequence_features,
+        item_dense_features,
+        item_sparse_features,
+        item_sequence_features,
+    )
 
 
 def generate_multitask_data(
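A hedged sketch of consuming `generate_match_data`, based only on the seven-element return tuple documented in the hunks above; defaults other than those visible in the diff (`user_embedding_dim=32`, `item_embedding_dim=32`, `sequence_max_len=50`, `seed=42`) are assumptions.

```python
# Illustrative sketch, not part of the diff: unpack the user-tower and
# item-tower feature groups that generate_match_data returns (per its docstring above).
from nextrec.utils.synthetic_data import generate_match_data

(
    df,
    user_dense_features,
    user_sparse_features,
    user_sequence_features,
    item_dense_features,
    item_sparse_features,
    item_sequence_features,
) = generate_match_data(seed=42)

print(df.shape)
print(len(user_sparse_features), len(item_sparse_features))  # 3 + 3 extended features per tower
print(df["label"].mean())  # positive rate, also printed by the function itself
```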
@@ -271,34 +362,34 @@ def generate_multitask_data(
     sparse_vocab_size: int = 50,
     sequence_max_len: int = 20,
     embedding_dim: int = 16,
-    seed: int = 42
+    seed: int = 42,
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for multi-task learning
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
     print(f"Generating {n_samples} synthetic multi-task samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     # Generate dense features
     for i in range(n_dense):
-        data[f
-
+        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)
+
     # Generate sparse features
-    data[
-    data[
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+
     for i in range(n_sparse - 2):
-        data[f
-
+        data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)
+
     # Generate sequence features
     sequence_names = []
     sequence_vocabs = []
-
+
     for i in range(n_sequences):
         sequences = []
         for _ in range(n_samples):
@@ -306,79 +397,101 @@ def generate_multitask_data(
             if i == 0:
                 seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
                 seq_vocab = item_vocab_size
-                seq_name =
+                seq_name = "sequence_0"
             else:
                 seq = np.random.randint(0, sparse_vocab_size, seq_len).tolist()
                 seq_vocab = sparse_vocab_size
-                seq_name = f
-
+                seq_name = f"sequence_{i}"
+
             seq = seq + [0] * (sequence_max_len - len(seq))
             sequences.append(seq)
-
+
         data[seq_name] = sequences
         sequence_names.append(seq_name)
         sequence_vocabs.append(seq_vocab)
-
+
     # Generate multi-task labels with correlation
     # CTR (click) is relatively easier to predict
     ctr_logits = (
-        data[
-        data['dense_1'] * 0.2 +
-        np.random.randn(n_samples) * 0.5
+        data["dense_0"] * 0.3 + data["dense_1"] * 0.2 + np.random.randn(n_samples) * 0.5
     )
-    data[
-
+    data["click"] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)
+
     # CVR (conversion) depends on click and is harder
     cvr_logits = (
-        data[
-        data[
-        data[
-        np.random.randn(n_samples) * 0.8
+        data["dense_2"] * 0.2
+        + data["dense_3"] * 0.15
+        + data["click"] * 1.5  # Strong dependency on click
+        + np.random.randn(n_samples) * 0.8
     )
-    data[
-
+    data["conversion"] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)
+
     # CTCVR = click AND conversion
-    data[
-
+    data["ctcvr"] = (data["click"] * data["conversion"]).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
     print(f"Click rate: {data['click'].mean():.4f}")
     print(f"Conversion rate (overall): {data['conversion'].mean():.4f}")
-    if data[
-        print(
+    if data["click"].sum() > 0:
+        print(
+            f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}"
+        )
     print(f"CTCVR rate: {data['ctcvr'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # Create feature definitions
-    dense_features = [
-
+    dense_features = [
+        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
+    ]
+
     # Create sparse features
     sparse_features = [
-        SparseFeature(
-
-
-
+        SparseFeature(
+            name="user_id",
+            embedding_name="user_emb",
+            vocab_size=user_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+        SparseFeature(
+            name="item_id",
+            embedding_name="item_emb",
+            vocab_size=item_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
     ]
-    sparse_features.extend(
-
-
-
-
-
+    sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"sparse_{i}",
+                embedding_name=f"sparse_{i}_emb",
+                vocab_size=sparse_vocab_size,
+                embedding_dim=embedding_dim,
+            )
+            for i in range(n_sparse - 2)
+        ]
+    )
+
     # Create sequence features
     sequence_features = []
     for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
         if i == 0:
-            embedding_name =
+            embedding_name = "item_emb"
         else:
-            embedding_name =
+            embedding_name = "sparse_0_emb"
         sequence_features.append(
-            SequenceFeature(
-
+            SequenceFeature(
+                name=seq_name,
+                vocab_size=seq_vocab,
+                max_len=sequence_max_len,
+                embedding_dim=embedding_dim,
+                padding_idx=0,
+                embedding_name=embedding_name,
+            )
         )
-
+
     return df, dense_features, sparse_features, sequence_features
 
 
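A hedged sketch of `generate_multitask_data`, reflecting only what the hunks above show: three correlated label columns (`click`, `conversion`, `ctcvr`), where `ctcvr` is the product of the first two. Only `seed` is passed below since it is visible in the signature hunk; all other defaults are assumed.

```python
# Illustrative sketch, not part of the diff. The label hierarchy checked here
# follows directly from the hunk above: ctcvr = click * conversion, so a
# positive ctcvr implies a positive click (ESMM-style CTR/CVR/CTCVR setup).
from nextrec.utils.synthetic_data import generate_multitask_data

df, dense_features, sparse_features, sequence_features = generate_multitask_data(seed=42)

assert {"click", "conversion", "ctcvr"}.issubset(df.columns)
assert (df["ctcvr"] <= df["click"]).all()
print(df[["click", "conversion", "ctcvr"]].mean())
```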
@@ -394,7 +507,7 @@ def generate_distributed_ranking_data(
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for distributed training scenarios
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
@@ -408,6 +521,11 @@ def generate_distributed_ranking_data(
         sequence_max_len=max_seq_len,
         embedding_dim=embedding_dim,
         seed=seed,
-        custom_sparse_features={
-
+        custom_sparse_features={
+            "gender": 2,
+            "age_group": 7,
+            "category": num_categories,
+            "city": num_cities,
+        },
+        use_simple_names=False,
     )
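Finally, a hedged sketch of `generate_distributed_ranking_data`, which per the last hunk now forwards an explicit `custom_sparse_features` mapping (gender, age_group, category, city) plus `use_simple_names=False` to `generate_ranking_data`; `num_categories` and `num_cities` are names referenced in the hunk, and every default below is an assumption.

```python
# Illustrative sketch, not part of the diff. Because generate_ranking_data
# creates one column per key of custom_sparse_features (see the earlier hunk),
# the wrapper's output should contain gender/age_group/category/city columns.
from nextrec.utils.synthetic_data import generate_distributed_ranking_data

df, dense_features, sparse_features, sequence_features = generate_distributed_ranking_data()

print(df.shape)
print({"gender", "age_group", "category", "city"}.issubset(df.columns))  # expected True
```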