nextrec-0.3.6-py3-none-any.whl → nextrec-0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +244 -113
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1373 -443
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +42 -24
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +303 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +106 -40
- nextrec/models/match/dssm.py +82 -69
- nextrec/models/match/dssm_v2.py +72 -58
- nextrec/models/match/mind.py +175 -108
- nextrec/models/match/sdm.py +104 -88
- nextrec/models/match/youtube_dnn.py +73 -60
- nextrec/models/multi_task/esmm.py +53 -39
- nextrec/models/multi_task/mmoe.py +70 -47
- nextrec/models/multi_task/ple.py +107 -50
- nextrec/models/multi_task/poso.py +121 -41
- nextrec/models/multi_task/share_bottom.py +54 -38
- nextrec/models/ranking/afm.py +172 -45
- nextrec/models/ranking/autoint.py +84 -61
- nextrec/models/ranking/dcn.py +59 -42
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +36 -26
- nextrec/models/ranking/dien.py +158 -102
- nextrec/models/ranking/din.py +88 -60
- nextrec/models/ranking/fibinet.py +55 -35
- nextrec/models/ranking/fm.py +32 -26
- nextrec/models/ranking/masknet.py +95 -34
- nextrec/models/ranking/pnn.py +34 -31
- nextrec/models/ranking/widedeep.py +37 -29
- nextrec/models/ranking/xdeepfm.py +63 -41
- nextrec/utils/__init__.py +61 -32
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +52 -12
- nextrec/utils/distributed.py +141 -0
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +531 -0
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.3.6.dist-info/RECORD +0 -64
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/utils/synthetic_data.py
ADDED
@@ -0,0 +1,531 @@

```python
"""
Synthetic Data Generation Utilities

This module provides utilities for generating synthetic datasets for testing
and tutorial purposes in the NextRec framework.

Date: create on 06/12/2025
Author: Yang Zhou, zyaztec@gmail.com
"""

import numpy as np
import pandas as pd
from typing import Optional, Dict, List, Tuple


def generate_ranking_data(
    n_samples: int = 10000,
    n_dense: int = 5,
    n_sparse: int = 8,
    n_sequences: int = 2,
    user_vocab_size: int = 1000,
    item_vocab_size: int = 500,
    sparse_vocab_size: int = 50,
    sequence_max_len: int = 20,
    embedding_dim: int = 16,
    seed: int = 42,
    custom_sparse_features: Optional[Dict[str, int]] = None,
    use_simple_names: bool = True,
) -> Tuple[pd.DataFrame, List, List, List]:
    """
    Generate synthetic data for ranking tasks (CTR prediction)

    Returns:
        tuple: (dataframe, dense_features, sparse_features, sequence_features)
    """
    print(f"Generating {n_samples} synthetic ranking samples...")

    np.random.seed(seed)
    data = {}

    for i in range(n_dense):
        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)

    # Generate basic sparse features (always include user_id and item_id)
    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)

    # Generate additional sparse features
    if custom_sparse_features:
        for feat_name, vocab_size in custom_sparse_features.items():
            data[feat_name] = np.random.randint(0, vocab_size, n_samples)
    else:
        for i in range(n_sparse - 2):
            data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)

    # Generate sequence features (list of IDs)
    sequence_names = []
    sequence_vocabs = []

    for i in range(n_sequences):
        sequences = []
        for _ in range(n_samples):
            seq_len = np.random.randint(5, sequence_max_len + 1)
            if i == 0:
                # First sequence uses item vocabulary
                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
                seq_vocab = item_vocab_size
                if custom_sparse_features:
                    seq_name = "hist_items"
                else:
                    seq_name = "sequence_0"
            else:
                # Other sequences use category vocabulary
                if custom_sparse_features and "category" in custom_sparse_features:
                    seq_vocab = custom_sparse_features["category"]
                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
                    seq_name = "hist_categories" if i == 1 else f"sequence_{i}"
                else:
                    seq_vocab = sparse_vocab_size
                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
                    seq_name = f"sequence_{i}"

            # Padding
            seq = seq + [0] * (sequence_max_len - len(seq))
            sequences.append(seq)

        data[seq_name] = sequences
        sequence_names.append(seq_name)
        sequence_vocabs.append(seq_vocab)

    if "gender" in data and "dense_0" in data:
        # Complex label generation with feature correlation
        label_probs = 1 / (
            1
            + np.exp(
                -(
                    data["dense_0"] * 0.3
                    + data["dense_1"] * 0.2
                    + (data["gender"] - 0.5) * 0.5
                    + np.random.randn(n_samples) * 0.1
                )
            )
        )
        data["label"] = (label_probs > 0.5).astype(np.float32)
    else:
        data["label"] = np.random.randint(0, 2, n_samples).astype(np.float32)

    df = pd.DataFrame(data)
    print(f"Generated data shape: {df.shape}")
    if "gender" in data:
        print(f"Positive rate: {data['label'].mean():.4f}")

    # Import here to avoid circular import
    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature

    # Create feature definitions
    # Use input_dim for dense features to be compatible with both simple and complex scenarios
    dense_features = [
        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
    ]

    # Create sparse features
    sparse_features = [
        SparseFeature(
            name="user_id",
            embedding_name="user_emb",
            vocab_size=user_vocab_size,
            embedding_dim=embedding_dim,
        ),
        SparseFeature(
            name="item_id",
            embedding_name="item_emb",
            vocab_size=item_vocab_size,
            embedding_dim=embedding_dim,
        ),
    ]

    if custom_sparse_features:
        # Add custom sparse features with proper vocab sizes
        for feat_name, vocab_size in custom_sparse_features.items():
            sparse_features.append(
                SparseFeature(
                    name=feat_name,
                    embedding_name=f"{feat_name}_emb",
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                )
            )
    else:
        # Add generic sparse features
        sparse_features.extend(
            [
                SparseFeature(
                    name=f"sparse_{i}",
                    embedding_name=f"sparse_{i}_emb",
                    vocab_size=sparse_vocab_size,
                    embedding_dim=embedding_dim,
                )
                for i in range(n_sparse - 2)
            ]
        )

    # Create sequence features
    sequence_features = []
    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
        if i == 0:
            # First sequence shares embedding with item_id
            embedding_name = "item_emb"
        elif (
            custom_sparse_features
            and "category" in custom_sparse_features
            and seq_name == "hist_categories"
        ):
            # hist_categories shares embedding with category
            embedding_name = "category_emb"
        else:
            # Other sequences share with sparse_0
            embedding_name = "sparse_0_emb"
        sequence_features.append(
            SequenceFeature(
                name=seq_name,
                vocab_size=seq_vocab,
                max_len=sequence_max_len,
                embedding_dim=embedding_dim,
                padding_idx=0,
                embedding_name=embedding_name,
            )
        )
    return df, dense_features, sparse_features, sequence_features
```
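To make the new generator concrete, here is a minimal usage sketch; it is not part of the diff and assumes nextrec 0.4.2 is installed with the signatures exactly as above:

```python
# Usage sketch (not from the package): default path, no custom_sparse_features.
from nextrec.utils.synthetic_data import generate_ranking_data

df, dense_feats, sparse_feats, seq_feats = generate_ranking_data(
    n_samples=1_000,
    n_dense=3,
    n_sparse=5,      # yields user_id, item_id and sparse_0..sparse_2
    n_sequences=2,   # yields sequence_0 (item vocab) and sequence_1 (generic vocab)
    seed=7,
)
print(df.shape)  # (1000, 11): 3 dense + 5 sparse + 2 sequence columns + label
```

The module continues with a two-tower generator for retrieval tasks: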
```python
def generate_match_data(
    n_samples: int = 10000,
    user_vocab_size: int = 1000,
    item_vocab_size: int = 5000,
    category_vocab_size: int = 100,
    brand_vocab_size: int = 200,
    city_vocab_size: int = 100,
    user_feature_vocab_size: int = 50,
    item_feature_vocab_size: int = 50,
    sequence_max_len: int = 50,
    user_embedding_dim: int = 32,
    item_embedding_dim: int = 32,
    seed: int = 42,
) -> Tuple[pd.DataFrame, List, List, List, List, List, List]:
    """
    Generate synthetic data for match/retrieval tasks

    Returns:
        tuple: (dataframe, user_dense_features, user_sparse_features, user_sequence_features,
                item_dense_features, item_sparse_features, item_sequence_features)
    """
    print(f"Generating {n_samples} synthetic match samples...")

    np.random.seed(seed)
    data = {}

    # User features
    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
    data["user_age"] = np.random.randn(n_samples).astype(np.float32)
    data["user_gender"] = np.random.randint(0, 2, n_samples)
    data["user_city"] = np.random.randint(0, city_vocab_size, n_samples)

    for i in range(3):
        data[f"user_feature_{i}"] = np.random.randint(
            1, user_feature_vocab_size, n_samples
        )

    # User behavior sequences
    user_hist_items = []
    user_hist_categories = []
    for _ in range(n_samples):
        seq_len = np.random.randint(10, sequence_max_len + 1)
        hist_items = np.random.randint(1, item_vocab_size, seq_len).tolist()
        hist_items = hist_items + [0] * (sequence_max_len - len(hist_items))
        user_hist_items.append(hist_items)

        hist_cats = np.random.randint(1, category_vocab_size, seq_len).tolist()
        hist_cats = hist_cats + [0] * (sequence_max_len - len(hist_cats))
        user_hist_categories.append(hist_cats)

    data["user_hist_items"] = user_hist_items
    data["user_hist_categories"] = user_hist_categories

    # Item features
    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
    data["item_price"] = np.random.randn(n_samples).astype(np.float32)
    data["item_category"] = np.random.randint(1, category_vocab_size, n_samples)
    data["item_brand"] = np.random.randint(1, brand_vocab_size, n_samples)

    for i in range(3):
        data[f"item_feature_{i}"] = np.random.randint(
            1, item_feature_vocab_size, n_samples
        )

    # Generate labels with some correlation to features
    label_probs = 1 / (
        1
        + np.exp(
            -(
                data["user_age"] * 0.2
                + (data["user_gender"] - 0.5) * 0.3
                + data["item_price"] * 0.15
                + np.random.randn(n_samples) * 0.5
            )
        )
    )
    data["label"] = (label_probs > 0.5).astype(np.float32)

    df = pd.DataFrame(data)
    print(f"Generated data shape: {df.shape}")
    print(f"Positive rate: {data['label'].mean():.4f}")

    # Import here to avoid circular import
    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature

    # User dense features
    user_dense_features = [DenseFeature(name="user_age", input_dim=1)]

    # User sparse features
    user_sparse_features = [
        SparseFeature(
            name="user_id", vocab_size=user_vocab_size, embedding_dim=user_embedding_dim
        ),
        SparseFeature(name="user_gender", vocab_size=2, embedding_dim=8),
        SparseFeature(name="user_city", vocab_size=city_vocab_size, embedding_dim=16),
    ]
    user_sparse_features.extend(
        [
            SparseFeature(
                name=f"user_feature_{i}",
                vocab_size=user_feature_vocab_size,
                embedding_dim=8,
            )
            for i in range(3)
        ]
    )

    # User sequence features
    user_sequence_features = [
        SequenceFeature(
            name="user_hist_items",
            vocab_size=item_vocab_size,
            max_len=sequence_max_len,
            embedding_dim=user_embedding_dim,
            padding_idx=0,
        ),
        SequenceFeature(
            name="user_hist_categories",
            vocab_size=category_vocab_size,
            max_len=sequence_max_len,
            embedding_dim=16,
            padding_idx=0,
        ),
    ]

    # Item dense features
    item_dense_features = [DenseFeature(name="item_price", input_dim=1)]

    # Item sparse features
    item_sparse_features = [
        SparseFeature(
            name="item_id", vocab_size=item_vocab_size, embedding_dim=item_embedding_dim
        ),
        SparseFeature(
            name="item_category", vocab_size=category_vocab_size, embedding_dim=16
        ),
        SparseFeature(name="item_brand", vocab_size=brand_vocab_size, embedding_dim=16),
    ]
    item_sparse_features.extend(
        [
            SparseFeature(
                name=f"item_feature_{i}",
                vocab_size=item_feature_vocab_size,
                embedding_dim=8,
            )
            for i in range(3)
        ]
    )

    # Item sequence features (empty for most match models)
    item_sequence_features = []

    return (
        df,
        user_dense_features,
        user_sparse_features,
        user_sequence_features,
        item_dense_features,
        item_sparse_features,
        item_sequence_features,
    )
```
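The seven-element return value maps directly onto a user tower and an item tower; a sketch of unpacking it (again not part of the diff, assuming the code above):

```python
# Usage sketch (not from the package): split features by tower for a DSSM-style model.
from nextrec.utils.synthetic_data import generate_match_data

(
    df,
    user_dense, user_sparse, user_seq,
    item_dense, item_sparse, item_seq,
) = generate_match_data(n_samples=2_000, seed=7)

assert item_seq == []  # the item tower carries no sequence features here
assert {"user_hist_items", "user_hist_categories", "label"}.issubset(df.columns)
```

Next comes the multi-task generator: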
```python
def generate_multitask_data(
    n_samples: int = 10000,
    n_dense: int = 5,
    n_sparse: int = 8,
    n_sequences: int = 2,
    user_vocab_size: int = 1000,
    item_vocab_size: int = 500,
    sparse_vocab_size: int = 50,
    sequence_max_len: int = 20,
    embedding_dim: int = 16,
    seed: int = 42,
) -> Tuple[pd.DataFrame, List, List, List]:
    """
    Generate synthetic data for multi-task learning

    Returns:
        tuple: (dataframe, dense_features, sparse_features, sequence_features)
    """
    print(f"Generating {n_samples} synthetic multi-task samples...")

    np.random.seed(seed)
    data = {}

    # Generate dense features
    for i in range(n_dense):
        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)

    # Generate sparse features
    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)

    for i in range(n_sparse - 2):
        data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)

    # Generate sequence features
    sequence_names = []
    sequence_vocabs = []

    for i in range(n_sequences):
        sequences = []
        for _ in range(n_samples):
            seq_len = np.random.randint(5, sequence_max_len + 1)
            if i == 0:
                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
                seq_vocab = item_vocab_size
                seq_name = "sequence_0"
            else:
                seq = np.random.randint(0, sparse_vocab_size, seq_len).tolist()
                seq_vocab = sparse_vocab_size
                seq_name = f"sequence_{i}"

            seq = seq + [0] * (sequence_max_len - len(seq))
            sequences.append(seq)

        data[seq_name] = sequences
        sequence_names.append(seq_name)
        sequence_vocabs.append(seq_vocab)

    # Generate multi-task labels with correlation
    # CTR (click) is relatively easier to predict
    ctr_logits = (
        data["dense_0"] * 0.3 + data["dense_1"] * 0.2 + np.random.randn(n_samples) * 0.5
    )
    data["click"] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)

    # CVR (conversion) depends on click and is harder
    cvr_logits = (
        data["dense_2"] * 0.2
        + data["dense_3"] * 0.15
        + data["click"] * 1.5  # Strong dependency on click
        + np.random.randn(n_samples) * 0.8
    )
    data["conversion"] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)

    # CTCVR = click AND conversion
    data["ctcvr"] = (data["click"] * data["conversion"]).astype(np.float32)

    df = pd.DataFrame(data)
    print(f"Generated data shape: {df.shape}")
    print(f"Click rate: {data['click'].mean():.4f}")
    print(f"Conversion rate (overall): {data['conversion'].mean():.4f}")
    if data["click"].sum() > 0:
        print(
            f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}"
        )
    print(f"CTCVR rate: {data['ctcvr'].mean():.4f}")

    # Import here to avoid circular import
    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature

    # Create feature definitions
    dense_features = [
        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
    ]

    # Create sparse features
    sparse_features = [
        SparseFeature(
            name="user_id",
            embedding_name="user_emb",
            vocab_size=user_vocab_size,
            embedding_dim=embedding_dim,
        ),
        SparseFeature(
            name="item_id",
            embedding_name="item_emb",
            vocab_size=item_vocab_size,
            embedding_dim=embedding_dim,
        ),
    ]
    sparse_features.extend(
        [
            SparseFeature(
                name=f"sparse_{i}",
                embedding_name=f"sparse_{i}_emb",
                vocab_size=sparse_vocab_size,
                embedding_dim=embedding_dim,
            )
            for i in range(n_sparse - 2)
        ]
    )

    # Create sequence features
    sequence_features = []
    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
        if i == 0:
            embedding_name = "item_emb"
        else:
            embedding_name = "sparse_0_emb"
        sequence_features.append(
            SequenceFeature(
                name=seq_name,
                vocab_size=seq_vocab,
                max_len=sequence_max_len,
                embedding_dim=embedding_dim,
                padding_idx=0,
                embedding_name=embedding_name,
            )
        )

    return df, dense_features, sparse_features, sequence_features
```
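The three targets follow the ESMM convention, where CTCVR is the conjunction of click and conversion; a quick sanity check of that invariant (sketch only, not part of the diff):

```python
# Usage sketch (not from the package): verify the ESMM-style label structure.
from nextrec.utils.synthetic_data import generate_multitask_data

df, dense_feats, sparse_feats, seq_feats = generate_multitask_data(n_samples=5_000)
assert (df["ctcvr"] == df["click"] * df["conversion"]).all()
assert df["ctcvr"].mean() <= df["click"].mean()  # CTCVR can never exceed CTR
```

The file closes with a convenience wrapper for the distributed-training examples: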
```python
def generate_distributed_ranking_data(
    num_samples: int = 100000,
    num_users: int = 10000,
    num_items: int = 5000,
    num_categories: int = 20,
    num_cities: int = 100,
    max_seq_len: int = 50,
    embedding_dim: int = 32,
    seed: int = 42,
) -> Tuple[pd.DataFrame, List, List, List]:
    """
    Generate synthetic data for distributed training scenarios

    Returns:
        tuple: (dataframe, dense_features, sparse_features, sequence_features)
    """
    return generate_ranking_data(
        n_samples=num_samples,
        n_dense=5,
        n_sparse=6,  # user_id, item_id + 4 custom features
        n_sequences=2,
        user_vocab_size=num_users + 1,
        item_vocab_size=num_items + 1,
        sequence_max_len=max_seq_len,
        embedding_dim=embedding_dim,
        seed=seed,
        custom_sparse_features={
            "gender": 2,
            "age_group": 7,
            "category": num_categories,
            "city": num_cities,
        },
        use_simple_names=False,
    )
```
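Because the wrapper fixes `custom_sparse_features`, the named demographic columns and the `hist_*` sequences are guaranteed to appear; a sketch (not part of the diff):

```python
# Usage sketch (not from the package): the wrapper delegates to generate_ranking_data.
from nextrec.utils.synthetic_data import generate_distributed_ranking_data

df, dense_feats, sparse_feats, seq_feats = generate_distributed_ranking_data(
    num_samples=10_000, num_users=500, num_items=300
)
assert {"gender", "age_group", "category", "city"}.issubset(df.columns)
assert {"hist_items", "hist_categories"}.issubset(df.columns)
```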
nextrec/utils/tensor.py
CHANGED

```diff
@@ -6,56 +6,67 @@ Author: Yang Zhou, zyaztec@gmail.com
 """
 
 import torch
-import numpy as np
 from typing import Any
 
 
 def to_tensor(
-    value: Any,
-    dtype: torch.dtype,
-    device: torch.device | str | None = None
+    value: Any, dtype: torch.dtype, device: torch.device | str | None = None
 ) -> torch.Tensor:
     if value is None:
         raise ValueError("[Tensor Utils Error] Cannot convert None to tensor.")
     tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
     if tensor.dtype != dtype:
         tensor = tensor.to(dtype=dtype)
-
+
     if device is not None:
-        target_device = device if isinstance(device, torch.device) else torch.device(device)
+        target_device = (
+            device if isinstance(device, torch.device) else torch.device(device)
+        )
         if tensor.device != target_device:
             tensor = tensor.to(target_device)
     return tensor
 
+
 def stack_tensors(tensors: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
     if not tensors:
         raise ValueError("[Tensor Utils Error] Cannot stack empty list of tensors.")
     return torch.stack(tensors, dim=dim)
 
+
 def concat_tensors(tensors: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
     if not tensors:
-        raise ValueError("[Tensor Utils Error] Cannot concatenate empty list of tensors.")
+        raise ValueError(
+            "[Tensor Utils Error] Cannot concatenate empty list of tensors."
+        )
     return torch.cat(tensors, dim=dim)
 
+
 def pad_sequence_tensors(
     tensors: list[torch.Tensor],
     max_len: int | None = None,
     padding_value: float = 0.0,
-    padding_side: str = 'right',
+    padding_side: str = "right",
 ) -> torch.Tensor:
     if not tensors:
         raise ValueError("[Tensor Utils Error] Cannot pad empty list of tensors.")
     if max_len is None:
         max_len = max(t.size(0) for t in tensors)
     batch_size = len(tensors)
-    padded = torch.full(
-        (batch_size, max_len), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)
+    padded = torch.full(
+        (batch_size, max_len),
+        padding_value,
+        dtype=tensors[0].dtype,
+        device=tensors[0].device,
+    )
+
     for i, tensor in enumerate(tensors):
         length = min(tensor.size(0), max_len)
-        if padding_side == 'right':
+        if padding_side == "right":
             padded[i, :length] = tensor[:length]
-        elif padding_side == 'left':
+        elif padding_side == "left":
             padded[i, -length:] = tensor[:length]
         else:
-            raise ValueError(f"[Tensor Utils Error] padding_side must be 'right' or 'left', got {padding_side}")
+            raise ValueError(
+                f"[Tensor Utils Error] padding_side must be 'right' or 'left', got {padding_side}"
+            )
     return padded
```
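The tensor.py changes are formatting-only, apart from dropping the apparently unused numpy import, so behavior is unchanged; for reference, a sketch of the padding helper (not part of the diff, assuming the module path matches the file location):

```python
# Usage sketch (not from the package): right- vs. left-padding with pad_sequence_tensors.
import torch
from nextrec.utils.tensor import pad_sequence_tensors, to_tensor

seqs = [torch.tensor([1, 2, 3]), torch.tensor([4])]
print(pad_sequence_tensors(seqs, max_len=4))                       # [[1, 2, 3, 0], [4, 0, 0, 0]]
print(pad_sequence_tensors(seqs, max_len=4, padding_side="left"))  # [[0, 1, 2, 3], [0, 0, 0, 4]]

x = to_tensor([1, 2], dtype=torch.float32, device="cpu")  # coerces dtype and device
```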
{nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA
CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nextrec
-Version: 0.3.6
+Version: 0.4.2
 Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
 Project-URL: Homepage, https://github.com/zerolovesea/NextRec
 Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -55,7 +55,7 @@ Requires-Dist: seaborn>=0.12.0; extra == 'dev'
 Description-Content-Type: text/markdown
 
 <p align="center">
-  <img align="center" src="
+  <img align="center" src="assets/logo.png" width="40%">
 <p>
 
 <div align="center">
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
 
 
 
-
 
 English | [中文文档](README_zh.md)
 
@@ -86,7 +86,7 @@ NextRec is a modern recommendation framework built on PyTorch, delivering a unif
 
 NextRec adopts a modular and low-coupling engineering design, enabling full-pipeline reusability and scalability across data processing → model construction → training & evaluation → inference & deployment. Its core components include: a Feature-Spec-driven Embedding architecture, the BaseModel abstraction, a set of independent reusable Layers, a unified DataLoader for both training and inference, and a ready-to-use Model Zoo.
 
-
 
 > The project borrows ideas from excellent open-source rec libraries. Early layers referenced [torch-rechub](https://github.com/datawhalechina/torch-rechub) but have been replaced with in-house implementations. torch-rechub remains mature in architecture and models; the author contributed a bit there—feel free to check it out.
 
@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
 - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
 - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
 
-> Current version [0.3.6]:
+> Current version [0.4.2]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
@@ -196,6 +196,16 @@ metrics = model.evaluate(
 )
 ```
 
+## Platform Compatibility
+
+The current version is 0.4.2. All models and test code have been validated on the following platforms. If you encounter compatibility issues, please report them in the issue tracker with your system version:
+
+| Platform | Configuration |
+|----------|---------------|
+| MacOS latest | MacBook Pro M4 Pro 24GB RAM |
+| Ubuntu latest | AutoDL 4070D Dual GPU |
+| CentOS 7 | Intel Xeon 5138Y 96 cores 377GB RAM |
+
 ---
 
 ## Supported Models
````