nextrec 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +250 -112
  7. nextrec/basic/loggers.py +63 -44
  8. nextrec/basic/metrics.py +270 -120
  9. nextrec/basic/model.py +1084 -402
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +492 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +51 -45
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +273 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +103 -38
  23. nextrec/models/match/dssm.py +82 -68
  24. nextrec/models/match/dssm_v2.py +72 -57
  25. nextrec/models/match/mind.py +175 -107
  26. nextrec/models/match/sdm.py +104 -87
  27. nextrec/models/match/youtube_dnn.py +73 -59
  28. nextrec/models/multi_task/esmm.py +69 -46
  29. nextrec/models/multi_task/mmoe.py +91 -53
  30. nextrec/models/multi_task/ple.py +117 -58
  31. nextrec/models/multi_task/poso.py +163 -55
  32. nextrec/models/multi_task/share_bottom.py +63 -36
  33. nextrec/models/ranking/afm.py +80 -45
  34. nextrec/models/ranking/autoint.py +74 -57
  35. nextrec/models/ranking/dcn.py +110 -48
  36. nextrec/models/ranking/dcn_v2.py +265 -45
  37. nextrec/models/ranking/deepfm.py +39 -24
  38. nextrec/models/ranking/dien.py +335 -146
  39. nextrec/models/ranking/din.py +158 -92
  40. nextrec/models/ranking/fibinet.py +134 -52
  41. nextrec/models/ranking/fm.py +68 -26
  42. nextrec/models/ranking/masknet.py +95 -33
  43. nextrec/models/ranking/pnn.py +128 -58
  44. nextrec/models/ranking/widedeep.py +40 -28
  45. nextrec/models/ranking/xdeepfm.py +67 -40
  46. nextrec/utils/__init__.py +59 -34
  47. nextrec/utils/config.py +496 -0
  48. nextrec/utils/device.py +30 -20
  49. nextrec/utils/distributed.py +36 -9
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +33 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/model.py +22 -0
  55. nextrec/utils/optimizer.py +25 -9
  56. nextrec/utils/synthetic_data.py +283 -165
  57. nextrec/utils/tensor.py +24 -13
  58. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/METADATA +53 -24
  59. nextrec-0.4.3.dist-info/RECORD +69 -0
  60. nextrec-0.4.3.dist-info/entry_points.txt +2 -0
  61. nextrec-0.4.1.dist-info/RECORD +0 -66
  62. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/WHEEL +0 -0
  63. {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/licenses/LICENSE +0 -0
@@ -10,10 +10,8 @@ Author: Yang Zhou, zyaztec@gmail.com
 
 import numpy as np
 import pandas as pd
-from typing import Optional, Dict, List, Tuple, TYPE_CHECKING
+from typing import Optional, Dict, List, Tuple
 
-if TYPE_CHECKING:
-    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
 
 def generate_ranking_data(
     n_samples: int = 10000,
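
Note: the dropped `TYPE_CHECKING` block pairs with the function-local imports kept later in this file ("Import here to avoid circular import"). A minimal sketch of that deferred-import pattern, with a hypothetical `build_features` helper (only the module paths come from this diff):

```python
# Deferred (function-local) import: the name is resolved at call time, so this
# module stays importable even if the two modules would otherwise import each
# other at load time. build_features is a hypothetical illustration.
def build_features():
    from nextrec.basic.features import DenseFeature  # resolved lazily

    return [DenseFeature(name="dense_0", input_dim=1)]
```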
@@ -27,38 +25,38 @@ def generate_ranking_data(
     embedding_dim: int = 16,
     seed: int = 42,
     custom_sparse_features: Optional[Dict[str, int]] = None,
-    use_simple_names: bool = True
+    use_simple_names: bool = True,
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for ranking tasks (CTR prediction)
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
     print(f"Generating {n_samples} synthetic ranking samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     for i in range(n_dense):
-        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
-
+        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)
+
     # Generate basic sparse features (always include user_id and item_id)
-    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
-    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+
     # Generate additional sparse features
     if custom_sparse_features:
         for feat_name, vocab_size in custom_sparse_features.items():
             data[feat_name] = np.random.randint(0, vocab_size, n_samples)
     else:
         for i in range(n_sparse - 2):
-            data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
-
+            data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)
+
     # Generate sequence features (list of IDs)
     sequence_names = []
     sequence_vocabs = []
-
+
     for i in range(n_sequences):
         sequences = []
         for _ in range(n_samples):
@@ -68,77 +66,126 @@ def generate_ranking_data(
             seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
             seq_vocab = item_vocab_size
             if custom_sparse_features:
-                seq_name = 'hist_items'
+                seq_name = "hist_items"
             else:
-                seq_name = 'sequence_0'
+                seq_name = "sequence_0"
         else:
             # Other sequences use category vocabulary
-            if custom_sparse_features and 'category' in custom_sparse_features:
-                seq_vocab = custom_sparse_features['category']
+            if custom_sparse_features and "category" in custom_sparse_features:
+                seq_vocab = custom_sparse_features["category"]
                 seq = np.random.randint(0, seq_vocab, seq_len).tolist()
-                seq_name = f'hist_categories' if i == 1 else f'sequence_{i}'
+                seq_name = "hist_categories" if i == 1 else f"sequence_{i}"
             else:
                 seq_vocab = sparse_vocab_size
                 seq = np.random.randint(0, seq_vocab, seq_len).tolist()
-                seq_name = f'sequence_{i}'
-
+                seq_name = f"sequence_{i}"
+
         # Padding
         seq = seq + [0] * (sequence_max_len - len(seq))
         sequences.append(seq)
-
+
     data[seq_name] = sequences
     sequence_names.append(seq_name)
     sequence_vocabs.append(seq_vocab)
-
-    if 'gender' in data and 'dense_0' in data:
+
+    if "gender" in data and "dense_0" in data:
         # Complex label generation with feature correlation
-        label_probs = 1 / (1 + np.exp(-(
-            data['dense_0'] * 0.3 +
-            data['dense_1'] * 0.2 +
-            (data['gender'] - 0.5) * 0.5 +
-            np.random.randn(n_samples) * 0.1
-        )))
-        data['label'] = (label_probs > 0.5).astype(np.float32)
+        label_probs = 1 / (
+            1
+            + np.exp(
+                -(
+                    data["dense_0"] * 0.3
+                    + data["dense_1"] * 0.2
+                    + (data["gender"] - 0.5) * 0.5
+                    + np.random.randn(n_samples) * 0.1
+                )
+            )
+        )
+        data["label"] = (label_probs > 0.5).astype(np.float32)
     else:
-        data['label'] = np.random.randint(0, 2, n_samples).astype(np.float32)
-
+        data["label"] = np.random.randint(0, 2, n_samples).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
-    if 'gender' in data:
+    if "gender" in data:
         print(f"Positive rate: {data['label'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # Create feature definitions
     # Use input_dim for dense features to be compatible with both simple and complex scenarios
-    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
-
+    dense_features = [
+        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
+    ]
+
     # Create sparse features
-    sparse_features = [SparseFeature( name='user_id', embedding_name='user_emb', vocab_size=user_vocab_size, embedding_dim=embedding_dim),
-                       SparseFeature(name='item_id', embedding_name='item_emb', vocab_size=item_vocab_size, embedding_dim=embedding_dim),]
-
+    sparse_features = [
+        SparseFeature(
+            name="user_id",
+            embedding_name="user_emb",
+            vocab_size=user_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+        SparseFeature(
+            name="item_id",
+            embedding_name="item_emb",
+            vocab_size=item_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+    ]
+
     if custom_sparse_features:
         # Add custom sparse features with proper vocab sizes
         for feat_name, vocab_size in custom_sparse_features.items():
-            sparse_features.append(SparseFeature(name=feat_name, embedding_name=f'{feat_name}_emb', vocab_size=vocab_size, embedding_dim=embedding_dim))
+            sparse_features.append(
+                SparseFeature(
+                    name=feat_name,
+                    embedding_name=f"{feat_name}_emb",
+                    vocab_size=vocab_size,
+                    embedding_dim=embedding_dim,
+                )
+            )
     else:
         # Add generic sparse features
-        sparse_features.extend([SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb', vocab_size=sparse_vocab_size, embedding_dim=embedding_dim) for i in range(n_sparse - 2)])
-
+        sparse_features.extend(
+            [
+                SparseFeature(
+                    name=f"sparse_{i}",
+                    embedding_name=f"sparse_{i}_emb",
+                    vocab_size=sparse_vocab_size,
+                    embedding_dim=embedding_dim,
+                )
+                for i in range(n_sparse - 2)
+            ]
+        )
+
     # Create sequence features
     sequence_features = []
     for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
         if i == 0:
             # First sequence shares embedding with item_id
-            embedding_name = 'item_emb'
-        elif custom_sparse_features and 'category' in custom_sparse_features and seq_name == 'hist_categories':
+            embedding_name = "item_emb"
+        elif (
+            custom_sparse_features
+            and "category" in custom_sparse_features
+            and seq_name == "hist_categories"
+        ):
             # hist_categories shares embedding with category
-            embedding_name = 'category_emb'
+            embedding_name = "category_emb"
         else:
             # Other sequences share with sparse_0
-            embedding_name = 'sparse_0_emb'
-        sequence_features.append(SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len, embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name))
+            embedding_name = "sparse_0_emb"
+        sequence_features.append(
+            SequenceFeature(
+                name=seq_name,
+                vocab_size=seq_vocab,
+                max_len=sequence_max_len,
+                embedding_dim=embedding_dim,
+                padding_idx=0,
+                embedding_name=embedding_name,
+            )
+        )
     return df, dense_features, sparse_features, sequence_features
 
 
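The refactor above is largely Black-style reformatting plus a trailing comma on `use_simple_names`; the return contract is unchanged. A minimal usage sketch, assuming the defaults shown in the signature (argument values here are illustrative):

```python
from nextrec.utils.synthetic_data import generate_ranking_data

# Passing custom_sparse_features switches the first sequence's name
# from "sequence_0" to "hist_items", per the branch above.
df, dense_feats, sparse_feats, seq_feats = generate_ranking_data(
    n_samples=1000,
    custom_sparse_features={"gender": 2, "category": 100},
    seed=42,
)
print(df.shape, len(dense_feats), len(sparse_feats), len(seq_feats))
```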
@@ -154,29 +201,31 @@ def generate_match_data(
     sequence_max_len: int = 50,
     user_embedding_dim: int = 32,
     item_embedding_dim: int = 32,
-    seed: int = 42
+    seed: int = 42,
 ) -> Tuple[pd.DataFrame, List, List, List, List, List, List]:
     """
     Generate synthetic data for match/retrieval tasks
-
+
     Returns:
         tuple: (dataframe, user_dense_features, user_sparse_features, user_sequence_features,
                 item_dense_features, item_sparse_features, item_sequence_features)
     """
     print(f"Generating {n_samples} synthetic match samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     # User features
-    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
-    data['user_age'] = np.random.randn(n_samples).astype(np.float32)
-    data['user_gender'] = np.random.randint(0, 2, n_samples)
-    data['user_city'] = np.random.randint(0, city_vocab_size, n_samples)
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["user_age"] = np.random.randn(n_samples).astype(np.float32)
+    data["user_gender"] = np.random.randint(0, 2, n_samples)
+    data["user_city"] = np.random.randint(0, city_vocab_size, n_samples)
+
     for i in range(3):
-        data[f'user_feature_{i}'] = np.random.randint(1, user_feature_vocab_size, n_samples)
-
+        data[f"user_feature_{i}"] = np.random.randint(
+            1, user_feature_vocab_size, n_samples
+        )
+
     # User behavior sequences
     user_hist_items = []
     user_hist_categories = []
@@ -185,80 +234,122 @@ def generate_match_data(
         hist_items = np.random.randint(1, item_vocab_size, seq_len).tolist()
         hist_items = hist_items + [0] * (sequence_max_len - len(hist_items))
         user_hist_items.append(hist_items)
-
+
         hist_cats = np.random.randint(1, category_vocab_size, seq_len).tolist()
         hist_cats = hist_cats + [0] * (sequence_max_len - len(hist_cats))
         user_hist_categories.append(hist_cats)
-
-    data['user_hist_items'] = user_hist_items
-    data['user_hist_categories'] = user_hist_categories
-
+
+    data["user_hist_items"] = user_hist_items
+    data["user_hist_categories"] = user_hist_categories
+
     # Item features
-    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
-    data['item_price'] = np.random.randn(n_samples).astype(np.float32)
-    data['item_category'] = np.random.randint(1, category_vocab_size, n_samples)
-    data['item_brand'] = np.random.randint(1, brand_vocab_size, n_samples)
-
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+    data["item_price"] = np.random.randn(n_samples).astype(np.float32)
+    data["item_category"] = np.random.randint(1, category_vocab_size, n_samples)
+    data["item_brand"] = np.random.randint(1, brand_vocab_size, n_samples)
+
     for i in range(3):
-        data[f'item_feature_{i}'] = np.random.randint(1, item_feature_vocab_size, n_samples)
-
+        data[f"item_feature_{i}"] = np.random.randint(
+            1, item_feature_vocab_size, n_samples
+        )
+
     # Generate labels with some correlation to features
-    label_probs = 1 / (1 + np.exp(-(
-        data['user_age'] * 0.2 +
-        (data['user_gender'] - 0.5) * 0.3 +
-        data['item_price'] * 0.15 +
-        np.random.randn(n_samples) * 0.5
-    )))
-    data['label'] = (label_probs > 0.5).astype(np.float32)
-
+    label_probs = 1 / (
+        1
+        + np.exp(
+            -(
+                data["user_age"] * 0.2
+                + (data["user_gender"] - 0.5) * 0.3
+                + data["item_price"] * 0.15
+                + np.random.randn(n_samples) * 0.5
+            )
+        )
+    )
+    data["label"] = (label_probs > 0.5).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
     print(f"Positive rate: {data['label'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # User dense features
-    user_dense_features = [DenseFeature(name='user_age', input_dim=1)]
-
+    user_dense_features = [DenseFeature(name="user_age", input_dim=1)]
+
     # User sparse features
     user_sparse_features = [
-        SparseFeature(name='user_id', vocab_size=user_vocab_size, embedding_dim=user_embedding_dim),
-        SparseFeature(name='user_gender', vocab_size=2, embedding_dim=8),
-        SparseFeature(name='user_city', vocab_size=city_vocab_size, embedding_dim=16),
+        SparseFeature(
+            name="user_id", vocab_size=user_vocab_size, embedding_dim=user_embedding_dim
+        ),
+        SparseFeature(name="user_gender", vocab_size=2, embedding_dim=8),
+        SparseFeature(name="user_city", vocab_size=city_vocab_size, embedding_dim=16),
     ]
-    user_sparse_features.extend([
-        SparseFeature(name=f'user_feature_{i}', vocab_size=user_feature_vocab_size, embedding_dim=8)
-        for i in range(3)
-    ])
-
+    user_sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"user_feature_{i}",
+                vocab_size=user_feature_vocab_size,
+                embedding_dim=8,
+            )
+            for i in range(3)
+        ]
+    )
+
     # User sequence features
     user_sequence_features = [
-        SequenceFeature(name='user_hist_items', vocab_size=item_vocab_size,
-                        max_len=sequence_max_len, embedding_dim=user_embedding_dim, padding_idx=0),
-        SequenceFeature(name='user_hist_categories', vocab_size=category_vocab_size,
-                        max_len=sequence_max_len, embedding_dim=16, padding_idx=0),
+        SequenceFeature(
+            name="user_hist_items",
+            vocab_size=item_vocab_size,
+            max_len=sequence_max_len,
+            embedding_dim=user_embedding_dim,
+            padding_idx=0,
+        ),
+        SequenceFeature(
+            name="user_hist_categories",
+            vocab_size=category_vocab_size,
+            max_len=sequence_max_len,
+            embedding_dim=16,
+            padding_idx=0,
+        ),
     ]
-
+
     # Item dense features
-    item_dense_features = [DenseFeature(name='item_price', input_dim=1)]
-
+    item_dense_features = [DenseFeature(name="item_price", input_dim=1)]
+
     # Item sparse features
     item_sparse_features = [
-        SparseFeature(name='item_id', vocab_size=item_vocab_size, embedding_dim=item_embedding_dim),
-        SparseFeature(name='item_category', vocab_size=category_vocab_size, embedding_dim=16),
-        SparseFeature(name='item_brand', vocab_size=brand_vocab_size, embedding_dim=16),
+        SparseFeature(
+            name="item_id", vocab_size=item_vocab_size, embedding_dim=item_embedding_dim
+        ),
+        SparseFeature(
+            name="item_category", vocab_size=category_vocab_size, embedding_dim=16
+        ),
+        SparseFeature(name="item_brand", vocab_size=brand_vocab_size, embedding_dim=16),
     ]
-    item_sparse_features.extend([
-        SparseFeature(name=f'item_feature_{i}', vocab_size=item_feature_vocab_size, embedding_dim=8)
-        for i in range(3)
-    ])
-
+    item_sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"item_feature_{i}",
+                vocab_size=item_feature_vocab_size,
+                embedding_dim=8,
+            )
+            for i in range(3)
+        ]
+    )
+
     # Item sequence features (empty for most match models)
     item_sequence_features = []
-
-    return (df, user_dense_features, user_sparse_features, user_sequence_features,
-            item_dense_features, item_sparse_features, item_sequence_features)
+
+    return (
+        df,
+        user_dense_features,
+        user_sparse_features,
+        user_sequence_features,
+        item_dense_features,
+        item_sparse_features,
+        item_sequence_features,
+    )
 
 
 def generate_multitask_data(
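
`generate_match_data` likewise keeps its seven-element return tuple; only the formatting and quoting change. A minimal usage sketch (parameter values illustrative):

```python
from nextrec.utils.synthetic_data import generate_match_data

# Unpack the (df, user-side features x3, item-side features x3) tuple.
(
    df,
    user_dense, user_sparse, user_seq,
    item_dense, item_sparse, item_seq,
) = generate_match_data(n_samples=1000, seed=42)
print(df["label"].mean())  # positive rate, also printed by the generator
```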
@@ -271,34 +362,34 @@ def generate_multitask_data(
     sparse_vocab_size: int = 50,
     sequence_max_len: int = 20,
     embedding_dim: int = 16,
-    seed: int = 42
+    seed: int = 42,
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for multi-task learning
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
     print(f"Generating {n_samples} synthetic multi-task samples...")
-
+
     np.random.seed(seed)
     data = {}
-
+
     # Generate dense features
     for i in range(n_dense):
-        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
-
+        data[f"dense_{i}"] = np.random.randn(n_samples).astype(np.float32)
+
     # Generate sparse features
-    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
-    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
-
+    data["user_id"] = np.random.randint(1, user_vocab_size, n_samples)
+    data["item_id"] = np.random.randint(1, item_vocab_size, n_samples)
+
     for i in range(n_sparse - 2):
-        data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
-
+        data[f"sparse_{i}"] = np.random.randint(1, sparse_vocab_size, n_samples)
+
     # Generate sequence features
     sequence_names = []
     sequence_vocabs = []
-
+
     for i in range(n_sequences):
         sequences = []
         for _ in range(n_samples):
@@ -306,79 +397,101 @@ def generate_multitask_data(
             if i == 0:
                 seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
                 seq_vocab = item_vocab_size
-                seq_name = 'sequence_0'
+                seq_name = "sequence_0"
             else:
                 seq = np.random.randint(0, sparse_vocab_size, seq_len).tolist()
                 seq_vocab = sparse_vocab_size
-                seq_name = f'sequence_{i}'
-
+                seq_name = f"sequence_{i}"
+
             seq = seq + [0] * (sequence_max_len - len(seq))
             sequences.append(seq)
-
+
         data[seq_name] = sequences
         sequence_names.append(seq_name)
         sequence_vocabs.append(seq_vocab)
-
+
     # Generate multi-task labels with correlation
     # CTR (click) is relatively easier to predict
     ctr_logits = (
-        data['dense_0'] * 0.3 +
-        data['dense_1'] * 0.2 +
-        np.random.randn(n_samples) * 0.5
+        data["dense_0"] * 0.3 + data["dense_1"] * 0.2 + np.random.randn(n_samples) * 0.5
     )
-    data['click'] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)
-
+    data["click"] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)
+
     # CVR (conversion) depends on click and is harder
     cvr_logits = (
-        data['dense_2'] * 0.2 +
-        data['dense_3'] * 0.15 +
-        data['click'] * 1.5 +  # Strong dependency on click
-        np.random.randn(n_samples) * 0.8
+        data["dense_2"] * 0.2
+        + data["dense_3"] * 0.15
+        + data["click"] * 1.5  # Strong dependency on click
+        + np.random.randn(n_samples) * 0.8
     )
-    data['conversion'] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)
-
+    data["conversion"] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)
+
     # CTCVR = click AND conversion
-    data['ctcvr'] = (data['click'] * data['conversion']).astype(np.float32)
-
+    data["ctcvr"] = (data["click"] * data["conversion"]).astype(np.float32)
+
     df = pd.DataFrame(data)
     print(f"Generated data shape: {df.shape}")
     print(f"Click rate: {data['click'].mean():.4f}")
     print(f"Conversion rate (overall): {data['conversion'].mean():.4f}")
-    if data['click'].sum() > 0:
-        print(f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}")
+    if data["click"].sum() > 0:
+        print(
+            f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}"
+        )
     print(f"CTCVR rate: {data['ctcvr'].mean():.4f}")
-
+
     # Import here to avoid circular import
     from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
-
+
     # Create feature definitions
-    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
-
+    dense_features = [
+        DenseFeature(name=f"dense_{i}", input_dim=1) for i in range(n_dense)
+    ]
+
     # Create sparse features
     sparse_features = [
-        SparseFeature(name='user_id', embedding_name='user_emb',
-                      vocab_size=user_vocab_size, embedding_dim=embedding_dim),
-        SparseFeature(name='item_id', embedding_name='item_emb',
-                      vocab_size=item_vocab_size, embedding_dim=embedding_dim),
+        SparseFeature(
+            name="user_id",
+            embedding_name="user_emb",
+            vocab_size=user_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
+        SparseFeature(
+            name="item_id",
+            embedding_name="item_emb",
+            vocab_size=item_vocab_size,
+            embedding_dim=embedding_dim,
+        ),
     ]
-    sparse_features.extend([
-        SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb',
-                      vocab_size=sparse_vocab_size, embedding_dim=embedding_dim)
-        for i in range(n_sparse - 2)
-    ])
-
+    sparse_features.extend(
+        [
+            SparseFeature(
+                name=f"sparse_{i}",
+                embedding_name=f"sparse_{i}_emb",
+                vocab_size=sparse_vocab_size,
+                embedding_dim=embedding_dim,
+            )
+            for i in range(n_sparse - 2)
+        ]
+    )
+
     # Create sequence features
     sequence_features = []
     for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
         if i == 0:
-            embedding_name = 'item_emb'
+            embedding_name = "item_emb"
         else:
-            embedding_name = 'sparse_0_emb'
+            embedding_name = "sparse_0_emb"
         sequence_features.append(
-            SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len,
-                            embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name)
+            SequenceFeature(
+                name=seq_name,
+                vocab_size=seq_vocab,
+                max_len=sequence_max_len,
+                embedding_dim=embedding_dim,
+                padding_idx=0,
+                embedding_name=embedding_name,
+            )
         )
-
+
     return df, dense_features, sparse_features, sequence_features
 
 
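The multi-task labels form a cascade: `click` is sampled first, `conversion` leans heavily on `click` (the `* 1.5` term), and `ctcvr` is defined as their product, mirroring the pCTCVR = pCTR * pCVR decomposition that models such as ESMM assume. A small sanity-check sketch (illustrative, not part of the package):

```python
from nextrec.utils.synthetic_data import generate_multitask_data

df, dense_feats, sparse_feats, seq_feats = generate_multitask_data(n_samples=1000)
# ctcvr is click AND conversion by construction.
assert (df["ctcvr"] == df["click"] * df["conversion"]).all()
```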
@@ -394,7 +507,7 @@ def generate_distributed_ranking_data(
 ) -> Tuple[pd.DataFrame, List, List, List]:
     """
     Generate synthetic data for distributed training scenarios
-
+
     Returns:
         tuple: (dataframe, dense_features, sparse_features, sequence_features)
     """
@@ -408,6 +521,11 @@ def generate_distributed_ranking_data(
         sequence_max_len=max_seq_len,
         embedding_dim=embedding_dim,
         seed=seed,
-        custom_sparse_features={'gender': 2, 'age_group': 7, 'category': num_categories,'city': num_cities},
-        use_simple_names=False
+        custom_sparse_features={
+            "gender": 2,
+            "age_group": 7,
+            "category": num_categories,
+            "city": num_cities,
+        },
+        use_simple_names=False,
     )
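
The call reformatted above passes a fixed `custom_sparse_features` mapping (gender, age_group, category, city) with `use_simple_names=False`, consistent with delegating to `generate_ranking_data`. A usage sketch, assuming the remaining parameters of `generate_distributed_ranking_data` carry defaults like the other generators do:

```python
from nextrec.utils.synthetic_data import generate_distributed_ranking_data

df, dense_feats, sparse_feats, seq_feats = generate_distributed_ranking_data()
# The named sparse columns (gender, age_group, category, city) plus
# user_id/item_id should appear alongside the dense and sequence columns.
print(sorted(c for c in df.columns if not c.startswith(("dense_", "sequence_"))))
```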