nextrec 0.3.6__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/layers.py +32 -15
  3. nextrec/basic/model.py +435 -187
  4. nextrec/data/data_processing.py +31 -19
  5. nextrec/data/dataloader.py +40 -10
  6. nextrec/models/generative/hstu.py +3 -2
  7. nextrec/models/match/dssm.py +0 -1
  8. nextrec/models/match/dssm_v2.py +0 -1
  9. nextrec/models/match/mind.py +0 -1
  10. nextrec/models/match/sdm.py +0 -1
  11. nextrec/models/match/youtube_dnn.py +0 -1
  12. nextrec/models/multi_task/esmm.py +5 -7
  13. nextrec/models/multi_task/mmoe.py +10 -6
  14. nextrec/models/multi_task/ple.py +10 -6
  15. nextrec/models/multi_task/poso.py +9 -6
  16. nextrec/models/multi_task/share_bottom.py +10 -7
  17. nextrec/models/ranking/afm.py +113 -21
  18. nextrec/models/ranking/autoint.py +15 -9
  19. nextrec/models/ranking/dcn.py +8 -11
  20. nextrec/models/ranking/deepfm.py +5 -5
  21. nextrec/models/ranking/dien.py +4 -4
  22. nextrec/models/ranking/din.py +4 -4
  23. nextrec/models/ranking/fibinet.py +4 -4
  24. nextrec/models/ranking/fm.py +4 -4
  25. nextrec/models/ranking/masknet.py +4 -5
  26. nextrec/models/ranking/pnn.py +4 -4
  27. nextrec/models/ranking/widedeep.py +4 -4
  28. nextrec/models/ranking/xdeepfm.py +4 -4
  29. nextrec/utils/__init__.py +7 -3
  30. nextrec/utils/device.py +30 -0
  31. nextrec/utils/distributed.py +114 -0
  32. nextrec/utils/synthetic_data.py +413 -0
  33. {nextrec-0.3.6.dist-info → nextrec-0.4.1.dist-info}/METADATA +15 -5
  34. nextrec-0.4.1.dist-info/RECORD +66 -0
  35. nextrec-0.3.6.dist-info/RECORD +0 -64
  36. {nextrec-0.3.6.dist-info → nextrec-0.4.1.dist-info}/WHEEL +0 -0
  37. {nextrec-0.3.6.dist-info → nextrec-0.4.1.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/xdeepfm.py CHANGED
@@ -56,7 +56,7 @@ class xDeepFM(BaseModel):
         return "xDeepFM"
 
     @property
-    def task_type(self):
+    def default_task(self):
         return "binary"
 
     def __init__(self,
@@ -67,6 +67,7 @@ class xDeepFM(BaseModel):
                 cin_size: list[int] = [128, 128],
                 split_half: bool = True,
                 target: list[str] = [],
+                task: str | list[str] | None = None,
                 optimizer: str = "adam",
                 optimizer_params: dict = {},
                 loss: str | nn.Module | None = "bce",
@@ -83,13 +84,12 @@ class xDeepFM(BaseModel):
            sparse_features=sparse_features,
            sequence_features=sequence_features,
            target=target,
-           task=self.task_type,
+           task=task or self.default_task,
            device=device,
            embedding_l1_reg=embedding_l1_reg,
            dense_l1_reg=dense_l1_reg,
            embedding_l2_reg=embedding_l2_reg,
            dense_l2_reg=dense_l2_reg,
-           early_stop_patience=20,
            **kwargs
        )
 
@@ -118,7 +118,7 @@ class xDeepFM(BaseModel):
        deep_emb_dim_total = sum([f.embedding_dim for f in self.deep_features if not isinstance(f, DenseFeature)])
        dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
        self.mlp = MLP(input_dim=deep_emb_dim_total + dense_input_dim, **mlp_params)
-       self.prediction_layer = PredictionLayer(task_type=self.task_type)
+       self.prediction_layer = PredictionLayer(task_type=self.task)
 
        # Register regularization weights
        self.register_regularization_weights(
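The net effect of the hunks above: task_type becomes default_task, a task argument is threaded through to the base class, and the hard-coded early_stop_patience=20 override is dropped. A minimal sketch of how the new per-instance task override might be used; the dense_features/sparse_features/sequence_features keyword names are assumptions inferred from the super().__init__ call, not a documented example:

```python
# Hypothetical sketch; only `target` and `task` appear in the hunk above,
# the remaining keyword names are assumed from the super().__init__ call.
from nextrec.utils.synthetic_data import generate_ranking_data
from nextrec.models.ranking.xdeepfm import xDeepFM

df, dense_features, sparse_features, sequence_features = generate_ranking_data(n_samples=1000)

model = xDeepFM(
    dense_features=dense_features,
    sparse_features=sparse_features,
    sequence_features=sequence_features,
    target=["label"],
    task="binary",   # new in 0.4.1; omit (or pass None) to fall back to default_task
)
```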
nextrec/utils/__init__.py CHANGED
@@ -12,10 +12,10 @@ This package provides various utility functions organized by category:
 - feature_utils: Feature processing utilities
 
 Date: create on 13/11/2025
-Last update: 03/12/2025 (refactored)
+Last update: 06/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
-
+from . import optimizer, initializer, embedding
 from .optimizer import get_optimizer, get_scheduler
 from .initializer import get_initializer
 from .embedding import get_auto_embedding_dim
@@ -24,7 +24,7 @@ from .tensor import to_tensor, stack_tensors, concat_tensors, pad_sequence_tenso
 from .file import resolve_file_paths, read_table, load_dataframes, iter_file_chunks, default_output_dir
 from .model import merge_features, get_mlp_output_dim
 from .feature import normalize_to_list
-from . import optimizer, initializer, embedding
+from .synthetic_data import generate_ranking_data, generate_distributed_ranking_data, generate_match_data, generate_multitask_data
 
 __all__ = [
     # Optimizer & Scheduler
@@ -61,6 +61,10 @@ __all__ = [
     # Feature utilities
     'normalize_to_list',
 
+    # Synthetic data utilities
+    'generate_ranking_data',
+    'generate_distributed_ranking_data',
+
     # Module exports
     'optimizer',
     'initializer',
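With these export changes, the synthetic-data generators become importable from the package root (note that only generate_ranking_data and generate_distributed_ranking_data are added to __all__, although all four are imported). A minimal sketch, assuming nothing beyond the signatures shown in synthetic_data.py later in this diff:

```python
# Package-level import path added in 0.4.1; return shape follows the
# generate_ranking_data signature shown further down in this diff.
from nextrec.utils import generate_ranking_data

df, dense_feats, sparse_feats, seq_feats = generate_ranking_data(n_samples=1000)
print(df.shape, len(dense_feats), len(sparse_feats), len(seq_feats))
```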
nextrec/utils/device.py CHANGED
@@ -7,6 +7,7 @@ Author: Yang Zhou, zyaztec@gmail.com
 import os
 import torch
 import platform
+import logging
 import multiprocessing
 
 
@@ -36,3 +37,32 @@ def get_device_info() -> dict:
         info['cuda_capability'] = torch.cuda.get_device_capability(0)
 
     return info
+
+def configure_device(
+    distributed: bool,
+    local_rank: int,
+    base_device: torch.device | str = "cpu"
+) -> torch.device:
+    try:
+        device = torch.device(base_device)
+    except Exception:
+        logging.warning("[configure_device Warning] Invalid base_device, falling back to CPU.")
+        return torch.device("cpu")
+
+    if distributed:
+        if device.type == "cuda":
+            if not torch.cuda.is_available():
+                logging.warning("[Distributed Warning] CUDA requested but unavailable. Falling back to CPU.")
+                return torch.device("cpu")
+            if not (0 <= local_rank < torch.cuda.device_count()):
+                logging.warning(f"[Distributed Warning] local_rank {local_rank} is invalid for available CUDA devices. Falling back to CPU.")
+                return torch.device("cpu")
+            try:
+                torch.cuda.set_device(local_rank)
+                return torch.device(f"cuda:{local_rank}")
+            except Exception as exc:
+                logging.warning(f"[Distributed Warning] Failed to set CUDA device for local_rank {local_rank}: {exc}. Falling back to CPU.")
+                return torch.device("cpu")
+        else:
+            return torch.device("cpu")
+    return device
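configure_device resolves a per-process device and falls back to CPU whenever the request cannot be honoured (invalid base_device, CUDA unavailable, or an out-of-range local_rank). A hypothetical usage sketch; reading LOCAL_RANK from the environment is an assumption about the launcher, not something this diff documents:

```python
# Hypothetical usage, assuming LOCAL_RANK is set by the launcher (e.g. torchrun).
import os
from nextrec.utils.device import configure_device

local_rank = int(os.environ.get("LOCAL_RANK", 0))
device = configure_device(distributed=True, local_rank=local_rank, base_device="cuda")
# Returns cuda:<local_rank> when CUDA and the rank are valid, otherwise torch.device("cpu").
print(device)
```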
nextrec/utils/distributed.py ADDED
@@ -0,0 +1,114 @@
+"""
+Distributed utilities for NextRec.
+
+Date: create on 04/12/2025
+Checkpoint: edit on 05/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+import logging
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from torch.utils.data import DataLoader, IterableDataset
+from torch.utils.data.distributed import DistributedSampler
+from nextrec.basic.loggers import colorize
+
+def init_process_group(distributed: bool, rank: int, world_size: int, device_id: int | None = None) -> None:
+    """
+    initialize distributed process group for multi-GPU training.
+
+    Args:
+        distributed: whether to enable distributed training
+        rank: global rank of the current process
+        world_size: total number of processes
+    """
+    if (not distributed) or (not dist.is_available()) or dist.is_initialized():
+        return
+    backend = "nccl" if device_id is not None else "gloo"
+    if backend == "nccl":
+        torch.cuda.set_device(device_id)
+    dist.init_process_group(backend=backend, init_method="env://", rank=rank, world_size=world_size)
+
+def gather_numpy(self, array: np.ndarray | None) -> np.ndarray | None:
+    """
+    Gather numpy arrays (or None) across ranks. Uses all_gather_object to avoid
+    shape mismatches and ensures every rank participates even when local data is empty.
+    """
+    if not (self.distributed and dist.is_available() and dist.is_initialized()):
+        return array
+
+    world_size = dist.get_world_size()
+    gathered: list[np.ndarray | None] = [None for _ in range(world_size)]
+    dist.all_gather_object(gathered, array)
+    pieces: list[np.ndarray] = []
+    for item in gathered:
+        if item is None:
+            continue
+        item_np = np.asarray(item)
+        if item_np.size > 0:
+            pieces.append(item_np)
+    if not pieces:
+        return None
+    return np.concatenate(pieces, axis=0)
+
+def add_distributed_sampler(
+    loader: DataLoader,
+    distributed: bool,
+    world_size: int,
+    rank: int,
+    shuffle: bool,
+    drop_last: bool,
+    default_batch_size: int,
+    is_main_process: bool = False,
+) -> tuple[DataLoader, DistributedSampler | None]:
+    """
+    add distributedsampler to a dataloader, this for distributed training
+    when each device has its own dataloader
+    """
+    # early return if not distributed
+    if not (distributed and dist.is_available() and dist.is_initialized()):
+        return loader, None
+    # return if already has DistributedSampler
+    if isinstance(loader.sampler, DistributedSampler):
+        return loader, loader.sampler
+    dataset = getattr(loader, "dataset", None)
+    if dataset is None:
+        return loader, None
+    if isinstance(dataset, IterableDataset):
+        if is_main_process:
+            logging.info(colorize("[Distributed Info] Iterable/streaming DataLoader provided; DistributedSampler is skipped. Ensure dataset handles sharding per rank.", color="yellow"))
+        return loader, None
+    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last)
+    loader_kwargs = {
+        "batch_size": loader.batch_size if loader.batch_size is not None else default_batch_size,
+        "shuffle": False,
+        "sampler": sampler,
+        "num_workers": loader.num_workers,
+        "collate_fn": loader.collate_fn,
+        "drop_last": drop_last,
+    }
+    if getattr(loader, "pin_memory", False):
+        loader_kwargs["pin_memory"] = True
+        pin_memory_device = getattr(loader, "pin_memory_device", None)
+        if pin_memory_device:
+            loader_kwargs["pin_memory_device"] = pin_memory_device
+    timeout = getattr(loader, "timeout", None)
+    if timeout:
+        loader_kwargs["timeout"] = timeout
+    worker_init_fn = getattr(loader, "worker_init_fn", None)
+    if worker_init_fn is not None:
+        loader_kwargs["worker_init_fn"] = worker_init_fn
+    generator = getattr(loader, "generator", None)
+    if generator is not None:
+        loader_kwargs["generator"] = generator
+    if loader.num_workers > 0:
+        loader_kwargs["persistent_workers"] = getattr(loader, "persistent_workers", False)
+        prefetch_factor = getattr(loader, "prefetch_factor", None)
+        if prefetch_factor is not None:
+            loader_kwargs["prefetch_factor"] = prefetch_factor
+    distributed_loader = DataLoader(dataset, **loader_kwargs)
+    if is_main_process:
+        logging.info(colorize("[Distributed Info] Attached DistributedSampler to provided DataLoader", color="cyan"))
+    return distributed_loader, sampler
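init_process_group selects the nccl backend when a device_id is supplied and gloo otherwise, and add_distributed_sampler rebuilds a DataLoader around a DistributedSampler while carrying over the original loader's settings. A hypothetical wiring of the two under torchrun; the environment variables and the toy TensorDataset are illustrative assumptions:

```python
# Hypothetical sketch, assuming RANK/WORLD_SIZE/LOCAL_RANK are set by the launcher.
import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from nextrec.utils.distributed import init_process_group, add_distributed_sampler

rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))

# Pass device_id only when CUDA is usable so the helper picks gloo instead of nccl on CPU.
init_process_group(distributed=True, rank=rank, world_size=world_size,
                   device_id=local_rank if torch.cuda.is_available() else None)

dataset = TensorDataset(torch.randn(1024, 8), torch.randint(0, 2, (1024,)))
loader = DataLoader(dataset, batch_size=128, shuffle=True)
loader, sampler = add_distributed_sampler(
    loader, distributed=True, world_size=world_size, rank=rank,
    shuffle=True, drop_last=False, default_batch_size=128,
    is_main_process=(rank == 0),
)
if sampler is not None:
    sampler.set_epoch(0)  # call once per epoch so every rank reshuffles consistently
```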
nextrec/utils/synthetic_data.py ADDED
@@ -0,0 +1,413 @@
+"""
+Synthetic Data Generation Utilities
+
+This module provides utilities for generating synthetic datasets for testing
+and tutorial purposes in the NextRec framework.
+
+Date: create on 06/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+import numpy as np
+import pandas as pd
+from typing import Optional, Dict, List, Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+def generate_ranking_data(
+    n_samples: int = 10000,
+    n_dense: int = 5,
+    n_sparse: int = 8,
+    n_sequences: int = 2,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 500,
+    sparse_vocab_size: int = 50,
+    sequence_max_len: int = 20,
+    embedding_dim: int = 16,
+    seed: int = 42,
+    custom_sparse_features: Optional[Dict[str, int]] = None,
+    use_simple_names: bool = True
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for ranking tasks (CTR prediction)
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic ranking samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    for i in range(n_dense):
+        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
+
+    # Generate basic sparse features (always include user_id and item_id)
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+
+    # Generate additional sparse features
+    if custom_sparse_features:
+        for feat_name, vocab_size in custom_sparse_features.items():
+            data[feat_name] = np.random.randint(0, vocab_size, n_samples)
+    else:
+        for i in range(n_sparse - 2):
+            data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
+
+    # Generate sequence features (list of IDs)
+    sequence_names = []
+    sequence_vocabs = []
+
+    for i in range(n_sequences):
+        sequences = []
+        for _ in range(n_samples):
+            seq_len = np.random.randint(5, sequence_max_len + 1)
+            if i == 0:
+                # First sequence uses item vocabulary
+                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
+                seq_vocab = item_vocab_size
+                if custom_sparse_features:
+                    seq_name = 'hist_items'
+                else:
+                    seq_name = 'sequence_0'
+            else:
+                # Other sequences use category vocabulary
+                if custom_sparse_features and 'category' in custom_sparse_features:
+                    seq_vocab = custom_sparse_features['category']
+                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
+                    seq_name = f'hist_categories' if i == 1 else f'sequence_{i}'
+                else:
+                    seq_vocab = sparse_vocab_size
+                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
+                    seq_name = f'sequence_{i}'
+
+            # Padding
+            seq = seq + [0] * (sequence_max_len - len(seq))
+            sequences.append(seq)
+
+        data[seq_name] = sequences
+        sequence_names.append(seq_name)
+        sequence_vocabs.append(seq_vocab)
+
+    if 'gender' in data and 'dense_0' in data:
+        # Complex label generation with feature correlation
+        label_probs = 1 / (1 + np.exp(-(
+            data['dense_0'] * 0.3 +
+            data['dense_1'] * 0.2 +
+            (data['gender'] - 0.5) * 0.5 +
+            np.random.randn(n_samples) * 0.1
+        )))
+        data['label'] = (label_probs > 0.5).astype(np.float32)
+    else:
+        data['label'] = np.random.randint(0, 2, n_samples).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    if 'gender' in data:
+        print(f"Positive rate: {data['label'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # Create feature definitions
+    # Use input_dim for dense features to be compatible with both simple and complex scenarios
+    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
+
+    # Create sparse features
+    sparse_features = [SparseFeature(name='user_id', embedding_name='user_emb', vocab_size=user_vocab_size, embedding_dim=embedding_dim),
+                       SparseFeature(name='item_id', embedding_name='item_emb', vocab_size=item_vocab_size, embedding_dim=embedding_dim),]
+
+    if custom_sparse_features:
+        # Add custom sparse features with proper vocab sizes
+        for feat_name, vocab_size in custom_sparse_features.items():
+            sparse_features.append(SparseFeature(name=feat_name, embedding_name=f'{feat_name}_emb', vocab_size=vocab_size, embedding_dim=embedding_dim))
+    else:
+        # Add generic sparse features
+        sparse_features.extend([SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb', vocab_size=sparse_vocab_size, embedding_dim=embedding_dim) for i in range(n_sparse - 2)])
+
+    # Create sequence features
+    sequence_features = []
+    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
+        if i == 0:
+            # First sequence shares embedding with item_id
+            embedding_name = 'item_emb'
+        elif custom_sparse_features and 'category' in custom_sparse_features and seq_name == 'hist_categories':
+            # hist_categories shares embedding with category
+            embedding_name = 'category_emb'
+        else:
+            # Other sequences share with sparse_0
+            embedding_name = 'sparse_0_emb'
+        sequence_features.append(SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len, embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name))
+    return df, dense_features, sparse_features, sequence_features
+
+
+def generate_match_data(
+    n_samples: int = 10000,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 5000,
+    category_vocab_size: int = 100,
+    brand_vocab_size: int = 200,
+    city_vocab_size: int = 100,
+    user_feature_vocab_size: int = 50,
+    item_feature_vocab_size: int = 50,
+    sequence_max_len: int = 50,
+    user_embedding_dim: int = 32,
+    item_embedding_dim: int = 32,
+    seed: int = 42
+) -> Tuple[pd.DataFrame, List, List, List, List, List, List]:
+    """
+    Generate synthetic data for match/retrieval tasks
+
+    Returns:
+        tuple: (dataframe, user_dense_features, user_sparse_features, user_sequence_features,
+                item_dense_features, item_sparse_features, item_sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic match samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    # User features
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['user_age'] = np.random.randn(n_samples).astype(np.float32)
+    data['user_gender'] = np.random.randint(0, 2, n_samples)
+    data['user_city'] = np.random.randint(0, city_vocab_size, n_samples)
+
+    for i in range(3):
+        data[f'user_feature_{i}'] = np.random.randint(1, user_feature_vocab_size, n_samples)
+
+    # User behavior sequences
+    user_hist_items = []
+    user_hist_categories = []
+    for _ in range(n_samples):
+        seq_len = np.random.randint(10, sequence_max_len + 1)
+        hist_items = np.random.randint(1, item_vocab_size, seq_len).tolist()
+        hist_items = hist_items + [0] * (sequence_max_len - len(hist_items))
+        user_hist_items.append(hist_items)
+
+        hist_cats = np.random.randint(1, category_vocab_size, seq_len).tolist()
+        hist_cats = hist_cats + [0] * (sequence_max_len - len(hist_cats))
+        user_hist_categories.append(hist_cats)
+
+    data['user_hist_items'] = user_hist_items
+    data['user_hist_categories'] = user_hist_categories
+
+    # Item features
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+    data['item_price'] = np.random.randn(n_samples).astype(np.float32)
+    data['item_category'] = np.random.randint(1, category_vocab_size, n_samples)
+    data['item_brand'] = np.random.randint(1, brand_vocab_size, n_samples)
+
+    for i in range(3):
+        data[f'item_feature_{i}'] = np.random.randint(1, item_feature_vocab_size, n_samples)
+
+    # Generate labels with some correlation to features
+    label_probs = 1 / (1 + np.exp(-(
+        data['user_age'] * 0.2 +
+        (data['user_gender'] - 0.5) * 0.3 +
+        data['item_price'] * 0.15 +
+        np.random.randn(n_samples) * 0.5
+    )))
+    data['label'] = (label_probs > 0.5).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    print(f"Positive rate: {data['label'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # User dense features
+    user_dense_features = [DenseFeature(name='user_age', input_dim=1)]
+
+    # User sparse features
+    user_sparse_features = [
+        SparseFeature(name='user_id', vocab_size=user_vocab_size, embedding_dim=user_embedding_dim),
+        SparseFeature(name='user_gender', vocab_size=2, embedding_dim=8),
+        SparseFeature(name='user_city', vocab_size=city_vocab_size, embedding_dim=16),
+    ]
+    user_sparse_features.extend([
+        SparseFeature(name=f'user_feature_{i}', vocab_size=user_feature_vocab_size, embedding_dim=8)
+        for i in range(3)
+    ])
+
+    # User sequence features
+    user_sequence_features = [
+        SequenceFeature(name='user_hist_items', vocab_size=item_vocab_size,
+                        max_len=sequence_max_len, embedding_dim=user_embedding_dim, padding_idx=0),
+        SequenceFeature(name='user_hist_categories', vocab_size=category_vocab_size,
+                        max_len=sequence_max_len, embedding_dim=16, padding_idx=0),
+    ]
+
+    # Item dense features
+    item_dense_features = [DenseFeature(name='item_price', input_dim=1)]
+
+    # Item sparse features
+    item_sparse_features = [
+        SparseFeature(name='item_id', vocab_size=item_vocab_size, embedding_dim=item_embedding_dim),
+        SparseFeature(name='item_category', vocab_size=category_vocab_size, embedding_dim=16),
+        SparseFeature(name='item_brand', vocab_size=brand_vocab_size, embedding_dim=16),
+    ]
+    item_sparse_features.extend([
+        SparseFeature(name=f'item_feature_{i}', vocab_size=item_feature_vocab_size, embedding_dim=8)
+        for i in range(3)
+    ])
+
+    # Item sequence features (empty for most match models)
+    item_sequence_features = []
+
+    return (df, user_dense_features, user_sparse_features, user_sequence_features,
+            item_dense_features, item_sparse_features, item_sequence_features)
+
+
+def generate_multitask_data(
+    n_samples: int = 10000,
+    n_dense: int = 5,
+    n_sparse: int = 8,
+    n_sequences: int = 2,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 500,
+    sparse_vocab_size: int = 50,
+    sequence_max_len: int = 20,
+    embedding_dim: int = 16,
+    seed: int = 42
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for multi-task learning
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic multi-task samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    # Generate dense features
+    for i in range(n_dense):
+        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
+
+    # Generate sparse features
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+
+    for i in range(n_sparse - 2):
+        data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
+
+    # Generate sequence features
+    sequence_names = []
+    sequence_vocabs = []
+
+    for i in range(n_sequences):
+        sequences = []
+        for _ in range(n_samples):
+            seq_len = np.random.randint(5, sequence_max_len + 1)
+            if i == 0:
+                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
+                seq_vocab = item_vocab_size
+                seq_name = 'sequence_0'
+            else:
+                seq = np.random.randint(0, sparse_vocab_size, seq_len).tolist()
+                seq_vocab = sparse_vocab_size
+                seq_name = f'sequence_{i}'
+
+            seq = seq + [0] * (sequence_max_len - len(seq))
+            sequences.append(seq)
+
+        data[seq_name] = sequences
+        sequence_names.append(seq_name)
+        sequence_vocabs.append(seq_vocab)
+
+    # Generate multi-task labels with correlation
+    # CTR (click) is relatively easier to predict
+    ctr_logits = (
+        data['dense_0'] * 0.3 +
+        data['dense_1'] * 0.2 +
+        np.random.randn(n_samples) * 0.5
+    )
+    data['click'] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)
+
+    # CVR (conversion) depends on click and is harder
+    cvr_logits = (
+        data['dense_2'] * 0.2 +
+        data['dense_3'] * 0.15 +
+        data['click'] * 1.5 +  # Strong dependency on click
+        np.random.randn(n_samples) * 0.8
+    )
+    data['conversion'] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)
+
+    # CTCVR = click AND conversion
+    data['ctcvr'] = (data['click'] * data['conversion']).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    print(f"Click rate: {data['click'].mean():.4f}")
+    print(f"Conversion rate (overall): {data['conversion'].mean():.4f}")
+    if data['click'].sum() > 0:
+        print(f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}")
+    print(f"CTCVR rate: {data['ctcvr'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # Create feature definitions
+    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
+
+    # Create sparse features
+    sparse_features = [
+        SparseFeature(name='user_id', embedding_name='user_emb',
+                      vocab_size=user_vocab_size, embedding_dim=embedding_dim),
+        SparseFeature(name='item_id', embedding_name='item_emb',
+                      vocab_size=item_vocab_size, embedding_dim=embedding_dim),
+    ]
+    sparse_features.extend([
+        SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb',
+                      vocab_size=sparse_vocab_size, embedding_dim=embedding_dim)
+        for i in range(n_sparse - 2)
+    ])
+
+    # Create sequence features
+    sequence_features = []
+    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
+        if i == 0:
+            embedding_name = 'item_emb'
+        else:
+            embedding_name = 'sparse_0_emb'
+        sequence_features.append(
+            SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len,
+                            embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name)
+        )
+
+    return df, dense_features, sparse_features, sequence_features
+
+
+def generate_distributed_ranking_data(
+    num_samples: int = 100000,
+    num_users: int = 10000,
+    num_items: int = 5000,
+    num_categories: int = 20,
+    num_cities: int = 100,
+    max_seq_len: int = 50,
+    embedding_dim: int = 32,
+    seed: int = 42,
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for distributed training scenarios
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    return generate_ranking_data(
+        n_samples=num_samples,
+        n_dense=5,
+        n_sparse=6,  # user_id, item_id + 4 custom features
+        n_sequences=2,
+        user_vocab_size=num_users + 1,
+        item_vocab_size=num_items + 1,
+        sequence_max_len=max_seq_len,
+        embedding_dim=embedding_dim,
+        seed=seed,
+        custom_sparse_features={'gender': 2, 'age_group': 7, 'category': num_categories, 'city': num_cities},
+        use_simple_names=False
+    )
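generate_multitask_data emits correlated click/conversion/ctcvr labels (CTCVR is the product of the other two), and generate_distributed_ranking_data is a thin wrapper over generate_ranking_data with a fixed set of custom sparse features, so its label is correlated with dense_0, dense_1 and gender. A short sketch of calling the two; the sample sizes and printouts are illustrative, not part of the package:

```python
# Sketch based on the signatures added above; nothing here beyond the two calls
# is documented by the package.
from nextrec.utils.synthetic_data import generate_multitask_data, generate_distributed_ranking_data

mt_df, dense_feats, sparse_feats, seq_feats = generate_multitask_data(n_samples=20000)
print(mt_df[["click", "conversion", "ctcvr"]].mean())  # ctcvr == click * conversion

dist_df, d_feats, s_feats, q_feats = generate_distributed_ranking_data(num_samples=5000, num_users=1000, num_items=500)
print(dist_df.columns.tolist())  # includes gender, age_group, category, city, hist_items, hist_categories
```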