nextrec-0.3.5-py3-none-any.whl → nextrec-0.4.1-py3-none-any.whl
- nextrec/__init__.py +0 -30
- nextrec/__version__.py +1 -1
- nextrec/basic/layers.py +32 -15
- nextrec/basic/loggers.py +1 -1
- nextrec/basic/model.py +440 -189
- nextrec/basic/session.py +4 -2
- nextrec/data/__init__.py +0 -25
- nextrec/data/data_processing.py +31 -19
- nextrec/data/dataloader.py +51 -16
- nextrec/models/generative/__init__.py +0 -5
- nextrec/models/generative/hstu.py +3 -2
- nextrec/models/match/__init__.py +0 -13
- nextrec/models/match/dssm.py +0 -1
- nextrec/models/match/dssm_v2.py +0 -1
- nextrec/models/match/mind.py +0 -1
- nextrec/models/match/sdm.py +0 -1
- nextrec/models/match/youtube_dnn.py +0 -1
- nextrec/models/multi_task/__init__.py +0 -0
- nextrec/models/multi_task/esmm.py +5 -7
- nextrec/models/multi_task/mmoe.py +10 -6
- nextrec/models/multi_task/ple.py +10 -6
- nextrec/models/multi_task/poso.py +9 -6
- nextrec/models/multi_task/share_bottom.py +10 -7
- nextrec/models/ranking/__init__.py +0 -27
- nextrec/models/ranking/afm.py +113 -21
- nextrec/models/ranking/autoint.py +15 -9
- nextrec/models/ranking/dcn.py +8 -11
- nextrec/models/ranking/deepfm.py +5 -5
- nextrec/models/ranking/dien.py +4 -4
- nextrec/models/ranking/din.py +4 -4
- nextrec/models/ranking/fibinet.py +4 -4
- nextrec/models/ranking/fm.py +4 -4
- nextrec/models/ranking/masknet.py +4 -5
- nextrec/models/ranking/pnn.py +4 -4
- nextrec/models/ranking/widedeep.py +4 -4
- nextrec/models/ranking/xdeepfm.py +4 -4
- nextrec/utils/__init__.py +7 -3
- nextrec/utils/device.py +32 -1
- nextrec/utils/distributed.py +114 -0
- nextrec/utils/synthetic_data.py +413 -0
- {nextrec-0.3.5.dist-info → nextrec-0.4.1.dist-info}/METADATA +15 -5
- nextrec-0.4.1.dist-info/RECORD +66 -0
- nextrec-0.3.5.dist-info/RECORD +0 -63
- {nextrec-0.3.5.dist-info → nextrec-0.4.1.dist-info}/WHEEL +0 -0
- {nextrec-0.3.5.dist-info → nextrec-0.4.1.dist-info}/licenses/LICENSE +0 -0
nextrec/models/ranking/widedeep.py
CHANGED

@@ -53,7 +53,7 @@ class WideDeep(BaseModel):
         return "WideDeep"
 
     @property
-    def
+    def default_task(self):
        return "binary"
 
     def __init__(self,
@@ -62,6 +62,7 @@ class WideDeep(BaseModel):
                 sequence_features: list[SequenceFeature],
                 mlp_params: dict,
                 target: list[str] = [],
+                task: str | list[str] | None = None,
                 optimizer: str = "adam",
                 optimizer_params: dict = {},
                 loss: str | nn.Module | None = "bce",
@@ -78,13 +79,12 @@ class WideDeep(BaseModel):
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            early_stop_patience=20,
             **kwargs
         )
 
@@ -109,7 +109,7 @@ class WideDeep(BaseModel):
         # deep_emb_dim_total = sum([f.embedding_dim for f in self.deep_features if not isinstance(f, DenseFeature)])
         # dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
         self.mlp = MLP(input_dim=input_dim, **mlp_params)
-        self.prediction_layer = PredictionLayer(task_type=self.
+        self.prediction_layer = PredictionLayer(task_type=self.task)
         # Register regularization weights
         self.register_regularization_weights(embedding_attr='embedding', include_modules=['linear', 'mlp'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

nextrec/models/ranking/xdeepfm.py
CHANGED

@@ -56,7 +56,7 @@ class xDeepFM(BaseModel):
         return "xDeepFM"
 
     @property
-    def
+    def default_task(self):
        return "binary"
 
     def __init__(self,
@@ -67,6 +67,7 @@ class xDeepFM(BaseModel):
                 cin_size: list[int] = [128, 128],
                 split_half: bool = True,
                 target: list[str] = [],
+                task: str | list[str] | None = None,
                 optimizer: str = "adam",
                 optimizer_params: dict = {},
                 loss: str | nn.Module | None = "bce",
@@ -83,13 +84,12 @@ class xDeepFM(BaseModel):
             sparse_features=sparse_features,
             sequence_features=sequence_features,
             target=target,
-            task=self.
+            task=task or self.default_task,
             device=device,
             embedding_l1_reg=embedding_l1_reg,
             dense_l1_reg=dense_l1_reg,
             embedding_l2_reg=embedding_l2_reg,
             dense_l2_reg=dense_l2_reg,
-            early_stop_patience=20,
             **kwargs
         )
 
@@ -118,7 +118,7 @@ class xDeepFM(BaseModel):
         deep_emb_dim_total = sum([f.embedding_dim for f in self.deep_features if not isinstance(f, DenseFeature)])
         dense_input_dim = sum([getattr(f, "embedding_dim", 1) or 1 for f in dense_features])
         self.mlp = MLP(input_dim=deep_emb_dim_total + dense_input_dim, **mlp_params)
-        self.prediction_layer = PredictionLayer(task_type=self.
+        self.prediction_layer = PredictionLayer(task_type=self.task)
 
         # Register regularization weights
         self.register_regularization_weights(
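The same pattern runs across the ranking models in 0.4.1: the constructor gains a `task` argument that falls back to the model's `default_task` property. A minimal sketch of how the new argument could be used; everything besides the `task` kwarg and the `default_task` fallback (the other constructor kwargs, the `mlp_params` keys) is an assumption for illustration, not taken from the diff.

```python
# Hypothetical usage of the `task` argument added in 0.4.1.
from nextrec.models.ranking.widedeep import WideDeep
from nextrec.utils.synthetic_data import generate_ranking_data

# Synthetic data helper shipped in this release (see nextrec/utils/synthetic_data.py below).
df, dense_features, sparse_features, sequence_features = generate_ranking_data(n_samples=1000)

model = WideDeep(
    dense_features=dense_features,        # assumed constructor kwargs, mirroring the super().__init__ call
    sparse_features=sparse_features,
    sequence_features=sequence_features,
    mlp_params={"dims": [64, 32]},        # illustrative value; key names are an assumption
    target=["label"],
    task="binary",                        # new in 0.4.1; omitting it falls back to default_task
)
```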
nextrec/utils/__init__.py
CHANGED

@@ -12,10 +12,10 @@ This package provides various utility functions organized by category:
 - feature_utils: Feature processing utilities
 
 Date: create on 13/11/2025
-Last update:
+Last update: 06/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
-
+from . import optimizer, initializer, embedding
 from .optimizer import get_optimizer, get_scheduler
 from .initializer import get_initializer
 from .embedding import get_auto_embedding_dim
@@ -24,7 +24,7 @@ from .tensor import to_tensor, stack_tensors, concat_tensors, pad_sequence_tenso
 from .file import resolve_file_paths, read_table, load_dataframes, iter_file_chunks, default_output_dir
 from .model import merge_features, get_mlp_output_dim
 from .feature import normalize_to_list
-from . import
+from .synthetic_data import generate_ranking_data, generate_distributed_ranking_data, generate_match_data, generate_multitask_data
 
 __all__ = [
     # Optimizer & Scheduler
@@ -61,6 +61,10 @@ __all__ = [
     # Feature utilities
     'normalize_to_list',
 
+    # Synthetic data utilities
+    'generate_ranking_data',
+    'generate_distributed_ranking_data',
+
     # Module exports
     'optimizer',
     'initializer',
nextrec/utils/device.py
CHANGED

@@ -4,9 +4,11 @@ Device management utilities for NextRec
 Date: create on 03/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
-
+import os
 import torch
 import platform
+import logging
+import multiprocessing
 
 
 def resolve_device() -> str:
@@ -35,3 +37,32 @@ def get_device_info() -> dict:
         info['cuda_capability'] = torch.cuda.get_device_capability(0)
 
     return info
+
+def configure_device(
+    distributed: bool,
+    local_rank: int,
+    base_device: torch.device | str = "cpu"
+) -> torch.device:
+    try:
+        device = torch.device(base_device)
+    except Exception:
+        logging.warning("[configure_device Warning] Invalid base_device, falling back to CPU.")
+        return torch.device("cpu")
+
+    if distributed:
+        if device.type == "cuda":
+            if not torch.cuda.is_available():
+                logging.warning("[Distributed Warning] CUDA requested but unavailable. Falling back to CPU.")
+                return torch.device("cpu")
+            if not (0 <= local_rank < torch.cuda.device_count()):
+                logging.warning(f"[Distributed Warning] local_rank {local_rank} is invalid for available CUDA devices. Falling back to CPU.")
+                return torch.device("cpu")
+            try:
+                torch.cuda.set_device(local_rank)
+                return torch.device(f"cuda:{local_rank}")
+            except Exception as exc:
+                logging.warning(f"[Distributed Warning] Failed to set CUDA device for local_rank {local_rank}: {exc}. Falling back to CPU.")
+                return torch.device("cpu")
+        else:
+            return torch.device("cpu")
+    return device
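A minimal sketch of calling the new `configure_device` helper. Reading `LOCAL_RANK` from the environment follows the usual torchrun convention and is an assumption about the launcher, not something this diff specifies.

```python
import os
from nextrec.utils.device import configure_device

# LOCAL_RANK is the conventional torchrun variable; using it here is an assumption.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
device = configure_device(distributed=True, local_rank=local_rank, base_device="cuda")
print(device)  # cuda:<local_rank> when CUDA is usable, otherwise cpu
```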
nextrec/utils/distributed.py
ADDED

@@ -0,0 +1,114 @@
+"""
+Distributed utilities for NextRec.
+
+Date: create on 04/12/2025
+Checkpoint: edit on 05/12/2025
+Author: Yang Zhou,zyaztec@gmail.com
+"""
+
+import logging
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from torch.utils.data import DataLoader, IterableDataset
+from torch.utils.data.distributed import DistributedSampler
+from nextrec.basic.loggers import colorize
+
+def init_process_group(distributed: bool, rank: int, world_size: int, device_id: int | None = None) -> None:
+    """
+    initialize distributed process group for multi-GPU training.
+
+    Args:
+        distributed: whether to enable distributed training
+        rank: global rank of the current process
+        world_size: total number of processes
+    """
+    if (not distributed) or (not dist.is_available()) or dist.is_initialized():
+        return
+    backend = "nccl" if device_id is not None else "gloo"
+    if backend == "nccl":
+        torch.cuda.set_device(device_id)
+    dist.init_process_group(backend=backend, init_method="env://", rank=rank, world_size=world_size)
+
+def gather_numpy(self, array: np.ndarray | None) -> np.ndarray | None:
+    """
+    Gather numpy arrays (or None) across ranks. Uses all_gather_object to avoid
+    shape mismatches and ensures every rank participates even when local data is empty.
+    """
+    if not (self.distributed and dist.is_available() and dist.is_initialized()):
+        return array
+
+    world_size = dist.get_world_size()
+    gathered: list[np.ndarray | None] = [None for _ in range(world_size)]
+    dist.all_gather_object(gathered, array)
+    pieces: list[np.ndarray] = []
+    for item in gathered:
+        if item is None:
+            continue
+        item_np = np.asarray(item)
+        if item_np.size > 0:
+            pieces.append(item_np)
+    if not pieces:
+        return None
+    return np.concatenate(pieces, axis=0)
+
+def add_distributed_sampler(
+    loader: DataLoader,
+    distributed: bool,
+    world_size: int,
+    rank: int,
+    shuffle: bool,
+    drop_last: bool,
+    default_batch_size: int,
+    is_main_process: bool = False,
+) -> tuple[DataLoader, DistributedSampler | None]:
+    """
+    add distributedsampler to a dataloader, this for distributed training
+    when each device has its own dataloader
+    """
+    # early return if not distributed
+    if not (distributed and dist.is_available() and dist.is_initialized()):
+        return loader, None
+    # return if already has DistributedSampler
+    if isinstance(loader.sampler, DistributedSampler):
+        return loader, loader.sampler
+    dataset = getattr(loader, "dataset", None)
+    if dataset is None:
+        return loader, None
+    if isinstance(dataset, IterableDataset):
+        if is_main_process:
+            logging.info(colorize("[Distributed Info] Iterable/streaming DataLoader provided; DistributedSampler is skipped. Ensure dataset handles sharding per rank.", color="yellow"))
+        return loader, None
+    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last)
+    loader_kwargs = {
+        "batch_size": loader.batch_size if loader.batch_size is not None else default_batch_size,
+        "shuffle": False,
+        "sampler": sampler,
+        "num_workers": loader.num_workers,
+        "collate_fn": loader.collate_fn,
+        "drop_last": drop_last,
+    }
+    if getattr(loader, "pin_memory", False):
+        loader_kwargs["pin_memory"] = True
+        pin_memory_device = getattr(loader, "pin_memory_device", None)
+        if pin_memory_device:
+            loader_kwargs["pin_memory_device"] = pin_memory_device
+    timeout = getattr(loader, "timeout", None)
+    if timeout:
+        loader_kwargs["timeout"] = timeout
+    worker_init_fn = getattr(loader, "worker_init_fn", None)
+    if worker_init_fn is not None:
+        loader_kwargs["worker_init_fn"] = worker_init_fn
+    generator = getattr(loader, "generator", None)
+    if generator is not None:
+        loader_kwargs["generator"] = generator
+    if loader.num_workers > 0:
+        loader_kwargs["persistent_workers"] = getattr(loader, "persistent_workers", False)
+        prefetch_factor = getattr(loader, "prefetch_factor", None)
+        if prefetch_factor is not None:
+            loader_kwargs["prefetch_factor"] = prefetch_factor
+    distributed_loader = DataLoader(dataset, **loader_kwargs)
+    if is_main_process:
+        logging.info(colorize("[Distributed Info] Attached DistributedSampler to provided DataLoader", color="cyan"))
+    return distributed_loader, sampler
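A minimal sketch of wiring the new helpers together under a torchrun-style launch. The environment variables (`RANK`, `WORLD_SIZE`, `LOCAL_RANK`) and the toy dataset are assumptions for illustration; only the function names and signatures come from the diff above. Note that `init_process_group` with `init_method="env://"` also expects `MASTER_ADDR`/`MASTER_PORT` to be set by the launcher.

```python
import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from nextrec.utils.distributed import init_process_group, add_distributed_sampler

# Conventional torchrun variables; reading them here is an assumption about the launcher.
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))

init_process_group(distributed=True, rank=rank, world_size=world_size,
                   device_id=local_rank if torch.cuda.is_available() else None)

dataset = TensorDataset(torch.randn(1024, 8), torch.randint(0, 2, (1024,)))  # toy data
loader = DataLoader(dataset, batch_size=128, shuffle=True)
loader, sampler = add_distributed_sampler(
    loader, distributed=True, world_size=world_size, rank=rank,
    shuffle=True, drop_last=False, default_batch_size=128, is_main_process=(rank == 0),
)
if sampler is not None:
    sampler.set_epoch(0)  # standard per-epoch reshuffle for DistributedSampler
```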
nextrec/utils/synthetic_data.py
ADDED

@@ -0,0 +1,413 @@
+"""
+Synthetic Data Generation Utilities
+
+This module provides utilities for generating synthetic datasets for testing
+and tutorial purposes in the NextRec framework.
+
+Date: create on 06/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+import numpy as np
+import pandas as pd
+from typing import Optional, Dict, List, Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+def generate_ranking_data(
+    n_samples: int = 10000,
+    n_dense: int = 5,
+    n_sparse: int = 8,
+    n_sequences: int = 2,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 500,
+    sparse_vocab_size: int = 50,
+    sequence_max_len: int = 20,
+    embedding_dim: int = 16,
+    seed: int = 42,
+    custom_sparse_features: Optional[Dict[str, int]] = None,
+    use_simple_names: bool = True
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for ranking tasks (CTR prediction)
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic ranking samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    for i in range(n_dense):
+        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
+
+    # Generate basic sparse features (always include user_id and item_id)
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+
+    # Generate additional sparse features
+    if custom_sparse_features:
+        for feat_name, vocab_size in custom_sparse_features.items():
+            data[feat_name] = np.random.randint(0, vocab_size, n_samples)
+    else:
+        for i in range(n_sparse - 2):
+            data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
+
+    # Generate sequence features (list of IDs)
+    sequence_names = []
+    sequence_vocabs = []
+
+    for i in range(n_sequences):
+        sequences = []
+        for _ in range(n_samples):
+            seq_len = np.random.randint(5, sequence_max_len + 1)
+            if i == 0:
+                # First sequence uses item vocabulary
+                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
+                seq_vocab = item_vocab_size
+                if custom_sparse_features:
+                    seq_name = 'hist_items'
+                else:
+                    seq_name = 'sequence_0'
+            else:
+                # Other sequences use category vocabulary
+                if custom_sparse_features and 'category' in custom_sparse_features:
+                    seq_vocab = custom_sparse_features['category']
+                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
+                    seq_name = f'hist_categories' if i == 1 else f'sequence_{i}'
+                else:
+                    seq_vocab = sparse_vocab_size
+                    seq = np.random.randint(0, seq_vocab, seq_len).tolist()
+                    seq_name = f'sequence_{i}'
+
+            # Padding
+            seq = seq + [0] * (sequence_max_len - len(seq))
+            sequences.append(seq)
+
+        data[seq_name] = sequences
+        sequence_names.append(seq_name)
+        sequence_vocabs.append(seq_vocab)
+
+    if 'gender' in data and 'dense_0' in data:
+        # Complex label generation with feature correlation
+        label_probs = 1 / (1 + np.exp(-(
+            data['dense_0'] * 0.3 +
+            data['dense_1'] * 0.2 +
+            (data['gender'] - 0.5) * 0.5 +
+            np.random.randn(n_samples) * 0.1
+        )))
+        data['label'] = (label_probs > 0.5).astype(np.float32)
+    else:
+        data['label'] = np.random.randint(0, 2, n_samples).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    if 'gender' in data:
+        print(f"Positive rate: {data['label'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # Create feature definitions
+    # Use input_dim for dense features to be compatible with both simple and complex scenarios
+    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
+
+    # Create sparse features
+    sparse_features = [SparseFeature(name='user_id', embedding_name='user_emb', vocab_size=user_vocab_size, embedding_dim=embedding_dim),
+                       SparseFeature(name='item_id', embedding_name='item_emb', vocab_size=item_vocab_size, embedding_dim=embedding_dim),]
+
+    if custom_sparse_features:
+        # Add custom sparse features with proper vocab sizes
+        for feat_name, vocab_size in custom_sparse_features.items():
+            sparse_features.append(SparseFeature(name=feat_name, embedding_name=f'{feat_name}_emb', vocab_size=vocab_size, embedding_dim=embedding_dim))
+    else:
+        # Add generic sparse features
+        sparse_features.extend([SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb', vocab_size=sparse_vocab_size, embedding_dim=embedding_dim) for i in range(n_sparse - 2)])
+
+    # Create sequence features
+    sequence_features = []
+    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
+        if i == 0:
+            # First sequence shares embedding with item_id
+            embedding_name = 'item_emb'
+        elif custom_sparse_features and 'category' in custom_sparse_features and seq_name == 'hist_categories':
+            # hist_categories shares embedding with category
+            embedding_name = 'category_emb'
+        else:
+            # Other sequences share with sparse_0
+            embedding_name = 'sparse_0_emb'
+        sequence_features.append(SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len, embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name))
+    return df, dense_features, sparse_features, sequence_features
+
+
+def generate_match_data(
+    n_samples: int = 10000,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 5000,
+    category_vocab_size: int = 100,
+    brand_vocab_size: int = 200,
+    city_vocab_size: int = 100,
+    user_feature_vocab_size: int = 50,
+    item_feature_vocab_size: int = 50,
+    sequence_max_len: int = 50,
+    user_embedding_dim: int = 32,
+    item_embedding_dim: int = 32,
+    seed: int = 42
+) -> Tuple[pd.DataFrame, List, List, List, List, List, List]:
+    """
+    Generate synthetic data for match/retrieval tasks
+
+    Returns:
+        tuple: (dataframe, user_dense_features, user_sparse_features, user_sequence_features,
+                item_dense_features, item_sparse_features, item_sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic match samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    # User features
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['user_age'] = np.random.randn(n_samples).astype(np.float32)
+    data['user_gender'] = np.random.randint(0, 2, n_samples)
+    data['user_city'] = np.random.randint(0, city_vocab_size, n_samples)
+
+    for i in range(3):
+        data[f'user_feature_{i}'] = np.random.randint(1, user_feature_vocab_size, n_samples)
+
+    # User behavior sequences
+    user_hist_items = []
+    user_hist_categories = []
+    for _ in range(n_samples):
+        seq_len = np.random.randint(10, sequence_max_len + 1)
+        hist_items = np.random.randint(1, item_vocab_size, seq_len).tolist()
+        hist_items = hist_items + [0] * (sequence_max_len - len(hist_items))
+        user_hist_items.append(hist_items)
+
+        hist_cats = np.random.randint(1, category_vocab_size, seq_len).tolist()
+        hist_cats = hist_cats + [0] * (sequence_max_len - len(hist_cats))
+        user_hist_categories.append(hist_cats)
+
+    data['user_hist_items'] = user_hist_items
+    data['user_hist_categories'] = user_hist_categories
+
+    # Item features
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+    data['item_price'] = np.random.randn(n_samples).astype(np.float32)
+    data['item_category'] = np.random.randint(1, category_vocab_size, n_samples)
+    data['item_brand'] = np.random.randint(1, brand_vocab_size, n_samples)
+
+    for i in range(3):
+        data[f'item_feature_{i}'] = np.random.randint(1, item_feature_vocab_size, n_samples)
+
+    # Generate labels with some correlation to features
+    label_probs = 1 / (1 + np.exp(-(
+        data['user_age'] * 0.2 +
+        (data['user_gender'] - 0.5) * 0.3 +
+        data['item_price'] * 0.15 +
+        np.random.randn(n_samples) * 0.5
+    )))
+    data['label'] = (label_probs > 0.5).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    print(f"Positive rate: {data['label'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # User dense features
+    user_dense_features = [DenseFeature(name='user_age', input_dim=1)]
+
+    # User sparse features
+    user_sparse_features = [
+        SparseFeature(name='user_id', vocab_size=user_vocab_size, embedding_dim=user_embedding_dim),
+        SparseFeature(name='user_gender', vocab_size=2, embedding_dim=8),
+        SparseFeature(name='user_city', vocab_size=city_vocab_size, embedding_dim=16),
+    ]
+    user_sparse_features.extend([
+        SparseFeature(name=f'user_feature_{i}', vocab_size=user_feature_vocab_size, embedding_dim=8)
+        for i in range(3)
+    ])
+
+    # User sequence features
+    user_sequence_features = [
+        SequenceFeature(name='user_hist_items', vocab_size=item_vocab_size,
+                        max_len=sequence_max_len, embedding_dim=user_embedding_dim, padding_idx=0),
+        SequenceFeature(name='user_hist_categories', vocab_size=category_vocab_size,
+                        max_len=sequence_max_len, embedding_dim=16, padding_idx=0),
+    ]
+
+    # Item dense features
+    item_dense_features = [DenseFeature(name='item_price', input_dim=1)]
+
+    # Item sparse features
+    item_sparse_features = [
+        SparseFeature(name='item_id', vocab_size=item_vocab_size, embedding_dim=item_embedding_dim),
+        SparseFeature(name='item_category', vocab_size=category_vocab_size, embedding_dim=16),
+        SparseFeature(name='item_brand', vocab_size=brand_vocab_size, embedding_dim=16),
+    ]
+    item_sparse_features.extend([
+        SparseFeature(name=f'item_feature_{i}', vocab_size=item_feature_vocab_size, embedding_dim=8)
+        for i in range(3)
+    ])
+
+    # Item sequence features (empty for most match models)
+    item_sequence_features = []
+
+    return (df, user_dense_features, user_sparse_features, user_sequence_features,
+            item_dense_features, item_sparse_features, item_sequence_features)
+
+
+def generate_multitask_data(
+    n_samples: int = 10000,
+    n_dense: int = 5,
+    n_sparse: int = 8,
+    n_sequences: int = 2,
+    user_vocab_size: int = 1000,
+    item_vocab_size: int = 500,
+    sparse_vocab_size: int = 50,
+    sequence_max_len: int = 20,
+    embedding_dim: int = 16,
+    seed: int = 42
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for multi-task learning
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    print(f"Generating {n_samples} synthetic multi-task samples...")
+
+    np.random.seed(seed)
+    data = {}
+
+    # Generate dense features
+    for i in range(n_dense):
+        data[f'dense_{i}'] = np.random.randn(n_samples).astype(np.float32)
+
+    # Generate sparse features
+    data['user_id'] = np.random.randint(1, user_vocab_size, n_samples)
+    data['item_id'] = np.random.randint(1, item_vocab_size, n_samples)
+
+    for i in range(n_sparse - 2):
+        data[f'sparse_{i}'] = np.random.randint(1, sparse_vocab_size, n_samples)
+
+    # Generate sequence features
+    sequence_names = []
+    sequence_vocabs = []
+
+    for i in range(n_sequences):
+        sequences = []
+        for _ in range(n_samples):
+            seq_len = np.random.randint(5, sequence_max_len + 1)
+            if i == 0:
+                seq = np.random.randint(0, item_vocab_size, seq_len).tolist()
+                seq_vocab = item_vocab_size
+                seq_name = 'sequence_0'
+            else:
+                seq = np.random.randint(0, sparse_vocab_size, seq_len).tolist()
+                seq_vocab = sparse_vocab_size
+                seq_name = f'sequence_{i}'
+
+            seq = seq + [0] * (sequence_max_len - len(seq))
+            sequences.append(seq)
+
+        data[seq_name] = sequences
+        sequence_names.append(seq_name)
+        sequence_vocabs.append(seq_vocab)
+
+    # Generate multi-task labels with correlation
+    # CTR (click) is relatively easier to predict
+    ctr_logits = (
+        data['dense_0'] * 0.3 +
+        data['dense_1'] * 0.2 +
+        np.random.randn(n_samples) * 0.5
+    )
+    data['click'] = (1 / (1 + np.exp(-ctr_logits)) > 0.5).astype(np.float32)
+
+    # CVR (conversion) depends on click and is harder
+    cvr_logits = (
+        data['dense_2'] * 0.2 +
+        data['dense_3'] * 0.15 +
+        data['click'] * 1.5 +  # Strong dependency on click
+        np.random.randn(n_samples) * 0.8
+    )
+    data['conversion'] = (1 / (1 + np.exp(-cvr_logits)) > 0.3).astype(np.float32)
+
+    # CTCVR = click AND conversion
+    data['ctcvr'] = (data['click'] * data['conversion']).astype(np.float32)
+
+    df = pd.DataFrame(data)
+    print(f"Generated data shape: {df.shape}")
+    print(f"Click rate: {data['click'].mean():.4f}")
+    print(f"Conversion rate (overall): {data['conversion'].mean():.4f}")
+    if data['click'].sum() > 0:
+        print(f"Conversion rate (given click): {data['conversion'][data['click'] == 1].mean():.4f}")
+    print(f"CTCVR rate: {data['ctcvr'].mean():.4f}")
+
+    # Import here to avoid circular import
+    from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
+
+    # Create feature definitions
+    dense_features = [DenseFeature(name=f'dense_{i}', input_dim=1) for i in range(n_dense)]
+
+    # Create sparse features
+    sparse_features = [
+        SparseFeature(name='user_id', embedding_name='user_emb',
+                      vocab_size=user_vocab_size, embedding_dim=embedding_dim),
+        SparseFeature(name='item_id', embedding_name='item_emb',
+                      vocab_size=item_vocab_size, embedding_dim=embedding_dim),
+    ]
+    sparse_features.extend([
+        SparseFeature(name=f'sparse_{i}', embedding_name=f'sparse_{i}_emb',
+                      vocab_size=sparse_vocab_size, embedding_dim=embedding_dim)
+        for i in range(n_sparse - 2)
+    ])
+
+    # Create sequence features
+    sequence_features = []
+    for i, (seq_name, seq_vocab) in enumerate(zip(sequence_names, sequence_vocabs)):
+        if i == 0:
+            embedding_name = 'item_emb'
+        else:
+            embedding_name = 'sparse_0_emb'
+        sequence_features.append(
+            SequenceFeature(name=seq_name, vocab_size=seq_vocab, max_len=sequence_max_len,
+                            embedding_dim=embedding_dim, padding_idx=0, embedding_name=embedding_name)
+        )
+
+    return df, dense_features, sparse_features, sequence_features
+
+
+def generate_distributed_ranking_data(
+    num_samples: int = 100000,
+    num_users: int = 10000,
+    num_items: int = 5000,
+    num_categories: int = 20,
+    num_cities: int = 100,
+    max_seq_len: int = 50,
+    embedding_dim: int = 32,
+    seed: int = 42,
+) -> Tuple[pd.DataFrame, List, List, List]:
+    """
+    Generate synthetic data for distributed training scenarios
+
+    Returns:
+        tuple: (dataframe, dense_features, sparse_features, sequence_features)
+    """
+    return generate_ranking_data(
+        n_samples=num_samples,
+        n_dense=5,
+        n_sparse=6,  # user_id, item_id + 4 custom features
+        n_sequences=2,
+        user_vocab_size=num_users + 1,
+        item_vocab_size=num_items + 1,
+        sequence_max_len=max_seq_len,
+        embedding_dim=embedding_dim,
+        seed=seed,
+        custom_sparse_features={'gender': 2, 'age_group': 7, 'category': num_categories, 'city': num_cities},
+        use_simple_names=False
+    )
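A minimal sketch of exercising the multi-task generator from this new module. The function name, its `n_samples` parameter, the four-element return value, and the `click`/`conversion`/`ctcvr` columns all come from the code above; the inspection of the result is just illustrative.

```python
from nextrec.utils.synthetic_data import generate_multitask_data

df, dense_feats, sparse_feats, seq_feats = generate_multitask_data(n_samples=2000)
print(df[["click", "conversion", "ctcvr"]].mean())   # per-task label rates
print(len(dense_feats), len(sparse_feats), len(seq_feats))  # feature definition counts
```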