nextrec-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries.
- nextrec/__init__.py +41 -0
- nextrec/__version__.py +1 -0
- nextrec/basic/__init__.py +0 -0
- nextrec/basic/activation.py +92 -0
- nextrec/basic/callback.py +35 -0
- nextrec/basic/dataloader.py +447 -0
- nextrec/basic/features.py +87 -0
- nextrec/basic/layers.py +985 -0
- nextrec/basic/loggers.py +124 -0
- nextrec/basic/metrics.py +557 -0
- nextrec/basic/model.py +1438 -0
- nextrec/data/__init__.py +27 -0
- nextrec/data/data_utils.py +132 -0
- nextrec/data/preprocessor.py +662 -0
- nextrec/loss/__init__.py +35 -0
- nextrec/loss/loss_utils.py +136 -0
- nextrec/loss/match_losses.py +294 -0
- nextrec/models/generative/hstu.py +0 -0
- nextrec/models/generative/tiger.py +0 -0
- nextrec/models/match/__init__.py +13 -0
- nextrec/models/match/dssm.py +200 -0
- nextrec/models/match/dssm_v2.py +162 -0
- nextrec/models/match/mind.py +210 -0
- nextrec/models/match/sdm.py +253 -0
- nextrec/models/match/youtube_dnn.py +172 -0
- nextrec/models/multi_task/esmm.py +129 -0
- nextrec/models/multi_task/mmoe.py +161 -0
- nextrec/models/multi_task/ple.py +260 -0
- nextrec/models/multi_task/share_bottom.py +126 -0
- nextrec/models/ranking/__init__.py +17 -0
- nextrec/models/ranking/afm.py +118 -0
- nextrec/models/ranking/autoint.py +140 -0
- nextrec/models/ranking/dcn.py +120 -0
- nextrec/models/ranking/deepfm.py +95 -0
- nextrec/models/ranking/dien.py +214 -0
- nextrec/models/ranking/din.py +181 -0
- nextrec/models/ranking/fibinet.py +130 -0
- nextrec/models/ranking/fm.py +87 -0
- nextrec/models/ranking/masknet.py +125 -0
- nextrec/models/ranking/pnn.py +128 -0
- nextrec/models/ranking/widedeep.py +105 -0
- nextrec/models/ranking/xdeepfm.py +117 -0
- nextrec/utils/__init__.py +18 -0
- nextrec/utils/common.py +14 -0
- nextrec/utils/embedding.py +19 -0
- nextrec/utils/initializer.py +47 -0
- nextrec/utils/optimizer.py +75 -0
- nextrec-0.1.1.dist-info/METADATA +302 -0
- nextrec-0.1.1.dist-info/RECORD +51 -0
- nextrec-0.1.1.dist-info/WHEEL +4 -0
- nextrec-0.1.1.dist-info/licenses/LICENSE +21 -0
nextrec/data/__init__.py
ADDED
@@ -0,0 +1,27 @@
+"""
+Data utilities package for NextRec
+
+This package provides data processing and manipulation utilities.
+
+Date: create on 13/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+"""
+
+from nextrec.data.data_utils import (
+    collate_fn,
+    get_column_data,
+    split_dict_random,
+    build_eval_candidates,
+)
+
+# For backward compatibility, keep utils accessible
+from nextrec.data import data_utils
+
+__all__ = [
+    'collate_fn',
+    'get_column_data',
+    'split_dict_random',
+    'build_eval_candidates',
+    'data_utils',
+]
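For orientation: the re-exports above make the data helpers importable directly from the package root. A minimal sketch of that import path, assuming the wheel is installed (the toy arrays are hypothetical, not part of the package):

import numpy as np
from nextrec.data import split_dict_random

# Two aligned columns of length 5; split_dict_random requires equal lengths.
data = {
    "user_id": np.array([1, 2, 3, 4, 5]),
    "label": np.array([1, 0, 1, 0, 1]),
}
train, test = split_dict_random(data, test_size=0.2, random_state=42)
print(len(train["user_id"]), len(test["user_id"]))  # -> 4 1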
nextrec/data/data_utils.py
ADDED
@@ -0,0 +1,132 @@
+"""
+Data processing utilities for NextRec
+
+Date: create on 13/11/2025
+Author:
+    Yang Zhou, zyaztec@gmail.com
+"""
+
+import torch
+import numpy as np
+import pandas as pd
+
+def collate_fn(batch):
+    """
+    Custom collate function for batching tuples of tensors.
+    Each element in batch is a tuple of tensors from FileDataset.
+
+    Examples:
+        # Single sample in batch
+        (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
+        # Batched output
+        (tensor([[1.0, 2.0], [3.0, 4.0]]),   # dense_features batch
+         tensor([[10, 20], [30, 40]]),       # sparse_features batch
+         tensor([[100, 200], [300, 400]]),   # sequence_features batch
+         tensor([1.0, 0.0]))                 # labels batch
+
+    """
+    if not batch:
+        return tuple()
+
+    num_tensors = len(batch[0])
+    result = []
+
+    for i in range(num_tensors):
+        tensor_list = [item[i] for item in batch]
+        stacked = torch.cat(tensor_list, dim=0)
+        result.append(stacked)
+
+    return tuple(result)
+
+
+def get_column_data(data: dict | pd.DataFrame, name: str):
+    """Extract column data from various data structures."""
+    if isinstance(data, dict):
+        return data[name] if name in data else None
+    elif isinstance(data, pd.DataFrame):
+        if name not in data.columns:
+            return None
+        return data[name].values
+    else:
+        if hasattr(data, name):
+            return getattr(data, name)
+        raise KeyError(f"Unsupported data type for extracting column {name}")
+
+
+def split_dict_random(data_dict: dict, test_size: float = 0.2, random_state: int | None = None):
+    """Randomly split a dictionary of data into training and testing sets."""
+    lengths = [len(v) for v in data_dict.values()]
+    if len(set(lengths)) != 1:
+        raise ValueError(f"Length mismatch: {lengths}")
+    n = lengths[0]
+
+    rng = np.random.default_rng(random_state)
+    perm = rng.permutation(n)
+    cut = int(round(n * (1 - test_size)))
+    train_idx, test_idx = perm[:cut], perm[cut:]
+
+    def take(v, idx):
+        if isinstance(v, np.ndarray):
+            return v[idx]
+        elif isinstance(v, pd.Series):
+            return v.iloc[idx].to_numpy()
+        else:
+            v_arr = np.asarray(v, dtype=object)
+            return v_arr[idx]
+
+    train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
+    test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
+    return train_dict, test_dict
+
+
+def build_eval_candidates(
+    df_all: pd.DataFrame,
+    user_col: str,
+    item_col: str,
+    label_col: str,
+    user_features: pd.DataFrame,
+    item_features: pd.DataFrame,
+    num_pos_per_user: int = 5,
+    num_neg_per_pos: int = 50,
+    random_seed: int = 2025,
+) -> pd.DataFrame:
+    """Build evaluation candidates with positive and negative samples for each user."""
+    rng = np.random.default_rng(random_seed)
+
+    users = df_all[user_col].unique()
+    all_items = item_features[item_col].unique()
+
+    rows = []
+
+    user_hist_items = {
+        u: df_all[df_all[user_col] == u][item_col].unique()
+        for u in users
+    }
+
+    for u in users:
+        df_user = df_all[df_all[user_col] == u]
+        pos_items = df_user[df_user[label_col] == 1][item_col].unique()
+        if len(pos_items) == 0:
+            continue
+
+        pos_items = pos_items[:num_pos_per_user]
+        seen_items = set(user_hist_items[u])
+
+        neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
+        if len(neg_pool) == 0:
+            continue
+
+        for pos in pos_items:
+            if len(neg_pool) <= num_neg_per_pos:
+                neg_items = neg_pool
+            else:
+                neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
+
+            rows.append((u, pos, 1))
+            for ni in neg_items:
+                rows.append((u, ni, 0))
+
+    eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
+    eval_df = eval_df.merge(user_features, on=user_col, how='left')
+    eval_df = eval_df.merge(item_features, on=item_col, how='left')
+    return eval_df
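One behavioral note worth illustrating: collate_fn concatenates with torch.cat along dim 0 rather than torch.stack, so the stacked output shown in its docstring presumes each per-sample tensor already carries a leading batch-like dimension. A minimal sketch of that actual behavior (the sample tensors are hypothetical):

import torch
from nextrec.data import collate_fn

# Each hypothetical sample is a tuple whose tensors have a leading dim of 1,
# since collate_fn uses torch.cat(dim=0), not torch.stack.
sample_a = (torch.tensor([[1.0, 2.0]]), torch.tensor([[10, 20]]), torch.tensor([1.0]))
sample_b = (torch.tensor([[3.0, 4.0]]), torch.tensor([[30, 40]]), torch.tensor([0.0]))

dense, sparse, labels = collate_fn([sample_a, sample_b])
print(dense.shape)   # torch.Size([2, 2])
print(sparse.shape)  # torch.Size([2, 2])
print(labels)        # tensor([1., 0.])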
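A sketch of driving build_eval_candidates end to end; the frames below are hypothetical stand-ins for an interaction log and feature tables, not data from the package:

import pandas as pd
from nextrec.data import build_eval_candidates

# Hypothetical interaction log: label=1 marks a positive interaction.
df_all = pd.DataFrame({
    "user_id": [1, 1, 1, 2, 2],
    "item_id": [10, 11, 12, 10, 13],
    "label":   [1, 0, 1, 1, 0],
})
user_features = pd.DataFrame({"user_id": [1, 2], "age": [25, 34]})
item_features = pd.DataFrame({"item_id": [10, 11, 12, 13, 14, 15],
                              "price":   [9, 5, 7, 3, 8, 6]})

eval_df = build_eval_candidates(
    df_all, user_col="user_id", item_col="item_id", label_col="label",
    user_features=user_features, item_features=item_features,
    num_pos_per_user=2, num_neg_per_pos=2,
)
# Each positive (label=1) is paired with up to 2 negatives sampled from items
# the user has never interacted with, and both feature tables are left-joined
# onto every candidate row.
print(eval_df)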