nextrec-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. nextrec/__init__.py +41 -0
  2. nextrec/__version__.py +1 -0
  3. nextrec/basic/__init__.py +0 -0
  4. nextrec/basic/activation.py +92 -0
  5. nextrec/basic/callback.py +35 -0
  6. nextrec/basic/dataloader.py +447 -0
  7. nextrec/basic/features.py +87 -0
  8. nextrec/basic/layers.py +985 -0
  9. nextrec/basic/loggers.py +124 -0
  10. nextrec/basic/metrics.py +557 -0
  11. nextrec/basic/model.py +1438 -0
  12. nextrec/data/__init__.py +27 -0
  13. nextrec/data/data_utils.py +132 -0
  14. nextrec/data/preprocessor.py +662 -0
  15. nextrec/loss/__init__.py +35 -0
  16. nextrec/loss/loss_utils.py +136 -0
  17. nextrec/loss/match_losses.py +294 -0
  18. nextrec/models/generative/hstu.py +0 -0
  19. nextrec/models/generative/tiger.py +0 -0
  20. nextrec/models/match/__init__.py +13 -0
  21. nextrec/models/match/dssm.py +200 -0
  22. nextrec/models/match/dssm_v2.py +162 -0
  23. nextrec/models/match/mind.py +210 -0
  24. nextrec/models/match/sdm.py +253 -0
  25. nextrec/models/match/youtube_dnn.py +172 -0
  26. nextrec/models/multi_task/esmm.py +129 -0
  27. nextrec/models/multi_task/mmoe.py +161 -0
  28. nextrec/models/multi_task/ple.py +260 -0
  29. nextrec/models/multi_task/share_bottom.py +126 -0
  30. nextrec/models/ranking/__init__.py +17 -0
  31. nextrec/models/ranking/afm.py +118 -0
  32. nextrec/models/ranking/autoint.py +140 -0
  33. nextrec/models/ranking/dcn.py +120 -0
  34. nextrec/models/ranking/deepfm.py +95 -0
  35. nextrec/models/ranking/dien.py +214 -0
  36. nextrec/models/ranking/din.py +181 -0
  37. nextrec/models/ranking/fibinet.py +130 -0
  38. nextrec/models/ranking/fm.py +87 -0
  39. nextrec/models/ranking/masknet.py +125 -0
  40. nextrec/models/ranking/pnn.py +128 -0
  41. nextrec/models/ranking/widedeep.py +105 -0
  42. nextrec/models/ranking/xdeepfm.py +117 -0
  43. nextrec/utils/__init__.py +18 -0
  44. nextrec/utils/common.py +14 -0
  45. nextrec/utils/embedding.py +19 -0
  46. nextrec/utils/initializer.py +47 -0
  47. nextrec/utils/optimizer.py +75 -0
  48. nextrec-0.1.1.dist-info/METADATA +302 -0
  49. nextrec-0.1.1.dist-info/RECORD +51 -0
  50. nextrec-0.1.1.dist-info/WHEEL +4 -0
  51. nextrec-0.1.1.dist-info/licenses/LICENSE +21 -0
nextrec/data/__init__.py
@@ -0,0 +1,27 @@
+ """
+ Data utilities package for NextRec
+
+ This package provides data processing and manipulation utilities.
+
+ Date: created on 13/11/2025
+ Author:
+     Yang Zhou, zyaztec@gmail.com
+ """
+
+ from nextrec.data.data_utils import (
+     collate_fn,
+     get_column_data,
+     split_dict_random,
+     build_eval_candidates,
+ )
+
+ # For backward compatibility, keep data_utils accessible
+ from nextrec.data import data_utils
+
+ __all__ = [
+     'collate_fn',
+     'get_column_data',
+     'split_dict_random',
+     'build_eval_candidates',
+     'data_utils',
+ ]
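
A minimal usage sketch of the helpers this __init__ re-exports (the tensors and the dictionary below are invented for illustration; collate_fn is assumed to stack samples as its docstring describes):

    import torch
    from nextrec.data import collate_fn, split_dict_random

    # Two (dense_features, label) samples, stacked into one batch
    batch = [
        (torch.tensor([1.0, 2.0]), torch.tensor(1.0)),
        (torch.tensor([3.0, 4.0]), torch.tensor(0.0)),
    ]
    dense, labels = collate_fn(batch)   # shapes: (2, 2) and (2,)

    # 75/25 random split of a column-oriented dict
    data = {'user_id': [1, 2, 3, 4], 'label': [1, 0, 1, 0]}
    train, test = split_dict_random(data, test_size=0.25, random_state=42)
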
nextrec/data/data_utils.py
@@ -0,0 +1,132 @@
+ """
+ Data processing utilities for NextRec
+
+ Date: created on 13/11/2025
+ Author:
+     Yang Zhou, zyaztec@gmail.com
+ """
+
+ import torch
+ import numpy as np
+ import pandas as pd
+
+ def collate_fn(batch):
+     """
+     Custom collate function for batching tuples of tensors.
+     Each element in batch is a tuple of tensors from FileDataset.
+
+     Examples:
+         # Single sample in batch
+         (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
+         # Batched output
+         (tensor([[1.0, 2.0], [3.0, 4.0]]),    # dense_features batch
+          tensor([[10, 20], [30, 40]]),        # sparse_features batch
+          tensor([[100, 200], [300, 400]]),    # sequence_features batch
+          tensor([1.0, 0.0]))                  # labels batch
+
+     """
+     if not batch:
+         return tuple()
+
+     num_tensors = len(batch[0])
+     result = []
+
+     for i in range(num_tensors):
+         tensor_list = [item[i] for item in batch]
+         stacked = torch.stack(tensor_list, dim=0)  # stack, not cat: adds the batch dim and handles 0-dim labels
+         result.append(stacked)
+
+     return tuple(result)
+
+
+ def get_column_data(data: dict | pd.DataFrame, name: str):
+     """Extract column data from a dict, DataFrame, or attribute-style container."""
+     if isinstance(data, dict):
+         return data[name] if name in data else None
+     elif isinstance(data, pd.DataFrame):
+         if name not in data.columns:
+             return None
+         return data[name].values
+     else:
+         if hasattr(data, name):
+             return getattr(data, name)
+         raise TypeError(f"Unsupported data type for extracting column {name}")
+
+
+ def split_dict_random(data_dict: dict, test_size: float = 0.2, random_state: int | None = None):
+     """Randomly split a dictionary of data into training and testing sets."""
+     lengths = [len(v) for v in data_dict.values()]
+     if len(set(lengths)) != 1:
+         raise ValueError(f"Length mismatch: {lengths}")
+     n = lengths[0]
+
+     rng = np.random.default_rng(random_state)
+     perm = rng.permutation(n)
+     cut = int(round(n * (1 - test_size)))
+     train_idx, test_idx = perm[:cut], perm[cut:]
+
+     def take(v, idx):
+         if isinstance(v, np.ndarray):
+             return v[idx]
+         elif isinstance(v, pd.Series):
+             return v.iloc[idx].to_numpy()
+         else:
+             v_arr = np.asarray(v, dtype=object)
+             return v_arr[idx]
+
+     train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
+     test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
+     return train_dict, test_dict
+
+
+ def build_eval_candidates(
+     df_all: pd.DataFrame,
+     user_col: str,
+     item_col: str,
+     label_col: str,
+     user_features: pd.DataFrame,
+     item_features: pd.DataFrame,
+     num_pos_per_user: int = 5,
+     num_neg_per_pos: int = 50,
+     random_seed: int = 2025,
+ ) -> pd.DataFrame:
+     """Build evaluation candidates with positive and negative samples for each user."""
+     rng = np.random.default_rng(random_seed)
+
+     users = df_all[user_col].unique()
+     all_items = item_features[item_col].unique()
+
+     rows = []
+
+     user_hist_items = {  # every item each user has interacted with, positive or not
+         u: df_all[df_all[user_col] == u][item_col].unique()
+         for u in users
+     }
+
+     for u in users:
+         df_user = df_all[df_all[user_col] == u]
+         pos_items = df_user[df_user[label_col] == 1][item_col].unique()
+         if len(pos_items) == 0:
+             continue
+
+         pos_items = pos_items[:num_pos_per_user]
+         seen_items = set(user_hist_items[u])
+
+         neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))  # items the user has never seen
+         if len(neg_pool) == 0:
+             continue
+
+         for pos in pos_items:
+             if len(neg_pool) <= num_neg_per_pos:
+                 neg_items = neg_pool  # pool too small: use every unseen item
+             else:
+                 neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
+
+             rows.append((u, pos, 1))
+             for ni in neg_items:
+                 rows.append((u, ni, 0))
+
+     eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
+     eval_df = eval_df.merge(user_features, on=user_col, how='left')
+     eval_df = eval_df.merge(item_features, on=item_col, how='left')
+     return eval_df
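
For reference, a hedged sketch of driving build_eval_candidates end to end; all column names and values below are made up for the example:

    import pandas as pd
    from nextrec.data import build_eval_candidates

    df_all = pd.DataFrame({
        'user_id': [1, 1, 1, 2],
        'item_id': [10, 11, 12, 10],
        'label':   [1, 0, 1, 1],
    })
    user_features = pd.DataFrame({'user_id': [1, 2], 'age': [25, 34]})
    item_features = pd.DataFrame({'item_id': [10, 11, 12, 13, 14],
                                  'price': [5, 3, 8, 2, 6]})

    eval_df = build_eval_candidates(
        df_all, user_col='user_id', item_col='item_id', label_col='label',
        user_features=user_features, item_features=item_features,
        num_pos_per_user=1, num_neg_per_pos=2, random_seed=2025,
    )
    # One positive row (label=1) per user plus up to two sampled unseen
    # negatives (label=0), each joined with the user/item feature columns.

Note that negatives are drawn only from catalog items absent from a user's full interaction history, so heavy users with long histories end up with smaller negative pools.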