nextrec-0.3.2-py3-none-any.whl → nextrec-0.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +10 -23
- nextrec/basic/layers.py +18 -61
- nextrec/basic/metrics.py +55 -33
- nextrec/basic/model.py +247 -389
- nextrec/data/__init__.py +2 -2
- nextrec/data/data_utils.py +80 -4
- nextrec/data/dataloader.py +36 -57
- nextrec/data/preprocessor.py +5 -4
- nextrec/models/generative/hstu.py +1 -1
- nextrec/models/match/dssm.py +2 -2
- nextrec/models/match/dssm_v2.py +2 -2
- nextrec/models/match/mind.py +2 -2
- nextrec/models/match/sdm.py +2 -2
- nextrec/models/match/youtube_dnn.py +2 -2
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/multi_task/mmoe.py +1 -1
- nextrec/models/multi_task/ple.py +1 -1
- nextrec/models/multi_task/poso.py +1 -1
- nextrec/models/multi_task/share_bottom.py +1 -1
- nextrec/models/ranking/afm.py +1 -1
- nextrec/models/ranking/autoint.py +1 -1
- nextrec/models/ranking/dcn.py +1 -1
- nextrec/models/ranking/deepfm.py +1 -1
- nextrec/models/ranking/dien.py +1 -1
- nextrec/models/ranking/din.py +1 -1
- nextrec/models/ranking/fibinet.py +1 -1
- nextrec/models/ranking/fm.py +1 -1
- nextrec/models/ranking/masknet.py +2 -2
- nextrec/models/ranking/pnn.py +1 -1
- nextrec/models/ranking/widedeep.py +1 -1
- nextrec/models/ranking/xdeepfm.py +1 -1
- nextrec/utils/__init__.py +2 -1
- nextrec/utils/common.py +21 -2
- {nextrec-0.3.2.dist-info → nextrec-0.3.3.dist-info}/METADATA +3 -3
- nextrec-0.3.3.dist-info/RECORD +57 -0
- nextrec-0.3.2.dist-info/RECORD +0 -57
- {nextrec-0.3.2.dist-info → nextrec-0.3.3.dist-info}/WHEEL +0 -0
- {nextrec-0.3.2.dist-info → nextrec-0.3.3.dist-info}/licenses/LICENSE +0 -0
nextrec/data/__init__.py
CHANGED

@@ -18,7 +18,7 @@ from nextrec.data.data_utils import (
     read_table,
     load_dataframes,
 )
-from nextrec.basic.features import FeatureSpecMixin
+from nextrec.basic.features import FeatureSet
 from nextrec.data import data_utils
 from nextrec.data.dataloader import (
     TensorDictDataset,
@@ -38,7 +38,7 @@ __all__ = [
     'iter_file_chunks',
     'read_table',
     'load_dataframes',
-    'FeatureSpecMixin',
+    'FeatureSet',
     'data_utils',
     'TensorDictDataset',
     'FileDataset',
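For downstream imports, the visible change is the renamed export; a one-line migration sketch (assuming the rename is the only change):

# 0.3.2: from nextrec.data import FeatureSpecMixin
from nextrec.data import FeatureSet  # 0.3.3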
nextrec/data/data_utils.py
CHANGED

@@ -5,8 +5,9 @@ import numpy as np
 import pandas as pd
 import pyarrow.parquet as pq
 from pathlib import Path
+from typing import Any, Mapping, Sequence

-def …
+def stack_section(batch: list[dict], section: str):
     """Stack one section of the batch (features/labels/ids)."""
     entries = [item.get(section) for item in batch if item.get(section) is not None]
     if not entries:
@@ -39,9 +40,9 @@ def collate_fn(batch):
         "ids": first.get("ids"),
     }
     return {
-        "features": …,
-        "labels": …,
-        "ids": …,
+        "features": stack_section(batch, "features") or {},
+        "labels": stack_section(batch, "labels"),
+        "ids": stack_section(batch, "ids"),
     }

 # Fallback: stack tuples/lists of tensors
@@ -190,3 +191,78 @@ def build_eval_candidates(
     eval_df = eval_df.merge(user_features, on=user_col, how='left')
     eval_df = eval_df.merge(item_features, on=item_col, how='left')
     return eval_df
+
+def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
+    """Standardize a dataloader batch into a dict of features, labels, and ids."""
+    if not (isinstance(batch_data, Mapping) and "features" in batch_data):
+        raise TypeError(
+            "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
+        )
+    return {
+        "features": batch_data.get("features", {}),
+        "labels": batch_data.get("labels"),
+        "ids": batch_data.get("ids") if include_ids else None,
+    }
+
+
+# def get_user_ids(
+#     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
+# ) -> np.ndarray | None:
+#     """Extract user IDs from a dataset dict or DataFrame."""
+#     if data is None:
+#         return None
+#     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
+#         return np.asarray(data[user_id_column].values)
+#     if isinstance(data, dict) and user_id_column in data:
+#         return np.asarray(data[user_id_column])
+#     return None
+
+
+# def get_user_ids_from_batch(
+#     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
+# ) -> np.ndarray | None:
+#     """Extract the prioritized user id column from a batch dict."""
+#     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
+#     if not ids_container:
+#         return None
+
+#     batch_user_id = None
+#     if id_columns:
+#         for id_name in id_columns:
+#             if id_name in ids_container:
+#                 batch_user_id = ids_container[id_name]
+#                 break
+#     if batch_user_id is None:
+#         batch_user_id = next(iter(ids_container.values()), None)
+#     if batch_user_id is None:
+#         return None
+
+#     if isinstance(batch_user_id, torch.Tensor):
+#         ids_np = batch_user_id.detach().cpu().numpy()
+#     else:
+#         ids_np = np.asarray(batch_user_id)
+#     if ids_np.ndim == 0:
+#         ids_np = ids_np.reshape(1)
+#     return ids_np.reshape(ids_np.shape[0])
+
+
+def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
+    id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
+    if not id_columns:
+        return None
+
+    main_id = id_columns[0]
+    if isinstance(data, pd.DataFrame) and main_id in data.columns:
+        arr = np.asarray(data[main_id].values)
+        return arr.reshape(arr.shape[0])
+    if isinstance(data, dict):
+        ids_container = data.get("ids")
+        if isinstance(ids_container, dict) and main_id in ids_container:
+            val = ids_container[main_id]
+            val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
+            return val.reshape(val.shape[0])
+        if main_id in data:
+            arr = np.asarray(data[main_id])
+            return arr.reshape(arr.shape[0])
+
+    return None
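Taken together, collate_fn, batch_to_dict, and the new get_user_ids define one batch schema. A short usage sketch (the toy tensors, and the assumption that stack_section stacks tensors per key, are illustrative, not from the package):

import torch
from nextrec.data.data_utils import collate_fn, batch_to_dict, get_user_ids

# Two sample-level dicts in the unified {"features", "labels", "ids"} schema.
samples = [
    {"features": {"age": torch.tensor(0.3)}, "labels": torch.tensor(1.0), "ids": {"user_id": torch.tensor(7)}},
    {"features": {"age": torch.tensor(0.5)}, "labels": torch.tensor(0.0), "ids": {"user_id": torch.tensor(9)}},
]

batch = collate_fn(samples)                            # stacks each section across samples
payload = batch_to_dict(batch)                         # validates and normalizes the dict
user_ids = get_user_ids(batch, id_columns="user_id")   # np.ndarray([7, 9]) via the "ids" container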
nextrec/data/dataloader.py
CHANGED

@@ -2,11 +2,10 @@
 Dataloader definitions

 Date: create on 27/10/2025
-Checkpoint: edit on …
+Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 import os
-import tqdm
 import torch
 import logging
 import numpy as np
@@ -18,15 +17,11 @@ from typing import cast

 from torch.utils.data import DataLoader, Dataset, IterableDataset
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet

 from nextrec.basic.loggers import colorize
-from nextrec.data import (
-    collate_fn,
-    resolve_file_paths,
-    read_table,
-)
+from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
+from nextrec.utils import to_tensor

 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -52,7 +47,7 @@ class TensorDictDataset(Dataset):
         sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}

-class FileDataset(FeatureSpecMixin, IterableDataset):
+class FileDataset(FeatureSet, IterableDataset):
     def __init__(self,
                  file_paths: list[str],  # file paths to read, containing CSV or Parquet files
                  dense_features: list[DenseFeature],  # dense feature definitions
@@ -67,44 +62,37 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.…
+        self.set_all_features(dense_features, sparse_features, sequence_features, target_columns, id_columns)
         self.current_file_index = 0
         self.total_files = len(file_paths)

     def __iter__(self):
         self.current_file_index = 0
-        self._file_pbar = None
-        if self.total_files > 1:
-            self._file_pbar = tqdm.tqdm(total=self.total_files, desc="Files", unit="file", position=0, leave=True, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
         for file_path in self.file_paths:
             self.current_file_index += 1
-            if self._file_pbar is not None:
-                self._file_pbar.update(1)
-            elif self.total_files == 1:
+            if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
             if self.file_type == 'csv':
-                yield from self.…
+                yield from self.read_csv_chunks(file_path)
             elif self.file_type == 'parquet':
-                yield from self.…
-        if self._file_pbar is not None:
-            self._file_pbar.close()
+                yield from self.read_parquet_chunks(file_path)

-    def …
+    def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
-            tensors = self.…
+            tensors = self.dataframeto_tensors(chunk)
             yield tensors

-    def …
+    def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
             chunk = batch.to_pandas()
-            tensors = self.…
+            tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk

-    def …
+    def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
                 raise ValueError("[DataLoader Error] DataProcessor must be fitted before using in streaming mode")
@@ -120,7 +108,7 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
         return batch


-class RecDataLoader(FeatureSpecMixin):
+class RecDataLoader(FeatureSet):
     def __init__(self,
                  dense_features: list[DenseFeature] | None = None,
                  sparse_features: list[SparseFeature] | None = None,
@@ -129,7 +117,7 @@ class RecDataLoader(FeatureSpecMixin):
                  id_columns: str | list[str] | None = None,
                  processor: DataProcessor | None = None):
         self.processor = processor
-        self.…
+        self.set_all_features(dense_features, sparse_features, sequence_features, target, id_columns)

     def create_dataloader(self,
                           data: dict | pd.DataFrame | str | DataLoader,
@@ -140,13 +128,13 @@ class RecDataLoader(FeatureSpecMixin):
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.…
+            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.…
+            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
         else:
             raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")

-    def …
+    def create_from_memory(self,
                            data: dict | pd.DataFrame,
                            batch_size: int,
                            shuffle: bool) -> DataLoader:
@@ -162,7 +150,7 @@ class RecDataLoader(FeatureSpecMixin):
         dataset = TensorDictDataset(tensors)
         return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

-    def …
+    def create_from_path(self,
                          path: str,
                          batch_size: int,
                          shuffle: bool,
@@ -179,7 +167,6 @@ class RecDataLoader(FeatureSpecMixin):
             except OSError:
                 pass
             try:
-                df = read_table(file_path, file_type)
                 dfs.append(df)
             except MemoryError as exc:
                 raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
@@ -187,11 +174,11 @@ class RecDataLoader(FeatureSpecMixin):
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
                 raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-            return self.…
+            return self.create_from_memory(combined_df, batch_size, shuffle,)
         else:
-            return self.…
+            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)

-    def …
+    def load_files_streaming(self,
                              file_paths: list[str],
                              file_type: str,
                              batch_size: int,
@@ -201,20 +188,10 @@ class RecDataLoader(FeatureSpecMixin):
             logging.warning("[RecDataLoader Warning] Shuffle is ignored in streaming mode (IterableDataset).")
         if batch_size != 1:
             logging.warning("[RecDataLoader Warning] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
-        dataset = FileDataset(
-            file_paths=file_paths,
-            dense_features=self.dense_features,
-            sparse_features=self.sparse_features,
-            sequence_features=self.sequence_features,
-            target_columns=self.target_columns,
-            id_columns=self.id_columns,
-            chunk_size=chunk_size,
-            file_type=file_type,
-            processor=self.processor
-        )
+        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
         return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)

-def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
         column = column.tolist()
     if isinstance(column, (list, tuple)):
@@ -250,25 +227,27 @@ def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)

-
-def build_tensors_from_data(  # noqa: C901
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
     id_columns: list[str]
 ) -> dict | None:
-    feature_tensors…
+    feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
             raise ValueError(f"[RecDataLoader Error] Feature column '{feature.name}' not found in data")
-        if isinstance(feature, SequenceFeature):
-            …
+        if isinstance(feature, SequenceFeature):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+            arr = normalize_sequence_column(column, feature)
+            tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
-            …
+            arr = np.asarray(column, dtype=np.float32)
+            tensor = to_tensor(arr, dtype=torch.float32)
         else:
-            …
+            arr = np.asarray(column, dtype=np.int64)
+            tensor = to_tensor(arr, dtype=torch.long)
         feature_tensors[feature.name] = tensor
     label_tensors = None
     if target_columns:
@@ -277,7 +256,7 @@ def build_tensors_from_data(
             column = get_column_data(data, target_name)
             if column is None:
                 continue
-            label_tensor = …
+            label_tensor = to_tensor(np.asarray(column, dtype=np.float32), dtype=torch.float32)
             if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                 label_tensor = label_tensor.t()
             if label_tensor.shape[1:] == (1,):
@@ -298,7 +277,7 @@ def build_tensors_from_data(
             id_arr = np.asarray(column, dtype=np.int64)
         except Exception as exc:
             raise TypeError(f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
-        id_tensors[id_col] = torch.…
+        id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None
     return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}
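The refactor leaves one entry point, create_dataloader, that dispatches on input type. A usage sketch (the feature constructors and the train_df variable are assumed for illustration; only keyword names visible in the diff are used):

from nextrec.data.dataloader import RecDataLoader
from nextrec.basic.features import DenseFeature, SparseFeature

loader = RecDataLoader(
    dense_features=[DenseFeature("age")],        # hypothetical feature specs
    sparse_features=[SparseFeature("item_id")],
    target="label",
    id_columns="user_id",
)

# In-memory dict/DataFrame -> TensorDictDataset with normal batching.
train_loader = loader.create_dataloader(train_df, batch_size=256, shuffle=True)

# Path -> load_full=True concatenates files in memory; load_full=False streams
# via FileDataset (DataLoader batch_size is forced to 1, each item is one chunk).
stream_loader = loader.create_dataloader("data/train.parquet", batch_size=1, load_full=False, chunk_size=50_000)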
nextrec/data/preprocessor.py
CHANGED

@@ -2,6 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

 Date: create on 13/11/2025
+Checkpoint: edit on 02/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 from __future__ import annotations
@@ -32,11 +33,11 @@ from nextrec.data.data_utils import (
     default_output_dir,
 )
 from nextrec.basic.session import resolve_save_path
-from nextrec.basic.features import FeatureSpecMixin
+from nextrec.basic.features import FeatureSet
 from nextrec.__version__ import __version__


-class DataProcessor(FeatureSpecMixin):
+class DataProcessor(FeatureSet):
     """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

     Examples:
@@ -70,7 +71,7 @@ class DataProcessor(FeatureSpecMixin):
         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.…
+        self.set_target_id([], [])

     def add_numeric_feature(
         self,
@@ -129,7 +130,7 @@ class DataProcessor(FeatureSpecMixin):
             'target_type': target_type,
             'label_map': label_map
         }
-        self.…
+        self.set_target_id(list(self.target_features.keys()), [])

     def _hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

nextrec/models/generative/hstu.py
CHANGED

@@ -344,7 +344,7 @@ class HSTU(BaseModel):
         loss_params.setdefault("ignore_index", self.ignore_index)

         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, scheduler=scheduler, scheduler_params=scheduler_params, loss="crossentropy", loss_params=loss_params)
-        self.…
+        self.register_regularization_weights(embedding_attr="token_embedding", include_modules=["layers", "lm_head"])

     def _build_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
         """
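FileDataset, RecDataLoader, and DataProcessor now share their feature bookkeeping through the FeatureSet base instead of the old FeatureSpecMixin. The real class lives in nextrec/basic/features.py; the sketch below only reconstructs the two setters from their call sites (the attribute names match what load_files_streaming reads back, the bodies are inferred, not confirmed):

class FeatureSet:
    """Hypothetical reconstruction: shared feature/target/id bookkeeping."""

    def set_all_features(self, dense_features, sparse_features, sequence_features,
                         target_columns, id_columns):
        self.dense_features = list(dense_features or [])
        self.sparse_features = list(sparse_features or [])
        self.sequence_features = list(sequence_features or [])
        self.set_target_id(target_columns, id_columns)

    def set_target_id(self, target_columns, id_columns):
        # Normalize str -> [str] so callers may pass a single column name.
        self.target_columns = [target_columns] if isinstance(target_columns, str) else list(target_columns or [])
        self.id_columns = [id_columns] if isinstance(id_columns, str) else list(id_columns or [])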
nextrec/models/match/dssm.py
CHANGED

@@ -143,11 +143,11 @@ class DSSM(BaseMatchModel):
             activation=dnn_activation
         )

-        self.…
+        self.register_regularization_weights(
             embedding_attr='user_embedding',
             include_modules=['user_dnn']
         )
-        self.…
+        self.register_regularization_weights(
             embedding_attr='item_embedding',
             include_modules=['item_dnn']
         )
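From here on, every model file repeats the same one-line change: register_regularization_weights loses its leading underscore and joins the shared BaseModel surface. Judging only from the call sites, it gathers an embedding table plus named submodules for the weight-decay penalty; a hypothetical sketch of that behavior (reg_params and the body are guesses, not the package's code):

def register_regularization_weights(self, embedding_attr: str, include_modules: list[str]):
    """Collect parameters the L2 penalty should cover (sketch only)."""
    embedding = getattr(self, embedding_attr, None)     # e.g. 'user_embedding'
    if embedding is not None:
        self.reg_params.extend(embedding.parameters())
    for name in include_modules:                        # e.g. ['user_dnn']
        module = getattr(self, name, None)
        if module is not None:
            self.reg_params.extend(module.parameters())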
nextrec/models/match/dssm_v2.py
CHANGED

@@ -134,11 +134,11 @@ class DSSM_v2(BaseMatchModel):
             activation=dnn_activation
         )

-        self.…
+        self.register_regularization_weights(
             embedding_attr='user_embedding',
             include_modules=['user_dnn']
         )
-        self.…
+        self.register_regularization_weights(
             embedding_attr='item_embedding',
             include_modules=['item_dnn']
         )
nextrec/models/match/mind.py
CHANGED

@@ -258,11 +258,11 @@ class MIND(BaseMatchModel):
         else:
             self.item_dnn = None

-        self.…
+        self.register_regularization_weights(
             embedding_attr='user_embedding',
             include_modules=['capsule_network']
         )
-        self.…
+        self.register_regularization_weights(
             embedding_attr='item_embedding',
             include_modules=['item_dnn'] if self.item_dnn else []
         )
nextrec/models/match/sdm.py
CHANGED

@@ -176,11 +176,11 @@ class SDM(BaseMatchModel):
         else:
             self.item_dnn = None

-        self.…
+        self.register_regularization_weights(
             embedding_attr='user_embedding',
             include_modules=['rnn', 'user_dnn']
         )
-        self.…
+        self.register_regularization_weights(
             embedding_attr='item_embedding',
             include_modules=['item_dnn'] if self.item_dnn else []
         )

nextrec/models/match/youtube_dnn.py
CHANGED

@@ -140,11 +140,11 @@ class YoutubeDNN(BaseMatchModel):
             activation=dnn_activation
         )

-        self.…
+        self.register_regularization_weights(
             embedding_attr='user_embedding',
             include_modules=['user_dnn']
         )
-        self.…
+        self.register_regularization_weights(
             embedding_attr='item_embedding',
             include_modules=['item_dnn']
         )

nextrec/models/multi_task/esmm.py
CHANGED

@@ -128,7 +128,7 @@ class ESMM(BaseModel):
         self.cvr_tower = MLP(input_dim=input_dim, output_layer=True, **cvr_params)
         self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1, 1])
         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['ctr_tower', 'cvr_tower'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x):

nextrec/models/multi_task/mmoe.py
CHANGED

@@ -146,7 +146,7 @@ class MMOE(BaseModel):
         self.towers.append(tower)
         self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['experts', 'gates', 'towers'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params,)

     def forward(self, x):
nextrec/models/multi_task/ple.py
CHANGED

@@ -249,7 +249,7 @@ class PLE(BaseModel):
         self.towers.append(tower)
         self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['cgc_layers', 'towers'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=self.loss, loss_params=loss_params)

     def forward(self, x):

nextrec/models/multi_task/poso.py
CHANGED

@@ -389,7 +389,7 @@ class POSO(BaseModel):
         self.tower_heads = None
         self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks,)
         include_modules = ["towers", "tower_heads"] if self.architecture == "mlp" else ["mmoe", "towers"]
-        self.…
+        self.register_regularization_weights(embedding_attr="embedding", include_modules=include_modules)
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x):

nextrec/models/multi_task/share_bottom.py
CHANGED

@@ -122,7 +122,7 @@ class ShareBottom(BaseModel):
         self.towers.append(tower)
         self.prediction_layer = PredictionLayer(task_type=self.task_type, task_dims=[1] * self.num_tasks)
         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['bottom', 'towers'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x):
nextrec/models/ranking/afm.py
CHANGED

@@ -81,7 +81,7 @@ class AFM(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['linear', 'attention_linear', 'attention_p', 'output_projection']
         )

nextrec/models/ranking/autoint.py
CHANGED

@@ -150,7 +150,7 @@ class AutoInt(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['projection_layers', 'attention_layers', 'fc']
         )
nextrec/models/ranking/dcn.py
CHANGED

@@ -109,7 +109,7 @@ class DCN(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['cross_network', 'mlp', 'final_layer']
         )
nextrec/models/ranking/deepfm.py
CHANGED

@@ -107,7 +107,7 @@ class DeepFM(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['linear', 'mlp'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x):
nextrec/models/ranking/dien.py
CHANGED

@@ -237,7 +237,7 @@ class DIEN(BaseModel):
         self.mlp = MLP(input_dim=mlp_input_dim, **mlp_params)
         self.prediction_layer = PredictionLayer(task_type=self.task_type)
         # Register regularization weights
-        self.…
+        self.register_regularization_weights(embedding_attr='embedding', include_modules=['interest_extractor', 'interest_evolution', 'attention_layer', 'mlp', 'candidate_proj'])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x):
nextrec/models/ranking/din.py
CHANGED

@@ -108,7 +108,7 @@ class DIN(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['attention', 'mlp', 'candidate_attention_proj']
         )

nextrec/models/ranking/fibinet.py
CHANGED

@@ -104,7 +104,7 @@ class FiBiNET(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['linear', 'senet', 'bilinear_standard', 'bilinear_senet', 'mlp']
         )
nextrec/models/ranking/fm.py
CHANGED

@@ -69,7 +69,7 @@ class FM(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         # Register regularization weights
-        self.…
+        self.register_regularization_weights(
             embedding_attr='embedding',
             include_modules=['linear']
         )

nextrec/models/ranking/masknet.py
CHANGED

@@ -234,10 +234,10 @@ class MaskNet(BaseModel):
         self.prediction_layer = PredictionLayer(task_type=self.task_type)

         if self.model_type == "serial":
-            self.…
+            self.register_regularization_weights(embedding_attr="embedding", include_modules=["mask_blocks", "output_layer"],)
         # serial
         else:
-            self.…
+            self.register_regularization_weights(embedding_attr="embedding", include_modules=["mask_blocks", "final_mlp"])
         self.compile(optimizer=optimizer, optimizer_params=optimizer_params, loss=loss, loss_params=loss_params)

     def forward(self, x: dict[str, torch.Tensor]) -> torch.Tensor:
nextrec/models/ranking/pnn.py
CHANGED