nextrec 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +10 -23
- nextrec/basic/layers.py +18 -61
- nextrec/basic/loggers.py +1 -1
- nextrec/basic/metrics.py +55 -33
- nextrec/basic/model.py +258 -394
- nextrec/data/__init__.py +2 -2
- nextrec/data/data_utils.py +80 -4
- nextrec/data/dataloader.py +36 -57
- nextrec/data/preprocessor.py +5 -4
- nextrec/models/generative/__init__.py +5 -0
- nextrec/models/generative/hstu.py +399 -0
- nextrec/models/match/dssm.py +2 -2
- nextrec/models/match/dssm_v2.py +2 -2
- nextrec/models/match/mind.py +2 -2
- nextrec/models/match/sdm.py +2 -2
- nextrec/models/match/youtube_dnn.py +2 -2
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/multi_task/mmoe.py +1 -1
- nextrec/models/multi_task/ple.py +1 -1
- nextrec/models/multi_task/poso.py +1 -1
- nextrec/models/multi_task/share_bottom.py +1 -1
- nextrec/models/ranking/afm.py +1 -1
- nextrec/models/ranking/autoint.py +1 -1
- nextrec/models/ranking/dcn.py +1 -1
- nextrec/models/ranking/deepfm.py +1 -1
- nextrec/models/ranking/dien.py +1 -1
- nextrec/models/ranking/din.py +1 -1
- nextrec/models/ranking/fibinet.py +1 -1
- nextrec/models/ranking/fm.py +1 -1
- nextrec/models/ranking/masknet.py +2 -2
- nextrec/models/ranking/pnn.py +1 -1
- nextrec/models/ranking/widedeep.py +1 -1
- nextrec/models/ranking/xdeepfm.py +1 -1
- nextrec/utils/__init__.py +2 -1
- nextrec/utils/common.py +21 -2
- nextrec/utils/optimizer.py +7 -3
- {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/METADATA +10 -4
- nextrec-0.3.3.dist-info/RECORD +57 -0
- nextrec-0.3.1.dist-info/RECORD +0 -56
- {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/WHEEL +0 -0
- {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/licenses/LICENSE +0 -0
nextrec/data/__init__.py
CHANGED
@@ -18,7 +18,7 @@ from nextrec.data.data_utils import (
     read_table,
     load_dataframes,
 )
-from nextrec.basic.features import
+from nextrec.basic.features import FeatureSet
 from nextrec.data import data_utils
 from nextrec.data.dataloader import (
     TensorDictDataset,
@@ -38,7 +38,7 @@ __all__ = [
     'iter_file_chunks',
     'read_table',
     'load_dataframes',
-    '
+    'FeatureSet',
     'data_utils',
     'TensorDictDataset',
     'FileDataset',
nextrec/data/data_utils.py
CHANGED
@@ -5,8 +5,9 @@ import numpy as np
 import pandas as pd
 import pyarrow.parquet as pq
 from pathlib import Path
+from typing import Any, Mapping, Sequence
 
-def
+def stack_section(batch: list[dict], section: str):
     """Stack one section of the batch (features/labels/ids)."""
     entries = [item.get(section) for item in batch if item.get(section) is not None]
     if not entries:
@@ -39,9 +40,9 @@ def collate_fn(batch):
         "ids": first.get("ids"),
     }
     return {
-        "features":
-        "labels":
-        "ids":
+        "features": stack_section(batch, "features") or {},
+        "labels": stack_section(batch, "labels"),
+        "ids": stack_section(batch, "ids"),
     }
 
 # Fallback: stack tuples/lists of tensors
@@ -190,3 +191,78 @@ def build_eval_candidates(
     eval_df = eval_df.merge(user_features, on=user_col, how='left')
     eval_df = eval_df.merge(item_features, on=item_col, how='left')
     return eval_df
+
+def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
+    """Standardize a dataloader batch into a dict of features, labels, and ids."""
+    if not (isinstance(batch_data, Mapping) and "features" in batch_data):
+        raise TypeError(
+            "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
+        )
+    return {
+        "features": batch_data.get("features", {}),
+        "labels": batch_data.get("labels"),
+        "ids": batch_data.get("ids") if include_ids else None,
+    }
+
+
+# def get_user_ids(
+#     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
+# ) -> np.ndarray | None:
+#     """Extract user IDs from a dataset dict or DataFrame."""
+#     if data is None:
+#         return None
+#     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
+#         return np.asarray(data[user_id_column].values)
+#     if isinstance(data, dict) and user_id_column in data:
+#         return np.asarray(data[user_id_column])
+#     return None
+
+
+# def get_user_ids_from_batch(
+#     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
+# ) -> np.ndarray | None:
+#     """Extract the prioritized user id column from a batch dict."""
+#     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
+#     if not ids_container:
+#         return None
+
+#     batch_user_id = None
+#     if id_columns:
+#         for id_name in id_columns:
+#             if id_name in ids_container:
+#                 batch_user_id = ids_container[id_name]
+#                 break
+#     if batch_user_id is None:
+#         batch_user_id = next(iter(ids_container.values()), None)
+#     if batch_user_id is None:
+#         return None
+
+#     if isinstance(batch_user_id, torch.Tensor):
+#         ids_np = batch_user_id.detach().cpu().numpy()
+#     else:
+#         ids_np = np.asarray(batch_user_id)
+#     if ids_np.ndim == 0:
+#         ids_np = ids_np.reshape(1)
+#     return ids_np.reshape(ids_np.shape[0])
+
+
+def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
+    id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
+    if not id_columns:
+        return None
+
+    main_id = id_columns[0]
+    if isinstance(data, pd.DataFrame) and main_id in data.columns:
+        arr = np.asarray(data[main_id].values)
+        return arr.reshape(arr.shape[0])
+    if isinstance(data, dict):
+        ids_container = data.get("ids")
+        if isinstance(ids_container, dict) and main_id in ids_container:
+            val = ids_container[main_id]
+            val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
+            return val.reshape(val.shape[0])
+        if main_id in data:
+            arr = np.asarray(data[main_id])
+            return arr.reshape(arr.shape[0])
+
+    return None
nextrec/data/dataloader.py
CHANGED
@@ -2,11 +2,10 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
-Checkpoint: edit on
+Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 import os
-import tqdm
 import torch
 import logging
 import numpy as np
@@ -18,15 +17,11 @@ from typing import cast
 
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature,
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
 
 from nextrec.basic.loggers import colorize
-from nextrec.data import
-
-    collate_fn,
-    resolve_file_paths,
-    read_table,
-)
+from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
+from nextrec.utils import to_tensor
 
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -52,7 +47,7 @@ class TensorDictDataset(Dataset):
         sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
-class FileDataset(
+class FileDataset(FeatureSet, IterableDataset):
     def __init__(self,
                  file_paths: list[str], # file paths to read, containing CSV or Parquet files
                  dense_features: list[DenseFeature], # dense feature definitions
@@ -67,44 +62,37 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.
+        self.set_all_features(dense_features, sparse_features, sequence_features, target_columns, id_columns)
         self.current_file_index = 0
         self.total_files = len(file_paths)
 
     def __iter__(self):
         self.current_file_index = 0
-        self._file_pbar = None
-        if self.total_files > 1:
-            self._file_pbar = tqdm.tqdm(total=self.total_files, desc="Files", unit="file", position=0, leave=True, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
         for file_path in self.file_paths:
             self.current_file_index += 1
-            if self.
-                self._file_pbar.update(1)
-            elif self.total_files == 1:
+            if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
             if self.file_type == 'csv':
-                yield from self.
+                yield from self.read_csv_chunks(file_path)
             elif self.file_type == 'parquet':
-                yield from self.
-        if self._file_pbar is not None:
-            self._file_pbar.close()
+                yield from self.read_parquet_chunks(file_path)
 
-    def
+    def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
-            tensors = self.
+            tensors = self.dataframeto_tensors(chunk)
             yield tensors
 
-    def
+    def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
             chunk = batch.to_pandas()
-            tensors = self.
+            tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk
 
-    def
+    def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
                 raise ValueError("[DataLoader Error] DataProcessor must be fitted before using in streaming mode")
@@ -120,7 +108,7 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
         return batch
 
 
-class RecDataLoader(
+class RecDataLoader(FeatureSet):
     def __init__(self,
                  dense_features: list[DenseFeature] | None = None,
                  sparse_features: list[SparseFeature] | None = None,
@@ -129,7 +117,7 @@ class RecDataLoader(FeatureSpecMixin):
                  id_columns: str | list[str] | None = None,
                  processor: DataProcessor | None = None):
         self.processor = processor
-        self.
+        self.set_all_features(dense_features, sparse_features, sequence_features, target, id_columns)
 
     def create_dataloader(self,
                           data: dict | pd.DataFrame | str | DataLoader,
@@ -140,13 +128,13 @@ class RecDataLoader(FeatureSpecMixin):
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.
+            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.
+            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
         else:
             raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
 
-    def
+    def create_from_memory(self,
                            data: dict | pd.DataFrame,
                            batch_size: int,
                            shuffle: bool) -> DataLoader:
@@ -162,7 +150,7 @@ class RecDataLoader(FeatureSpecMixin):
         dataset = TensorDictDataset(tensors)
         return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
 
-    def
+    def create_from_path(self,
                          path: str,
                          batch_size: int,
                          shuffle: bool,
@@ -179,7 +167,6 @@ class RecDataLoader(FeatureSpecMixin):
             except OSError:
                 pass
             try:
-                df = read_table(file_path, file_type)
                 dfs.append(df)
             except MemoryError as exc:
                 raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
@@ -187,11 +174,11 @@ class RecDataLoader(FeatureSpecMixin):
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
                 raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-            return self.
+            return self.create_from_memory(combined_df, batch_size, shuffle,)
         else:
-            return self.
+            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
 
-    def
+    def load_files_streaming(self,
                              file_paths: list[str],
                              file_type: str,
                              batch_size: int,
@@ -201,20 +188,10 @@ class RecDataLoader(FeatureSpecMixin):
             logging.warning("[RecDataLoader Warning] Shuffle is ignored in streaming mode (IterableDataset).")
         if batch_size != 1:
             logging.warning("[RecDataLoader Warning] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
-        dataset = FileDataset(
-            file_paths=file_paths,
-            dense_features=self.dense_features,
-            sparse_features=self.sparse_features,
-            sequence_features=self.sequence_features,
-            target_columns=self.target_columns,
-            id_columns=self.id_columns,
-            chunk_size=chunk_size,
-            file_type=file_type,
-            processor=self.processor
-        )
+        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
         return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
 
-def
+def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
         column = column.tolist()
     if isinstance(column, (list, tuple)):
@@ -250,25 +227,27 @@ def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)
 
-
-def build_tensors_from_data( # noqa: C901
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
     id_columns: list[str]
 ) -> dict | None:
-    feature_tensors
+    feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
             raise ValueError(f"[RecDataLoader Error] Feature column '{feature.name}' not found in data")
-        if isinstance(feature, SequenceFeature):
-
+        if isinstance(feature, SequenceFeature): # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+            arr = normalize_sequence_column(column, feature)
+            tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
-
+            arr = np.asarray(column, dtype=np.float32)
+            tensor = to_tensor(arr, dtype=torch.float32)
         else:
-
+            arr = np.asarray(column, dtype=np.int64)
+            tensor = to_tensor(arr, dtype=torch.long)
         feature_tensors[feature.name] = tensor
     label_tensors = None
     if target_columns:
@@ -277,7 +256,7 @@ def build_tensors_from_data( # noqa: C901
             column = get_column_data(data, target_name)
             if column is None:
                 continue
-            label_tensor =
+            label_tensor = to_tensor(np.asarray(column, dtype=np.float32), dtype=torch.float32)
             if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                 label_tensor = label_tensor.t()
             if label_tensor.shape[1:] == (1,):
@@ -298,7 +277,7 @@ def build_tensors_from_data( # noqa: C901
                 id_arr = np.asarray(column, dtype=np.int64)
             except Exception as exc:
                 raise TypeError( f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
-            id_tensors[id_col] = torch.
+            id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None
     return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}
nextrec/data/preprocessor.py
CHANGED
@@ -2,6 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
 Date: create on 13/11/2025
+Checkpoint: edit on 02/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 from __future__ import annotations
@@ -32,11 +33,11 @@ from nextrec.data.data_utils import (
     default_output_dir,
 )
 from nextrec.basic.session import resolve_save_path
-from nextrec.basic.features import
+from nextrec.basic.features import FeatureSet
 from nextrec.__version__ import __version__
 
 
-class DataProcessor(
+class DataProcessor(FeatureSet):
     """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
     Examples:
@@ -70,7 +71,7 @@ class DataProcessor(FeatureSpecMixin):
         self.scalers: Dict[str, Any] = {}
         self.label_encoders: Dict[str, LabelEncoder] = {}
         self.target_encoders: Dict[str, Dict[str, int]] = {}
-        self.
+        self.set_target_id([], [])
 
     def add_numeric_feature(
         self,
@@ -129,7 +130,7 @@ class DataProcessor(FeatureSpecMixin):
             'target_type': target_type,
             'label_map': label_map
         }
-        self.
+        self.set_target_id(list(self.target_features.keys()), [])
 
     def _hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
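The FeatureSet mixin that DataProcessor, RecDataLoader, and FileDataset now inherit from lives in nextrec/basic/features.py and is not included in this diff. Purely as an illustration, a minimal sketch of the two setters the changed code calls might look like the following; the attribute names are taken from how they are read in dataloader.py above, and everything else is an assumption rather than the package's actual implementation.

class FeatureSetSketch:
    """Hypothetical stand-in for nextrec.basic.features.FeatureSet (illustration only)."""

    def set_all_features(self, dense_features, sparse_features, sequence_features,
                         target_columns, id_columns):
        # Store feature definitions so subclasses can later read self.dense_features,
        # self.sparse_features, and self.sequence_features (as load_files_streaming does).
        self.dense_features = dense_features or []
        self.sparse_features = sparse_features or []
        self.sequence_features = sequence_features or []
        self.set_target_id(target_columns, id_columns)

    def set_target_id(self, target_columns, id_columns):
        # Normalize str | list[str] | None into lists, matching the types the dataloader accepts.
        def to_list(value):
            if value is None:
                return []
            return value if isinstance(value, list) else [value]

        self.target_columns = to_list(target_columns)
        self.id_columns = to_list(id_columns)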
|