nextrec 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +1 -1
- nextrec/basic/loggers.py +71 -8
- nextrec/basic/model.py +45 -11
- nextrec/basic/session.py +3 -10
- nextrec/data/__init__.py +47 -9
- nextrec/data/batch_utils.py +80 -0
- nextrec/data/data_processing.py +152 -0
- nextrec/data/data_utils.py +35 -268
- nextrec/data/dataloader.py +6 -4
- nextrec/data/preprocessor.py +39 -85
- nextrec/models/multi_task/poso.py +1 -1
- nextrec/utils/__init__.py +53 -3
- nextrec/utils/device.py +37 -0
- nextrec/utils/feature.py +13 -0
- nextrec/utils/file.py +70 -0
- nextrec/utils/initializer.py +0 -8
- nextrec/utils/model.py +22 -0
- nextrec/utils/optimizer.py +0 -19
- nextrec/utils/tensor.py +61 -0
- {nextrec-0.3.3.dist-info → nextrec-0.3.5.dist-info}/METADATA +3 -3
- {nextrec-0.3.3.dist-info → nextrec-0.3.5.dist-info}/RECORD +24 -18
- nextrec/utils/common.py +0 -60
- {nextrec-0.3.3.dist-info → nextrec-0.3.5.dist-info}/WHEEL +0 -0
- {nextrec-0.3.3.dist-info → nextrec-0.3.5.dist-info}/licenses/LICENSE +0 -0
nextrec/data/data_utils.py
CHANGED
@@ -1,268 +1,35 @@
-"""
[... removed lines 2-35 are not rendered in this diff view ...]
-    if first.get("_already_batched") and len(batch) == 1:
-        return {
-            "features": first.get("features", {}),
-            "labels": first.get("labels"),
-            "ids": first.get("ids"),
-        }
-    return {
-        "features": stack_section(batch, "features") or {},
-        "labels": stack_section(batch, "labels"),
-        "ids": stack_section(batch, "ids"),
-    }
-
-    # Fallback: stack tuples/lists of tensors
-    num_tensors = len(first)
-    result = []
-    for i in range(num_tensors):
-        tensor_list = [item[i] for item in batch]
-        first_item = tensor_list[0]
-        if isinstance(first_item, torch.Tensor):
-            stacked = torch.cat(tensor_list, dim=0)
-        elif isinstance(first_item, np.ndarray):
-            stacked = np.concatenate(tensor_list, axis=0)
-        elif isinstance(first_item, list):
-            combined = []
-            for entry in tensor_list:
-                combined.extend(entry)
-            stacked = combined
-        else:
-            stacked = tensor_list
-        result.append(stacked)
-    return tuple(result)
-
-def get_column_data(data: dict | pd.DataFrame, name: str):
-    """Extract column data from various data structures."""
-    if isinstance(data, dict):
-        return data[name] if name in data else None
-    elif isinstance(data, pd.DataFrame):
-        if name not in data.columns:
-            return None
-        return data[name].values
-    else:
-        if hasattr(data, name):
-            return getattr(data, name)
-    raise KeyError(f"Unsupported data type for extracting column {name}")
-
-def resolve_file_paths(path: str) -> tuple[list[str], str]:
-    """Resolve file or directory path into a sorted list of files and file type."""
-    path_obj = Path(path)
-
-    if path_obj.is_file():
-        file_type = path_obj.suffix.lower().lstrip(".")
-        assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
-        return [str(path_obj)], file_type
-
-    if path_obj.is_dir():
-        collected_files = [p for p in path_obj.iterdir() if p.is_file()]
-        csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
-        parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
-
-        if csv_files and parquet_files:
-            raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
-        file_paths = csv_files if csv_files else parquet_files
-        if not file_paths:
-            raise ValueError(f"No CSV or Parquet files found in directory: {path}")
-        file_paths.sort()
-        file_type = "csv" if csv_files else "parquet"
-        return file_paths, file_type
-
-    raise ValueError(f"Invalid path: {path}")
-
-def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
-    """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
-    if file_type == "csv":
-        yield from pd.read_csv(file_path, chunksize=chunk_size)
-        return
-    parquet_file = pq.ParquetFile(file_path)
-    for batch in parquet_file.iter_batches(batch_size=chunk_size):
-        yield batch.to_pandas()
-
-def read_table(file_path: str, file_type: str) -> pd.DataFrame:
-    """Read a single CSV/Parquet file."""
-    if file_type == "csv":
-        return pd.read_csv(file_path)
-    return pd.read_parquet(file_path)
-
-def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
-    """Load multiple files of the same type into DataFrames."""
-    return [read_table(fp, file_type) for fp in file_paths]
-
-def default_output_dir(path: str) -> Path:
-    """Generate a default output directory path based on the input path."""
-    path_obj = Path(path)
-    if path_obj.is_file():
-        return path_obj.parent / f"{path_obj.stem}_preprocessed"
-    return path_obj.with_name(f"{path_obj.name}_preprocessed")
-
-def split_dict_random(data_dict: dict, test_size: float = 0.2, random_state: int | None = None):
-    """Randomly split a dictionary of data into training and testing sets."""
-    lengths = [len(v) for v in data_dict.values()]
-    if len(set(lengths)) != 1:
-        raise ValueError(f"Length mismatch: {lengths}")
-    n = lengths[0]
-    rng = np.random.default_rng(random_state)
-    perm = rng.permutation(n)
-    cut = int(round(n * (1 - test_size)))
-    train_idx, test_idx = perm[:cut], perm[cut:]
-    def take(v, idx):
-        if isinstance(v, np.ndarray):
-            return v[idx]
-        elif isinstance(v, pd.Series):
-            return v.iloc[idx].to_numpy()
-        else:
-            v_arr = np.asarray(v, dtype=object)
-            return v_arr[idx]
-    train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
-    test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
-    return train_dict, test_dict
-
-def build_eval_candidates(
-    df_all: pd.DataFrame,
-    user_col: str,
-    item_col: str,
-    label_col: str,
-    user_features: pd.DataFrame,
-    item_features: pd.DataFrame,
-    num_pos_per_user: int = 5,
-    num_neg_per_pos: int = 50,
-    random_seed: int = 2025,
-) -> pd.DataFrame:
-    """Build evaluation candidates with positive and negative samples for each user. """
-    rng = np.random.default_rng(random_seed)
-
-    users = df_all[user_col].unique()
-    all_items = item_features[item_col].unique()
-    rows = []
-    user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}
-    for u in users:
-        df_user = df_all[df_all[user_col] == u]
-        pos_items = df_user[df_user[label_col] == 1][item_col].unique()
-        if len(pos_items) == 0:
-            continue
-        pos_items = pos_items[:num_pos_per_user]
-        seen_items = set(user_hist_items[u])
-        neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
-        if len(neg_pool) == 0:
-            continue
-        for pos in pos_items:
-            if len(neg_pool) <= num_neg_per_pos:
-                neg_items = neg_pool
-            else:
-                neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
-            rows.append((u, pos, 1))
-            for ni in neg_items:
-                rows.append((u, ni, 0))
-    eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
-    eval_df = eval_df.merge(user_features, on=user_col, how='left')
-    eval_df = eval_df.merge(item_features, on=item_col, how='left')
-    return eval_df
-
-def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
-    """Standardize a dataloader batch into a dict of features, labels, and ids."""
-    if not (isinstance(batch_data, Mapping) and "features" in batch_data):
-        raise TypeError(
-            "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
-        )
-    return {
-        "features": batch_data.get("features", {}),
-        "labels": batch_data.get("labels"),
-        "ids": batch_data.get("ids") if include_ids else None,
-    }
-
-
-# def get_user_ids(
-#     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
-# ) -> np.ndarray | None:
-#     """Extract user IDs from a dataset dict or DataFrame."""
-#     if data is None:
-#         return None
-#     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
-#         return np.asarray(data[user_id_column].values)
-#     if isinstance(data, dict) and user_id_column in data:
-#         return np.asarray(data[user_id_column])
-#     return None
-
-
-# def get_user_ids_from_batch(
-#     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
-# ) -> np.ndarray | None:
-#     """Extract the prioritized user id column from a batch dict."""
-#     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
-#     if not ids_container:
-#         return None
-
-#     batch_user_id = None
-#     if id_columns:
-#         for id_name in id_columns:
-#             if id_name in ids_container:
-#                 batch_user_id = ids_container[id_name]
-#                 break
-#     if batch_user_id is None:
-#         batch_user_id = next(iter(ids_container.values()), None)
-#     if batch_user_id is None:
-#         return None
-
-#     if isinstance(batch_user_id, torch.Tensor):
-#         ids_np = batch_user_id.detach().cpu().numpy()
-#     else:
-#         ids_np = np.asarray(batch_user_id)
-#     if ids_np.ndim == 0:
-#         ids_np = ids_np.reshape(1)
-#     return ids_np.reshape(ids_np.shape[0])
-
-
-def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
-    id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
-    if not id_columns:
-        return None
-
-    main_id = id_columns[0]
-    if isinstance(data, pd.DataFrame) and main_id in data.columns:
-        arr = np.asarray(data[main_id].values)
-        return arr.reshape(arr.shape[0])
-    if isinstance(data, dict):
-        ids_container = data.get("ids")
-        if isinstance(ids_container, dict) and main_id in ids_container:
-            val = ids_container[main_id]
-            val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
-            return val.reshape(val.shape[0])
-        if main_id in data:
-            arr = np.asarray(data[main_id])
-            return arr.reshape(arr.shape[0])
-
-    return None
+"""
+Data processing utilities for NextRec (Refactored)
+
+This module now re-exports functions from specialized submodules:
+- batch_utils: collate_fn, batch_to_dict
+- data_processing: get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+- nextrec.utils.file_utils: resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+Date: create on 27/10/2025
+Last update: 03/12/2025 (refactored)
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+# Import from new organized modules
+from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
+from nextrec.data.data_processing import get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+__all__ = [
+    # Batch utilities
+    'collate_fn',
+    'batch_to_dict',
+    'stack_section',
+    # Data processing
+    'get_column_data',
+    'split_dict_random',
+    'build_eval_candidates',
+    'get_user_ids',
+    # File utilities
+    'resolve_file_paths',
+    'iter_file_chunks',
+    'read_table',
+    'load_dataframes',
+    'default_output_dir',
+]
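As a sanity check on the refactor, `data_utils.py` keeps the old import path working purely through the re-exports shown above. A minimal sketch, assuming nextrec 0.3.5 is installed; the assertions rely only on the re-export lines in the diff:

```python
# data_utils.py is now a thin re-export shim, so the old and new import paths
# should resolve to the same objects (assumes nextrec 0.3.5 is installed).
from nextrec.data import data_utils
from nextrec.data.batch_utils import collate_fn
from nextrec.utils.file import resolve_file_paths

assert data_utils.collate_fn is collate_fn
assert data_utils.resolve_file_paths is resolve_file_paths
print(data_utils.__all__)  # the names listed in the diff above
```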
nextrec/data/dataloader.py
CHANGED
@@ -20,8 +20,10 @@ from nextrec.data.preprocessor import DataProcessor
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
 
 from nextrec.basic.loggers import colorize
-from nextrec.data import get_column_data
-from nextrec.
+from nextrec.data.data_processing import get_column_data
+from nextrec.data.batch_utils import collate_fn
+from nextrec.utils.file import resolve_file_paths, read_table
+from nextrec.utils.tensor import to_tensor
 
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -185,9 +187,9 @@ class RecDataLoader(FeatureSet):
                               chunk_size: int,
                               shuffle: bool) -> DataLoader:
        if shuffle:
-            logging.
+            logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
        if batch_size != 1:
-            logging.
+            logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
 
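The second hunk spells out the streaming-mode behaviour: shuffle is ignored and batch_size is forced to 1, so chunk_size is the knob for memory and throughput. Below is a small sketch, assuming nextrec 0.3.5 is installed, that only exercises the new import locations referenced in the hunks above; the sample dict and expected output follow the removed data_utils implementation of get_column_data and are assumed unchanged after the move:

```python
# Import check for the new module layout used by dataloader.py
# (assumes nextrec 0.3.5 is installed).
from nextrec.data.data_processing import get_column_data
from nextrec.data.batch_utils import collate_fn
from nextrec.utils.file import resolve_file_paths, read_table
from nextrec.utils.tensor import to_tensor

# get_column_data on a plain dict returns the column if present, else None
# (behaviour taken from the removed data_utils version shown earlier).
print(get_column_data({"user_id": [1, 2, 3]}, "user_id"))  # [1, 2, 3]
print(get_column_data({"user_id": [1, 2, 3]}, "item_id"))  # None
```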
nextrec/data/preprocessor.py
CHANGED
@@ -16,48 +16,18 @@ import pandas as pd
 import tqdm
 from pathlib import Path
 from typing import Dict, Union, Optional, Literal, Any
-from sklearn.preprocessing import
-
-    MinMaxScaler,
-    RobustScaler,
-    MaxAbsScaler,
-    LabelEncoder
-)
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
+
 
-from nextrec.basic.loggers import setup_logger, colorize
-from nextrec.data.data_utils import (
-    resolve_file_paths,
-    iter_file_chunks,
-    read_table,
-    load_dataframes,
-    default_output_dir,
-)
-from nextrec.basic.session import resolve_save_path
 from nextrec.basic.features import FeatureSet
+from nextrec.basic.loggers import colorize
+from nextrec.basic.session import resolve_save_path
+from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
 from nextrec.__version__ import __version__
 
 
 class DataProcessor(FeatureSet):
-    """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
-
-    Examples:
-        >>> processor = DataProcessor()
-        >>> processor.add_numeric_feature('age', scaler='standard')
-        >>> processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
-        >>> processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
-        >>> processor.add_target('label', target_type='binary')
-        >>>
-        >>> # Fit and transform data
-        >>> processor.fit(train_df)
-        >>> processed_data = processor.transform(test_df)  # Returns dict of numpy arrays
-        >>>
-        >>> # Save and load processor
-        >>> processor.save('processor.pkl')
-        >>> loaded_processor = DataProcessor.load('processor.pkl')
-        >>>
-        >>> # Get vocabulary sizes for embedding layers
-        >>> vocab_sizes = processor.get_vocab_sizes()
-    """
     def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
@@ -132,10 +102,10 @@ class DataProcessor(FeatureSet):
         }
         self.set_target_id(list(self.target_features.keys()), [])
 
-    def
+    def hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
 
-    def
+    def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']
@@ -164,7 +134,7 @@ class DataProcessor(FeatureSet):
         scaler.fit(values)
         self.scalers[name] = scaler
 
-    def
+    def process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         scaler_type = config['scaler']
@@ -184,7 +154,7 @@ class DataProcessor(FeatureSet):
         result = scaler.transform(values.reshape(-1, 1)).ravel()
         return result
 
-    def
+    def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']  # <UNK>
@@ -197,7 +167,7 @@ class DataProcessor(FeatureSet):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']
 
-    def
+    def process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']
@@ -215,11 +185,11 @@ class DataProcessor(FeatureSet):
             return encoded.to_numpy()
         if encode_method == 'hash':
             hash_size = config['hash_size']
-            hash_fn = self.
+            hash_fn = self.hash_string
             return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)
 
-    def
+    def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']
@@ -252,7 +222,7 @@ class DataProcessor(FeatureSet):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']
 
-    def
+    def process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']
@@ -276,7 +246,7 @@ class DataProcessor(FeatureSet):
             config['_class_to_idx'] = class_to_idx
         else:
             class_to_idx = None  # type: ignore
-        hash_fn = self.
+        hash_fn = self.hash_string
         hash_size = config.get('hash_size')
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
@@ -301,11 +271,7 @@ class DataProcessor(FeatureSet):
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
-                encoded = [
-                    hash_fn(str(token), hash_size)
-                    for token in tokens
-                    if str(token).strip()
-                ]
+                encoded = [hash_fn(str(token), hash_size) for token in tokens if str(token).strip()]
             else:
                 encoded = []
             if not encoded:
@@ -315,7 +281,7 @@ class DataProcessor(FeatureSet):
             output[i, : len(encoded)] = encoded
         return output
 
-    def
+    def process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
         label_map = config.get('label_map')
@@ -334,7 +300,7 @@ class DataProcessor(FeatureSet):
             config['label_map'] = label_map
             self.target_encoders[name] = label_map
 
-    def
+    def process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         target_type = config.get('target_type')
@@ -355,13 +321,13 @@ class DataProcessor(FeatureSet):
                 result.append(0)
         return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)
 
-    def
+    def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
         """Load all data from a file or directory path into a single DataFrame."""
         file_paths, file_type = resolve_file_paths(path)
         frames = load_dataframes(file_paths, file_type)
         return pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
 
-    def
+    def extract_sequence_tokens(self, value: Any, separator: str) -> list[str]:
         """Extract sequence tokens from a single value."""
         if value is None:
             return []
@@ -374,7 +340,7 @@ class DataProcessor(FeatureSet):
             return [str(v) for v in value]
         return [str(value)]
 
-    def
+    def fit_from_path(self, path: str, chunk_size: int) -> 'DataProcessor':
         """Fit processor statistics by streaming files to reduce memory usage."""
         logger = logging.getLogger()
         logger.info(colorize("Fitting DataProcessor (streaming path mode)...", color="cyan", bold=True))
@@ -433,7 +399,7 @@ class DataProcessor(FeatureSet):
                    series = chunk[name]
                    tokens = []
                    for val in series:
-                        tokens.extend(self.
+                        tokens.extend(self.extract_sequence_tokens(val, separator))
                    seq_vocab[name].update(tokens)
 
                # target features
@@ -548,7 +514,7 @@ class DataProcessor(FeatureSet):
         logger.info(colorize("DataProcessor fitted successfully (streaming path mode)", color="green", bold=True))
         return self
 
-    def
+    def transform_in_memory(
         self,
         data: Union[pd.DataFrame, Dict[str, Any]],
         return_dict: bool,
@@ -581,7 +547,7 @@ class DataProcessor(FeatureSet):
                 continue
             # Convert to Series for processing
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_numeric_feature_transform(series_data, config)
             result_dict[name] = processed
 
         # process sparse features
@@ -590,7 +556,7 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_sparse_feature_transform(series_data, config)
             result_dict[name] = processed
 
         # process sequence features
@@ -599,7 +565,7 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_sequence_feature_transform(series_data, config)
             result_dict[name] = processed
 
         # process target features
@@ -608,10 +574,10 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Target {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_target_transform(series_data, config)
             result_dict[name] = processed
 
-        def
+        def dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
             # Convert all arrays to Series/lists at once to avoid fragmentation
             columns_dict = {}
             for key, value in result.items():
@@ -629,7 +595,7 @@ class DataProcessor(FeatureSet):
         effective_format = save_format or "parquet"
         result_df = None
         if (not return_dict) or persist:
-            result_df =
+            result_df = dict_to_dataframe(result_dict)
         if persist:
             if output_path is None:
                 raise ValueError("output_path must be provided when persisting transformed data.")
@@ -649,7 +615,7 @@ class DataProcessor(FeatureSet):
         assert result_df is not None, "DataFrame is None after transform"
         return result_df
 
-    def
+    def transform_path(
         self,
         input_path: str,
         output_path: Optional[str],
@@ -669,13 +635,7 @@ class DataProcessor(FeatureSet):
         saved_paths = []
         for file_path in tqdm.tqdm(file_paths, desc="Transforming files", unit="file"):
             df = read_table(file_path, file_type)
-            transformed_df = self.
-                df,
-                return_dict=False,
-                persist=False,
-                save_format=None,
-                output_path=None,
-            )
+            transformed_df = self.transform_in_memory(df, return_dict=False, persist=False, save_format=None, output_path=None)
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
             source_path = Path(file_path)
             target_file = output_root / f"{source_path.stem}.{target_format}"
@@ -695,9 +655,9 @@ class DataProcessor(FeatureSet):
         uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
         if uses_robust:
             logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
-            data = self.
+            data = self.load_dataframe_from_path(path_str)
         else:
-            return self.
+            return self.fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
@@ -705,22 +665,22 @@ class DataProcessor(FeatureSet):
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
-            self.
+            self.process_numeric_feature_fit(data[name], config)
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
-            self.
+            self.process_sparse_feature_fit(data[name], config)
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
-            self.
+            self.process_sequence_feature_fit(data[name], config)
         for name, config in self.target_features.items():
            if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
-            self.
+            self.process_target_fit(data[name], config)
         self.is_fitted = True
         return self
 
@@ -736,14 +696,8 @@ class DataProcessor(FeatureSet):
         if isinstance(data, (str, os.PathLike)):
             if return_dict:
                 raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
-            return self.
-        return self.
-            data=data,
-            return_dict=return_dict,
-            persist=output_path is not None,
-            save_format=save_format,
-            output_path=output_path,
-        )
+            return self.transform_path(str(data), output_path, save_format)
+        return self.transform_in_memory(data=data, return_dict=return_dict, persist=output_path is not None, save_format=save_format, output_path=output_path)
 
     def fit_transform(
         self,
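The removed docstring carried the only usage example for DataProcessor. For reference, here it is again as a runnable sketch: the toy DataFrame and the '|'-separated history column are illustrative assumptions, while the method calls are copied from the removed docstring and are assumed to be unchanged in 0.3.5 (this diff only renames internal process_* helpers and moves file utilities):

```python
import pandas as pd
from nextrec.data.preprocessor import DataProcessor

# Toy data standing in for the train_df used in the removed docstring example.
train_df = pd.DataFrame({
    "age": [23, 35, 41, 29],
    "user_id": ["u1", "u2", "u3", "u1"],
    "item_history": ["i1|i2", "i2|i3|i4", "i1", "i4|i1"],
    "label": [0, 1, 1, 0],
})

processor = DataProcessor()
processor.add_numeric_feature('age', scaler='standard')
processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
processor.add_target('label', target_type='binary')

processor.fit(train_df)
processed = processor.transform(train_df)   # dict of numpy arrays
vocab_sizes = processor.get_vocab_sizes()   # vocabulary sizes for embedding layers

processor.save('processor.pkl')
loaded = DataProcessor.load('processor.pkl')
```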
nextrec/models/multi_task/poso.py
CHANGED
@@ -46,7 +46,7 @@ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
 from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
 from nextrec.basic.activation import activation_layer
 from nextrec.basic.model import BaseModel
-from nextrec.utils.
+from nextrec.utils.model import merge_features
 
 
 class POSOGate(nn.Module):