nextrec 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,268 +1,35 @@
-"""Data processing utilities for NextRec."""
-
-import torch
-import numpy as np
-import pandas as pd
-import pyarrow.parquet as pq
-from pathlib import Path
-from typing import Any, Mapping, Sequence
-
-def stack_section(batch: list[dict], section: str):
-    """Stack one section of the batch (features/labels/ids)."""
-    entries = [item.get(section) for item in batch if item.get(section) is not None]
-    if not entries:
-        return None
-    merged: dict = {}
-    for name in entries[0]: # type: ignore
-        tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
-        merged[name] = torch.stack(tensors, dim=0)
-    return merged
-
-def collate_fn(batch):
-    """
-    Collate a list of sample dicts into the unified batch format:
-    {
-        "features": {name: Tensor(B, ...)},
-        "labels": {target: Tensor(B, ...)} or None,
-        "ids": {id_name: Tensor(B, ...)} or None,
-    }
-    """
-    if not batch:
-        return {"features": {}, "labels": None, "ids": None}
-
-    first = batch[0]
-    if isinstance(first, dict) and "features" in first:
-        # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
-        if first.get("_already_batched") and len(batch) == 1:
-            return {
-                "features": first.get("features", {}),
-                "labels": first.get("labels"),
-                "ids": first.get("ids"),
-            }
-        return {
-            "features": stack_section(batch, "features") or {},
-            "labels": stack_section(batch, "labels"),
-            "ids": stack_section(batch, "ids"),
-        }
-
-    # Fallback: stack tuples/lists of tensors
-    num_tensors = len(first)
-    result = []
-    for i in range(num_tensors):
-        tensor_list = [item[i] for item in batch]
-        first_item = tensor_list[0]
-        if isinstance(first_item, torch.Tensor):
-            stacked = torch.cat(tensor_list, dim=0)
-        elif isinstance(first_item, np.ndarray):
-            stacked = np.concatenate(tensor_list, axis=0)
-        elif isinstance(first_item, list):
-            combined = []
-            for entry in tensor_list:
-                combined.extend(entry)
-            stacked = combined
-        else:
-            stacked = tensor_list
-        result.append(stacked)
-    return tuple(result)
-
-def get_column_data(data: dict | pd.DataFrame, name: str):
-    """Extract column data from various data structures."""
-    if isinstance(data, dict):
-        return data[name] if name in data else None
-    elif isinstance(data, pd.DataFrame):
-        if name not in data.columns:
-            return None
-        return data[name].values
-    else:
-        if hasattr(data, name):
-            return getattr(data, name)
-        raise KeyError(f"Unsupported data type for extracting column {name}")
-
-def resolve_file_paths(path: str) -> tuple[list[str], str]:
-    """Resolve file or directory path into a sorted list of files and file type."""
-    path_obj = Path(path)
-
-    if path_obj.is_file():
-        file_type = path_obj.suffix.lower().lstrip(".")
-        assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
-        return [str(path_obj)], file_type
-
-    if path_obj.is_dir():
-        collected_files = [p for p in path_obj.iterdir() if p.is_file()]
-        csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
-        parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
-
-        if csv_files and parquet_files:
-            raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
-        file_paths = csv_files if csv_files else parquet_files
-        if not file_paths:
-            raise ValueError(f"No CSV or Parquet files found in directory: {path}")
-        file_paths.sort()
-        file_type = "csv" if csv_files else "parquet"
-        return file_paths, file_type
-
-    raise ValueError(f"Invalid path: {path}")
-
-def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
-    """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
-    if file_type == "csv":
-        yield from pd.read_csv(file_path, chunksize=chunk_size)
-        return
-    parquet_file = pq.ParquetFile(file_path)
-    for batch in parquet_file.iter_batches(batch_size=chunk_size):
-        yield batch.to_pandas()
-
-def read_table(file_path: str, file_type: str) -> pd.DataFrame:
-    """Read a single CSV/Parquet file."""
-    if file_type == "csv":
-        return pd.read_csv(file_path)
-    return pd.read_parquet(file_path)
-
-def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
-    """Load multiple files of the same type into DataFrames."""
-    return [read_table(fp, file_type) for fp in file_paths]
-
-def default_output_dir(path: str) -> Path:
-    """Generate a default output directory path based on the input path."""
-    path_obj = Path(path)
-    if path_obj.is_file():
-        return path_obj.parent / f"{path_obj.stem}_preprocessed"
-    return path_obj.with_name(f"{path_obj.name}_preprocessed")
-
-def split_dict_random(data_dict: dict, test_size: float = 0.2, random_state: int | None = None):
-    """Randomly split a dictionary of data into training and testing sets."""
-    lengths = [len(v) for v in data_dict.values()]
-    if len(set(lengths)) != 1:
-        raise ValueError(f"Length mismatch: {lengths}")
-    n = lengths[0]
-    rng = np.random.default_rng(random_state)
-    perm = rng.permutation(n)
-    cut = int(round(n * (1 - test_size)))
-    train_idx, test_idx = perm[:cut], perm[cut:]
-    def take(v, idx):
-        if isinstance(v, np.ndarray):
-            return v[idx]
-        elif isinstance(v, pd.Series):
-            return v.iloc[idx].to_numpy()
-        else:
-            v_arr = np.asarray(v, dtype=object)
-            return v_arr[idx]
-    train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
-    test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
-    return train_dict, test_dict
-
-def build_eval_candidates(
-    df_all: pd.DataFrame,
-    user_col: str,
-    item_col: str,
-    label_col: str,
-    user_features: pd.DataFrame,
-    item_features: pd.DataFrame,
-    num_pos_per_user: int = 5,
-    num_neg_per_pos: int = 50,
-    random_seed: int = 2025,
-) -> pd.DataFrame:
-    """Build evaluation candidates with positive and negative samples for each user."""
-    rng = np.random.default_rng(random_seed)
-
-    users = df_all[user_col].unique()
-    all_items = item_features[item_col].unique()
-    rows = []
-    user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}
-    for u in users:
-        df_user = df_all[df_all[user_col] == u]
-        pos_items = df_user[df_user[label_col] == 1][item_col].unique()
-        if len(pos_items) == 0:
-            continue
-        pos_items = pos_items[:num_pos_per_user]
-        seen_items = set(user_hist_items[u])
-        neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
-        if len(neg_pool) == 0:
-            continue
-        for pos in pos_items:
-            if len(neg_pool) <= num_neg_per_pos:
-                neg_items = neg_pool
-            else:
-                neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
-            rows.append((u, pos, 1))
-            for ni in neg_items:
-                rows.append((u, ni, 0))
-    eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
-    eval_df = eval_df.merge(user_features, on=user_col, how='left')
-    eval_df = eval_df.merge(item_features, on=item_col, how='left')
-    return eval_df
-
-def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
-    """Standardize a dataloader batch into a dict of features, labels, and ids."""
-    if not (isinstance(batch_data, Mapping) and "features" in batch_data):
-        raise TypeError(
-            "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
-        )
-    return {
-        "features": batch_data.get("features", {}),
-        "labels": batch_data.get("labels"),
-        "ids": batch_data.get("ids") if include_ids else None,
-    }
-
-
-# def get_user_ids(
-#     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
-# ) -> np.ndarray | None:
-#     """Extract user IDs from a dataset dict or DataFrame."""
-#     if data is None:
-#         return None
-#     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
-#         return np.asarray(data[user_id_column].values)
-#     if isinstance(data, dict) and user_id_column in data:
-#         return np.asarray(data[user_id_column])
-#     return None
-
-
-# def get_user_ids_from_batch(
-#     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
-# ) -> np.ndarray | None:
-#     """Extract the prioritized user id column from a batch dict."""
-#     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
-#     if not ids_container:
-#         return None
-
-#     batch_user_id = None
-#     if id_columns:
-#         for id_name in id_columns:
-#             if id_name in ids_container:
-#                 batch_user_id = ids_container[id_name]
-#                 break
-#     if batch_user_id is None:
-#         batch_user_id = next(iter(ids_container.values()), None)
-#     if batch_user_id is None:
-#         return None
-
-#     if isinstance(batch_user_id, torch.Tensor):
-#         ids_np = batch_user_id.detach().cpu().numpy()
-#     else:
-#         ids_np = np.asarray(batch_user_id)
-#     if ids_np.ndim == 0:
-#         ids_np = ids_np.reshape(1)
-#     return ids_np.reshape(ids_np.shape[0])
-
-
-def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
-    id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
-    if not id_columns:
-        return None
-
-    main_id = id_columns[0]
-    if isinstance(data, pd.DataFrame) and main_id in data.columns:
-        arr = np.asarray(data[main_id].values)
-        return arr.reshape(arr.shape[0])
-    if isinstance(data, dict):
-        ids_container = data.get("ids")
-        if isinstance(ids_container, dict) and main_id in ids_container:
-            val = ids_container[main_id]
-            val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
-            return val.reshape(val.shape[0])
-        if main_id in data:
-            arr = np.asarray(data[main_id])
-            return arr.reshape(arr.shape[0])
-
-    return None
+"""
+Data processing utilities for NextRec (Refactored)
+
+This module now re-exports functions from specialized submodules:
+- batch_utils: collate_fn, batch_to_dict
+- data_processing: get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+- nextrec.utils.file_utils: resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+Date: create on 27/10/2025
+Last update: 03/12/2025 (refactored)
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+# Import from new organized modules
+from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
+from nextrec.data.data_processing import get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+__all__ = [
+    # Batch utilities
+    'collate_fn',
+    'batch_to_dict',
+    'stack_section',
+    # Data processing
+    'get_column_data',
+    'split_dict_random',
+    'build_eval_candidates',
+    'get_user_ids',
+    # File utilities
+    'resolve_file_paths',
+    'iter_file_chunks',
+    'read_table',
+    'load_dataframes',
+    'default_output_dir',
+]
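
Note on the hunk above: the monolithic utility module is replaced by a thin re-export layer, so existing imports keep working while new code can target the submodules directly. A minimal sketch of the batch contract, assuming collate_fn moved to nextrec.data.batch_utils with the behaviour shown in the removed implementation (the sample data below is hypothetical):

    import torch
    from nextrec.data.batch_utils import collate_fn  # new location per the imports above

    # Two sample-level dicts in the unified schema documented by the removed collate_fn.
    samples = [
        {"features": {"age": torch.tensor(0.3)}, "labels": {"label": torch.tensor(1.0)}, "ids": None},
        {"features": {"age": torch.tensor(0.7)}, "labels": {"label": torch.tensor(0.0)}, "ids": None},
    ]

    batch = collate_fn(samples)
    # batch["features"]["age"] and batch["labels"]["label"] are stacked to shape (2,); batch["ids"] stays None.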
@@ -20,8 +20,10 @@ from nextrec.data.preprocessor import DataProcessor
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet

 from nextrec.basic.loggers import colorize
-from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
-from nextrec.utils import to_tensor
+from nextrec.data.data_processing import get_column_data
+from nextrec.data.batch_utils import collate_fn
+from nextrec.utils.file import resolve_file_paths, read_table
+from nextrec.utils.tensor import to_tensor

 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -185,9 +187,9 @@ class RecDataLoader(FeatureSet):
             chunk_size: int,
             shuffle: bool) -> DataLoader:
        if shuffle:
-            logging.warning("[RecDataLoader Warning] Shuffle is ignored in streaming mode (IterableDataset).")
+            logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
        if batch_size != 1:
-            logging.warning("[RecDataLoader Warning] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
+            logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)

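Background for the softened messages above: in streaming mode each item yielded by the IterableDataset is already a full chunk, so batch_size=1 is not a limitation, and the collate_fn removed in the first hunk passes a single "_already_batched" item through without adding a batch dimension. A small sketch, assuming that behaviour moved unchanged to nextrec.data.batch_utils (chunk contents below are hypothetical):

    import torch
    from nextrec.data.batch_utils import collate_fn  # assumed new location of the moved helper

    chunk = {
        "_already_batched": True,
        "features": {"age": torch.rand(512)},                      # one streaming chunk is one pre-built batch
        "labels": {"label": torch.randint(0, 2, (512,)).float()},
        "ids": None,
    }

    out = collate_fn([chunk])                      # DataLoader(batch_size=1) hands collate_fn a one-element list
    assert out["features"]["age"].shape == (512,)  # no extra batch dimension is added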
@@ -16,48 +16,18 @@ import pandas as pd
 import tqdm
 from pathlib import Path
 from typing import Dict, Union, Optional, Literal, Any
-from sklearn.preprocessing import (
-    StandardScaler,
-    MinMaxScaler,
-    RobustScaler,
-    MaxAbsScaler,
-    LabelEncoder
-)
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
+

-from nextrec.basic.loggers import setup_logger, colorize
-from nextrec.data.data_utils import (
-    resolve_file_paths,
-    iter_file_chunks,
-    read_table,
-    load_dataframes,
-    default_output_dir,
-)
-from nextrec.basic.session import resolve_save_path
 from nextrec.basic.features import FeatureSet
+from nextrec.basic.loggers import colorize
+from nextrec.basic.session import resolve_save_path
+from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
 from nextrec.__version__ import __version__


 class DataProcessor(FeatureSet):
-    """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
-
-    Examples:
-        >>> processor = DataProcessor()
-        >>> processor.add_numeric_feature('age', scaler='standard')
-        >>> processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
-        >>> processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
-        >>> processor.add_target('label', target_type='binary')
-        >>>
-        >>> # Fit and transform data
-        >>> processor.fit(train_df)
-        >>> processed_data = processor.transform(test_df) # Returns dict of numpy arrays
-        >>>
-        >>> # Save and load processor
-        >>> processor.save('processor.pkl')
-        >>> loaded_processor = DataProcessor.load('processor.pkl')
-        >>>
-        >>> # Get vocabulary sizes for embedding layers
-        >>> vocab_sizes = processor.get_vocab_sizes()
-    """
     def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}
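
The hunk above also removes the class docstring that carried the only usage example. For reference, the workflow those deleted doctest lines showed is repeated below, with the import path taken from the dataloader hunk earlier; train_df and test_df are assumed to be pandas DataFrames containing the named columns, and the public method names are assumed unchanged by this release:

    from nextrec.data.preprocessor import DataProcessor

    processor = DataProcessor()
    processor.add_numeric_feature('age', scaler='standard')
    processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
    processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
    processor.add_target('label', target_type='binary')

    processor.fit(train_df)                        # train_df: assumed pd.DataFrame with the columns above
    processed = processor.transform(test_df)       # returns a dict of numpy arrays
    vocab_sizes = processor.get_vocab_sizes()      # vocabulary sizes for embedding layers

    processor.save('processor.pkl')
    loaded = DataProcessor.load('processor.pkl')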
@@ -132,10 +102,10 @@ class DataProcessor(FeatureSet):
         }
         self.set_target_id(list(self.target_features.keys()), [])

-    def _hash_string(self, s: str, hash_size: int) -> int:
+    def hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

-    def _process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
+    def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']
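
Side note on the rename above: hash_string is an MD5-based bucket index (visible in the context line), so unlike Python's built-in hash(), which is salted per process, it yields the same id across runs and workers. A standalone sketch of the same computation:

    import hashlib

    def hash_string(s: str, hash_size: int) -> int:
        # Same expression as the context line: MD5 digest reduced modulo the vocabulary size.
        return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

    print(hash_string("user_42", 10000))  # deterministic bucket in [0, 10000)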
@@ -164,7 +134,7 @@
         scaler.fit(values)
         self.scalers[name] = scaler

-    def _process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+    def process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         scaler_type = config['scaler']
@@ -184,7 +154,7 @@
         result = scaler.transform(values.reshape(-1, 1)).ravel()
         return result

-    def _process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
+    def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na'] # <UNK>
@@ -197,7 +167,7 @@
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+    def process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']
@@ -215,11 +185,11 @@
             return encoded.to_numpy()
         if encode_method == 'hash':
             hash_size = config['hash_size']
-            hash_fn = self._hash_string
+            hash_fn = self.hash_string
             return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)

-    def _process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
+    def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']
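
On the np.fromiter context line above: passing count lets NumPy preallocate the output array, so the whole column is hashed without building an intermediate Python list. A minimal standalone sketch of the same pattern (column values and hash size below are hypothetical):

    import hashlib
    import numpy as np
    import pandas as pd

    def hash_string(s: str, hash_size: int) -> int:
        return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

    sparse_series = pd.Series(["u1", "u2", "u1", "u9"])
    hash_size = 1000
    encoded = np.fromiter(
        (hash_string(v, hash_size) for v in sparse_series.to_numpy()),
        dtype=np.int64,
        count=sparse_series.size,  # known length, so the output array is preallocated
    )
    print(encoded.shape)  # (4,)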
@@ -252,7 +222,7 @@
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def _process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+    def process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']
@@ -276,7 +246,7 @@
             config['_class_to_idx'] = class_to_idx
         else:
             class_to_idx = None # type: ignore
-        hash_fn = self._hash_string
+        hash_fn = self.hash_string
         hash_size = config.get('hash_size')
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings
@@ -301,11 +271,7 @@
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
-                encoded = [
-                    hash_fn(str(token), hash_size)
-                    for token in tokens
-                    if str(token).strip()
-                ]
+                encoded = [hash_fn(str(token), hash_size) for token in tokens if str(token).strip()]
             else:
                 encoded = []
             if not encoded:
@@ -315,7 +281,7 @@
             output[i, : len(encoded)] = encoded
         return output

-    def _process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
+    def process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
         label_map = config.get('label_map')
@@ -334,7 +300,7 @@
         config['label_map'] = label_map
         self.target_encoders[name] = label_map

-    def _process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
+    def process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         target_type = config.get('target_type')
@@ -355,13 +321,13 @@
                 result.append(0)
         return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)

-    def _load_dataframe_from_path(self, path: str) -> pd.DataFrame:
+    def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
         """Load all data from a file or directory path into a single DataFrame."""
         file_paths, file_type = resolve_file_paths(path)
         frames = load_dataframes(file_paths, file_type)
         return pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]

-    def _extract_sequence_tokens(self, value: Any, separator: str) -> list[str]:
+    def extract_sequence_tokens(self, value: Any, separator: str) -> list[str]:
         """Extract sequence tokens from a single value."""
         if value is None:
             return []
@@ -374,7 +340,7 @@
             return [str(v) for v in value]
         return [str(value)]

-    def _fit_from_path(self, path: str, chunk_size: int) -> 'DataProcessor':
+    def fit_from_path(self, path: str, chunk_size: int) -> 'DataProcessor':
         """Fit processor statistics by streaming files to reduce memory usage."""
         logger = logging.getLogger()
         logger.info(colorize("Fitting DataProcessor (streaming path mode)...", color="cyan", bold=True))
@@ -433,7 +399,7 @@
                 series = chunk[name]
                 tokens = []
                 for val in series:
-                    tokens.extend(self._extract_sequence_tokens(val, separator))
+                    tokens.extend(self.extract_sequence_tokens(val, separator))
                 seq_vocab[name].update(tokens)

             # target features
@@ -548,7 +514,7 @@
         logger.info(colorize("DataProcessor fitted successfully (streaming path mode)", color="green", bold=True))
         return self

-    def _transform_in_memory(
+    def transform_in_memory(
         self,
         data: Union[pd.DataFrame, Dict[str, Any]],
         return_dict: bool,
@@ -581,7 +547,7 @@
                 continue
             # Convert to Series for processing
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self._process_numeric_feature_transform(series_data, config)
+            processed = self.process_numeric_feature_transform(series_data, config)
             result_dict[name] = processed

         # process sparse features
@@ -590,7 +556,7 @@
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self._process_sparse_feature_transform(series_data, config)
+            processed = self.process_sparse_feature_transform(series_data, config)
             result_dict[name] = processed

         # process sequence features
@@ -599,7 +565,7 @@
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self._process_sequence_feature_transform(series_data, config)
+            processed = self.process_sequence_feature_transform(series_data, config)
             result_dict[name] = processed

         # process target features
@@ -608,10 +574,10 @@
                 logger.warning(f"Target {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self._process_target_transform(series_data, config)
+            processed = self.process_target_transform(series_data, config)
             result_dict[name] = processed

-        def _dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
+        def dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
             # Convert all arrays to Series/lists at once to avoid fragmentation
             columns_dict = {}
             for key, value in result.items():
@@ -629,7 +595,7 @@
         effective_format = save_format or "parquet"
         result_df = None
         if (not return_dict) or persist:
-            result_df = _dict_to_dataframe(result_dict)
+            result_df = dict_to_dataframe(result_dict)
         if persist:
             if output_path is None:
                 raise ValueError("output_path must be provided when persisting transformed data.")
@@ -649,7 +615,7 @@
         assert result_df is not None, "DataFrame is None after transform"
         return result_df

-    def _transform_path(
+    def transform_path(
         self,
         input_path: str,
         output_path: Optional[str],
@@ -669,13 +635,7 @@
         saved_paths = []
         for file_path in tqdm.tqdm(file_paths, desc="Transforming files", unit="file"):
             df = read_table(file_path, file_type)
-            transformed_df = self._transform_in_memory(
-                df,
-                return_dict=False,
-                persist=False,
-                save_format=None,
-                output_path=None,
-            )
+            transformed_df = self.transform_in_memory(df, return_dict=False, persist=False, save_format=None, output_path=None)
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
             source_path = Path(file_path)
             target_file = output_root / f"{source_path.stem}.{target_format}"
@@ -695,9 +655,9 @@
             uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
             if uses_robust:
                 logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
-                data = self._load_dataframe_from_path(path_str)
+                data = self.load_dataframe_from_path(path_str)
             else:
-                return self._fit_from_path(path_str, chunk_size)
+                return self.fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))
@@ -705,22 +665,22 @@
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
-            self._process_numeric_feature_fit(data[name], config)
+            self.process_numeric_feature_fit(data[name], config)
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
-            self._process_sparse_feature_fit(data[name], config)
+            self.process_sparse_feature_fit(data[name], config)
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
-            self._process_sequence_feature_fit(data[name], config)
+            self.process_sequence_feature_fit(data[name], config)
         for name, config in self.target_features.items():
             if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
-            self._process_target_fit(data[name], config)
+            self.process_target_fit(data[name], config)
         self.is_fitted = True
         return self

@@ -736,14 +696,8 @@
         if isinstance(data, (str, os.PathLike)):
             if return_dict:
                 raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
-            return self._transform_path(str(data), output_path, save_format)
-        return self._transform_in_memory(
-            data=data,
-            return_dict=return_dict,
-            persist=output_path is not None,
-            save_format=save_format,
-            output_path=output_path,
-        )
+            return self.transform_path(str(data), output_path, save_format)
+        return self.transform_in_memory(data=data, return_dict=return_dict, persist=output_path is not None, save_format=save_format, output_path=output_path)

     def fit_transform(
         self,
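
The simplified dispatch above keeps two entry points: a path input is transformed file by file and written to disk, while an in-memory DataFrame or dict goes through transform_in_memory. A hedged usage sketch; the parameter names return_dict, output_path, and save_format are taken from the context lines, the exact public signature of transform is not shown in this diff, and the paths below are hypothetical:

    # Continuing from the DataProcessor sketch earlier in these notes.
    processed = processor.transform(test_df, return_dict=True)   # in-memory: dict of numpy arrays

    # Path mode streams files and writes them out; return_dict must be False,
    # otherwise the ValueError in the context lines above is raised.
    processor.transform("data/train_dir", return_dict=False,
                        output_path="data/train_dir_preprocessed", save_format="parquet")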
@@ -46,7 +46,7 @@ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
 from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
 from nextrec.basic.activation import activation_layer
 from nextrec.basic.model import BaseModel
-from nextrec.utils.common import merge_features
+from nextrec.utils.model import merge_features


 class POSOGate(nn.Module):