nextrec 0.3.1-py3-none-any.whl → 0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/features.py +10 -23
  3. nextrec/basic/layers.py +18 -61
  4. nextrec/basic/loggers.py +1 -1
  5. nextrec/basic/metrics.py +55 -33
  6. nextrec/basic/model.py +258 -394
  7. nextrec/data/__init__.py +2 -2
  8. nextrec/data/data_utils.py +80 -4
  9. nextrec/data/dataloader.py +36 -57
  10. nextrec/data/preprocessor.py +5 -4
  11. nextrec/models/generative/__init__.py +5 -0
  12. nextrec/models/generative/hstu.py +399 -0
  13. nextrec/models/match/dssm.py +2 -2
  14. nextrec/models/match/dssm_v2.py +2 -2
  15. nextrec/models/match/mind.py +2 -2
  16. nextrec/models/match/sdm.py +2 -2
  17. nextrec/models/match/youtube_dnn.py +2 -2
  18. nextrec/models/multi_task/esmm.py +1 -1
  19. nextrec/models/multi_task/mmoe.py +1 -1
  20. nextrec/models/multi_task/ple.py +1 -1
  21. nextrec/models/multi_task/poso.py +1 -1
  22. nextrec/models/multi_task/share_bottom.py +1 -1
  23. nextrec/models/ranking/afm.py +1 -1
  24. nextrec/models/ranking/autoint.py +1 -1
  25. nextrec/models/ranking/dcn.py +1 -1
  26. nextrec/models/ranking/deepfm.py +1 -1
  27. nextrec/models/ranking/dien.py +1 -1
  28. nextrec/models/ranking/din.py +1 -1
  29. nextrec/models/ranking/fibinet.py +1 -1
  30. nextrec/models/ranking/fm.py +1 -1
  31. nextrec/models/ranking/masknet.py +2 -2
  32. nextrec/models/ranking/pnn.py +1 -1
  33. nextrec/models/ranking/widedeep.py +1 -1
  34. nextrec/models/ranking/xdeepfm.py +1 -1
  35. nextrec/utils/__init__.py +2 -1
  36. nextrec/utils/common.py +21 -2
  37. nextrec/utils/optimizer.py +7 -3
  38. {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/METADATA +10 -4
  39. nextrec-0.3.3.dist-info/RECORD +57 -0
  40. nextrec-0.3.1.dist-info/RECORD +0 -56
  41. {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/WHEEL +0 -0
  42. {nextrec-0.3.1.dist-info → nextrec-0.3.3.dist-info}/licenses/LICENSE +0 -0
nextrec/data/__init__.py CHANGED
@@ -18,7 +18,7 @@ from nextrec.data.data_utils import (
      read_table,
      load_dataframes,
  )
- from nextrec.basic.features import FeatureSpecMixin
+ from nextrec.basic.features import FeatureSet
  from nextrec.data import data_utils
  from nextrec.data.dataloader import (
      TensorDictDataset,
@@ -38,7 +38,7 @@ __all__ = [
      'iter_file_chunks',
      'read_table',
      'load_dataframes',
-     'FeatureSpecMixin',
+     'FeatureSet',
      'data_utils',
      'TensorDictDataset',
      'FileDataset',
nextrec/data/data_utils.py CHANGED
@@ -5,8 +5,9 @@ import numpy as np
  import pandas as pd
  import pyarrow.parquet as pq
  from pathlib import Path
+ from typing import Any, Mapping, Sequence

- def _stack_section(batch: list[dict], section: str):
+ def stack_section(batch: list[dict], section: str):
      """Stack one section of the batch (features/labels/ids)."""
      entries = [item.get(section) for item in batch if item.get(section) is not None]
      if not entries:
@@ -39,9 +40,9 @@ def collate_fn(batch):
          "ids": first.get("ids"),
      }
      return {
-         "features": _stack_section(batch, "features") or {},
-         "labels": _stack_section(batch, "labels"),
-         "ids": _stack_section(batch, "ids"),
+         "features": stack_section(batch, "features") or {},
+         "labels": stack_section(batch, "labels"),
+         "ids": stack_section(batch, "ids"),
      }

      # Fallback: stack tuples/lists of tensors
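With the leading underscore dropped, stack_section becomes part of the module's public surface alongside collate_fn. A minimal sketch of the batch shape these helpers expect and produce; the feature name and values below are illustrative, not fixed by the package:

    import torch
    from nextrec.data import collate_fn

    # Two sample-level dicts, as TensorDictDataset.__getitem__ returns them.
    batch = [
        {"features": {"age": torch.tensor(0.3)}, "labels": torch.tensor(1.0), "ids": None},
        {"features": {"age": torch.tensor(0.7)}, "labels": torch.tensor(0.0), "ids": None},
    ]
    out = collate_fn(batch)
    # out keeps the unified schema {"features": {...}, "labels": ..., "ids": ...},
    # with each non-empty section stacked across the two samples by stack_section.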
@@ -190,3 +191,78 @@ def build_eval_candidates(
      eval_df = eval_df.merge(user_features, on=user_col, how='left')
      eval_df = eval_df.merge(item_features, on=item_col, how='left')
      return eval_df
+
+ def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
+     """Standardize a dataloader batch into a dict of features, labels, and ids."""
+     if not (isinstance(batch_data, Mapping) and "features" in batch_data):
+         raise TypeError(
+             "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
+         )
+     return {
+         "features": batch_data.get("features", {}),
+         "labels": batch_data.get("labels"),
+         "ids": batch_data.get("ids") if include_ids else None,
+     }
+
+
+ # def get_user_ids(
+ #     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
+ # ) -> np.ndarray | None:
+ #     """Extract user IDs from a dataset dict or DataFrame."""
+ #     if data is None:
+ #         return None
+ #     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
+ #         return np.asarray(data[user_id_column].values)
+ #     if isinstance(data, dict) and user_id_column in data:
+ #         return np.asarray(data[user_id_column])
+ #     return None
+
+
+ # def get_user_ids_from_batch(
+ #     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
+ # ) -> np.ndarray | None:
+ #     """Extract the prioritized user id column from a batch dict."""
+ #     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
+ #     if not ids_container:
+ #         return None
+
+ #     batch_user_id = None
+ #     if id_columns:
+ #         for id_name in id_columns:
+ #             if id_name in ids_container:
+ #                 batch_user_id = ids_container[id_name]
+ #                 break
+ #     if batch_user_id is None:
+ #         batch_user_id = next(iter(ids_container.values()), None)
+ #     if batch_user_id is None:
+ #         return None
+
+ #     if isinstance(batch_user_id, torch.Tensor):
+ #         ids_np = batch_user_id.detach().cpu().numpy()
+ #     else:
+ #         ids_np = np.asarray(batch_user_id)
+ #     if ids_np.ndim == 0:
+ #         ids_np = ids_np.reshape(1)
+ #     return ids_np.reshape(ids_np.shape[0])
+
+
+ def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
+     id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
+     if not id_columns:
+         return None
+
+     main_id = id_columns[0]
+     if isinstance(data, pd.DataFrame) and main_id in data.columns:
+         arr = np.asarray(data[main_id].values)
+         return arr.reshape(arr.shape[0])
+     if isinstance(data, dict):
+         ids_container = data.get("ids")
+         if isinstance(ids_container, dict) and main_id in ids_container:
+             val = ids_container[main_id]
+             val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
+             return val.reshape(val.shape[0])
+         if main_id in data:
+             arr = np.asarray(data[main_id])
+             return arr.reshape(arr.shape[0])
+
+     return None
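A quick sketch of how the two new helpers compose on a batch in the unified schema, assuming they are importable from nextrec.data.data_utils as the surrounding hunk suggests; the "user_id" and "age" names are illustrative:

    import torch
    from nextrec.data.data_utils import batch_to_dict, get_user_ids

    batch = {
        "features": {"age": torch.tensor([0.3, 0.7])},
        "labels": torch.tensor([1.0, 0.0]),
        "ids": {"user_id": torch.tensor([101, 102])},
    }
    std = batch_to_dict(batch)                      # validates the dict and normalizes its keys
    uids = get_user_ids(std, id_columns="user_id")  # -> array([101, 102]) pulled from std["ids"]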
nextrec/data/dataloader.py CHANGED
@@ -2,11 +2,10 @@
  Dataloader definitions

  Date: create on 27/10/2025
- Checkpoint: edit on 29/11/2025
+ Checkpoint: edit on 02/12/2025
  Author: Yang Zhou,zyaztec@gmail.com
  """
  import os
- import tqdm
  import torch
  import logging
  import numpy as np
@@ -18,15 +17,11 @@ from typing import cast

  from torch.utils.data import DataLoader, Dataset, IterableDataset
  from nextrec.data.preprocessor import DataProcessor
- from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet

  from nextrec.basic.loggers import colorize
- from nextrec.data import (
-     get_column_data,
-     collate_fn,
-     resolve_file_paths,
-     read_table,
- )
+ from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
+ from nextrec.utils import to_tensor

  class TensorDictDataset(Dataset):
      """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -52,7 +47,7 @@ class TensorDictDataset(Dataset):
          sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
          return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}

- class FileDataset(FeatureSpecMixin, IterableDataset):
+ class FileDataset(FeatureSet, IterableDataset):
      def __init__(self,
                   file_paths: list[str], # file paths to read, containing CSV or Parquet files
                   dense_features: list[DenseFeature], # dense feature definitions
@@ -67,44 +62,37 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
          self.chunk_size = chunk_size
          self.file_type = file_type
          self.processor = processor
-         self._set_feature_config(dense_features, sparse_features, sequence_features, target_columns, id_columns)
+         self.set_all_features(dense_features, sparse_features, sequence_features, target_columns, id_columns)
          self.current_file_index = 0
          self.total_files = len(file_paths)

      def __iter__(self):
          self.current_file_index = 0
-         self._file_pbar = None
-         if self.total_files > 1:
-             self._file_pbar = tqdm.tqdm(total=self.total_files, desc="Files", unit="file", position=0, leave=True, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
          for file_path in self.file_paths:
              self.current_file_index += 1
-             if self._file_pbar is not None:
-                 self._file_pbar.update(1)
-             elif self.total_files == 1:
+             if self.total_files == 1:
                  file_name = os.path.basename(file_path)
                  logging.info(f"Processing file: {file_name}")
              if self.file_type == 'csv':
-                 yield from self._read_csv_chunks(file_path)
+                 yield from self.read_csv_chunks(file_path)
              elif self.file_type == 'parquet':
-                 yield from self._read_parquet_chunks(file_path)
-         if self._file_pbar is not None:
-             self._file_pbar.close()
+                 yield from self.read_parquet_chunks(file_path)

-     def _read_csv_chunks(self, file_path: str):
+     def read_csv_chunks(self, file_path: str):
          chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
          for chunk in chunk_iterator:
-             tensors = self._dataframe_to_tensors(chunk)
+             tensors = self.dataframeto_tensors(chunk)
              yield tensors

-     def _read_parquet_chunks(self, file_path: str):
+     def read_parquet_chunks(self, file_path: str):
          parquet_file = pq.ParquetFile(file_path)
          for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
              chunk = batch.to_pandas()
-             tensors = self._dataframe_to_tensors(chunk)
+             tensors = self.dataframeto_tensors(chunk)
              yield tensors
              del chunk

-     def _dataframe_to_tensors(self, df: pd.DataFrame) -> dict | None:
+     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
          if self.processor is not None:
              if not self.processor.is_fitted:
                  raise ValueError("[DataLoader Error] DataProcessor must be fitted before using in streaming mode")
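FileDataset now mixes in FeatureSet (the renamed FeatureSpecMixin) and registers its columns through the public set_all_features call shown above. A stripped-down sketch of that pattern; the only attributes assumed to be populated by the mixin are the ones this diff reads elsewhere (dense_features, sparse_features, sequence_features, target_columns, id_columns):

    from torch.utils.data import IterableDataset
    from nextrec.basic.features import FeatureSet

    class MyStreamingDataset(FeatureSet, IterableDataset):
        def __init__(self, dense_features, sparse_features, sequence_features,
                     target_columns, id_columns):
            # Mirrors FileDataset.__init__: the mixin stores the feature spec so later
            # code can read self.dense_features, self.target_columns, and so on.
            self.set_all_features(dense_features, sparse_features, sequence_features,
                                  target_columns, id_columns)

        def __iter__(self):
            yield from ()  # a real implementation would stream file chunks here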
@@ -120,7 +108,7 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
          return batch


- class RecDataLoader(FeatureSpecMixin):
+ class RecDataLoader(FeatureSet):
      def __init__(self,
                   dense_features: list[DenseFeature] | None = None,
                   sparse_features: list[SparseFeature] | None = None,
@@ -129,7 +117,7 @@ class RecDataLoader(FeatureSpecMixin):
                   id_columns: str | list[str] | None = None,
                   processor: DataProcessor | None = None):
          self.processor = processor
-         self._set_feature_config(dense_features, sparse_features, sequence_features, target, id_columns)
+         self.set_all_features(dense_features, sparse_features, sequence_features, target, id_columns)

      def create_dataloader(self,
                            data: dict | pd.DataFrame | str | DataLoader,
@@ -140,13 +128,13 @@
          if isinstance(data, DataLoader):
              return data
          elif isinstance(data, (str, os.PathLike)):
-             return self._create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
+             return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
          elif isinstance(data, (dict, pd.DataFrame)):
-             return self._create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
+             return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
          else:
              raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")

-     def _create_from_memory(self,
+     def create_from_memory(self,
                              data: dict | pd.DataFrame,
                              batch_size: int,
                              shuffle: bool) -> DataLoader:
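With the underscore-prefixed helpers promoted to public methods (create_from_path, create_from_memory, load_files_streaming), the dispatch in create_dataloader is unchanged but each path is now directly callable. A hedged usage sketch: the feature lists and column names are placeholders assumed to be built elsewhere, and no default argument values are taken from this diff:

    # dense_feats / sparse_feats / seq_feats are assumed to be lists of
    # DenseFeature / SparseFeature / SequenceFeature defined elsewhere.
    loader_factory = RecDataLoader(
        dense_features=dense_feats,
        sparse_features=sparse_feats,
        sequence_features=seq_feats,
        target=["label"],
        id_columns="user_id",
    )
    # An in-memory DataFrame goes through create_from_memory:
    train_loader = loader_factory.create_dataloader(train_df, batch_size=512, shuffle=True)
    # A path goes through create_from_path; load_full=False streams via load_files_streaming,
    # which enforces batch_size=1 and uses chunk_size to bound memory per read.
    stream_loader = loader_factory.create_dataloader(
        "data/train/", batch_size=1, shuffle=False, load_full=False, chunk_size=20000,
    )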
@@ -162,7 +150,7 @@
          dataset = TensorDictDataset(tensors)
          return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

-     def _create_from_path(self,
+     def create_from_path(self,
                            path: str,
                            batch_size: int,
                            shuffle: bool,
@@ -179,7 +167,6 @@
              except OSError:
                  pass
              try:
-                 df = read_table(file_path, file_type)
                  dfs.append(df)
              except MemoryError as exc:
                  raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
@@ -187,11 +174,11 @@
              combined_df = pd.concat(dfs, ignore_index=True)
          except MemoryError as exc:
              raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-             return self._create_from_memory(combined_df, batch_size, shuffle,)
+             return self.create_from_memory(combined_df, batch_size, shuffle,)
          else:
-             return self._load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
+             return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)

-     def _load_files_streaming(self,
+     def load_files_streaming(self,
                                file_paths: list[str],
                                file_type: str,
                                batch_size: int,
@@ -201,20 +188,10 @@
              logging.warning("[RecDataLoader Warning] Shuffle is ignored in streaming mode (IterableDataset).")
          if batch_size != 1:
              logging.warning("[RecDataLoader Warning] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
-         dataset = FileDataset(
-             file_paths=file_paths,
-             dense_features=self.dense_features,
-             sparse_features=self.sparse_features,
-             sequence_features=self.sequence_features,
-             target_columns=self.target_columns,
-             id_columns=self.id_columns,
-             chunk_size=chunk_size,
-             file_type=file_type,
-             processor=self.processor
-         )
+         dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
          return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)

- def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
      if isinstance(column, pd.Series):
          column = column.tolist()
      if isinstance(column, (list, tuple)):
@@ -250,25 +227,27 @@
          column = column.reshape(-1, 1)
      return np.asarray(column, dtype=np.int64)

-
- def build_tensors_from_data( # noqa: C901
+ def build_tensors_from_data(
      data: dict | pd.DataFrame,
      raw_data: dict | pd.DataFrame,
      features: list,
      target_columns: list[str],
      id_columns: list[str]
  ) -> dict | None:
-     feature_tensors: dict[str, torch.Tensor] = {}
+     feature_tensors = {}
      for feature in features:
          column = get_column_data(data, feature.name)
          if column is None:
              raise ValueError(f"[RecDataLoader Error] Feature column '{feature.name}' not found in data")
-         if isinstance(feature, SequenceFeature):
-             tensor = torch.from_numpy(_normalize_sequence_column(column, feature))
+         if isinstance(feature, SequenceFeature): # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+             arr = normalize_sequence_column(column, feature)
+             tensor = to_tensor(arr, dtype=torch.long)
          elif isinstance(feature, DenseFeature):
-             tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+             arr = np.asarray(column, dtype=np.float32)
+             tensor = to_tensor(arr, dtype=torch.float32)
          else:
-             tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
+             arr = np.asarray(column, dtype=np.int64)
+             tensor = to_tensor(arr, dtype=torch.long)
          feature_tensors[feature.name] = tensor
      label_tensors = None
      if target_columns:
@@ -277,7 +256,7 @@ def build_tensors_from_data( # noqa: C901
              column = get_column_data(data, target_name)
              if column is None:
                  continue
-             label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+             label_tensor = to_tensor(np.asarray(column, dtype=np.float32), dtype=torch.float32)
              if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                  label_tensor = label_tensor.t()
              if label_tensor.shape[1:] == (1,):
@@ -298,7 +277,7 @@ def build_tensors_from_data( # noqa: C901
                  id_arr = np.asarray(column, dtype=np.int64)
              except Exception as exc:
                  raise TypeError( f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
-             id_tensors[id_col] = torch.from_numpy(id_arr)
+             id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
      if not feature_tensors:
          return None
      return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}
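build_tensors_from_data now routes every numpy array through nextrec.utils.to_tensor instead of torch.from_numpy, with explicit dtypes per feature kind. A small illustration of the conversion branches on placeholder columns (only the to_tensor call and its dtype arguments are taken from this diff):

    import numpy as np
    import torch
    from nextrec.utils import to_tensor

    dense_col = np.asarray([0.1, 0.2], dtype=np.float32)   # DenseFeature branch
    sparse_col = np.asarray([3, 7], dtype=np.int64)        # SparseFeature / id branch

    dense_t = to_tensor(dense_col, dtype=torch.float32)
    sparse_t = to_tensor(sparse_col, dtype=torch.long)
    # Labels follow the same route:
    # to_tensor(np.asarray(col, dtype=np.float32), dtype=torch.float32)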
nextrec/data/preprocessor.py CHANGED
@@ -2,6 +2,7 @@
  DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

  Date: create on 13/11/2025
+ Checkpoint: edit on 02/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
  from __future__ import annotations
@@ -32,11 +33,11 @@ from nextrec.data.data_utils import (
      default_output_dir,
  )
  from nextrec.basic.session import resolve_save_path
- from nextrec.basic.features import FeatureSpecMixin
+ from nextrec.basic.features import FeatureSet
  from nextrec.__version__ import __version__


- class DataProcessor(FeatureSpecMixin):
+ class DataProcessor(FeatureSet):
      """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

      Examples:
@@ -70,7 +71,7 @@ class DataProcessor(FeatureSpecMixin):
          self.scalers: Dict[str, Any] = {}
          self.label_encoders: Dict[str, LabelEncoder] = {}
          self.target_encoders: Dict[str, Dict[str, int]] = {}
-         self._set_target_id_config([], [])
+         self.set_target_id([], [])

      def add_numeric_feature(
          self,
@@ -129,7 +130,7 @@
              'target_type': target_type,
              'label_map': label_map
          }
-         self._set_target_id_config(list(self.target_features.keys()), [])
+         self.set_target_id(list(self.target_features.keys()), [])

      def _hash_string(self, s: str, hash_size: int) -> int:
          return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size
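The _hash_string helper shown above buckets arbitrary strings into a fixed-size id space with an md5-mod scheme. A standalone illustration of the same trick (the function name here is only for the example):

    import hashlib

    def hash_string(s: str, hash_size: int) -> int:
        # Deterministic across runs and processes; collisions are bounded by hash_size.
        return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

    bucket = hash_string("user_42", 1000)  # stable bucket id in [0, 1000)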
nextrec/models/generative/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .hstu import HSTU
+
+ __all__ = [
+     "HSTU",
+ ]
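Version 0.3.3 adds the generative sub-package; this new __init__.py only establishes the import path for HSTU, whose implementation lives in the new hstu.py (399 lines, not shown in this diff). An import-level check that makes no assumptions about the constructor:

    # The HSTU constructor is defined in nextrec/models/generative/hstu.py and is not
    # reproduced in this diff, so this snippet only verifies the public import path.
    from nextrec.models.generative import HSTU

    print(HSTU.__module__)  # expected: nextrec.models.generative.hstu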