nextrec 0.1.11__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/activation.py +1 -2
  3. nextrec/basic/callback.py +1 -2
  4. nextrec/basic/features.py +39 -8
  5. nextrec/basic/layers.py +3 -4
  6. nextrec/basic/loggers.py +15 -10
  7. nextrec/basic/metrics.py +1 -2
  8. nextrec/basic/model.py +160 -125
  9. nextrec/basic/session.py +150 -0
  10. nextrec/data/__init__.py +13 -2
  11. nextrec/data/data_utils.py +74 -22
  12. nextrec/data/dataloader.py +513 -0
  13. nextrec/data/preprocessor.py +494 -134
  14. nextrec/loss/__init__.py +31 -24
  15. nextrec/loss/listwise.py +164 -0
  16. nextrec/loss/loss_utils.py +133 -106
  17. nextrec/loss/pairwise.py +105 -0
  18. nextrec/loss/pointwise.py +198 -0
  19. nextrec/models/match/dssm.py +26 -17
  20. nextrec/models/match/dssm_v2.py +20 -2
  21. nextrec/models/match/mind.py +18 -3
  22. nextrec/models/match/sdm.py +17 -2
  23. nextrec/models/match/youtube_dnn.py +23 -10
  24. nextrec/models/multi_task/esmm.py +8 -8
  25. nextrec/models/multi_task/mmoe.py +8 -8
  26. nextrec/models/multi_task/ple.py +8 -8
  27. nextrec/models/multi_task/share_bottom.py +8 -8
  28. nextrec/models/ranking/__init__.py +8 -0
  29. nextrec/models/ranking/afm.py +5 -4
  30. nextrec/models/ranking/autoint.py +6 -4
  31. nextrec/models/ranking/dcn.py +6 -4
  32. nextrec/models/ranking/deepfm.py +5 -4
  33. nextrec/models/ranking/dien.py +6 -4
  34. nextrec/models/ranking/din.py +6 -4
  35. nextrec/models/ranking/fibinet.py +6 -4
  36. nextrec/models/ranking/fm.py +6 -4
  37. nextrec/models/ranking/masknet.py +6 -4
  38. nextrec/models/ranking/pnn.py +6 -4
  39. nextrec/models/ranking/widedeep.py +6 -4
  40. nextrec/models/ranking/xdeepfm.py +6 -4
  41. nextrec/utils/__init__.py +7 -11
  42. nextrec/utils/embedding.py +2 -4
  43. nextrec/utils/initializer.py +4 -5
  44. nextrec/utils/optimizer.py +7 -8
  45. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/METADATA +3 -3
  46. nextrec-0.2.2.dist-info/RECORD +53 -0
  47. nextrec/basic/dataloader.py +0 -447
  48. nextrec/loss/match_losses.py +0 -294
  49. nextrec/utils/common.py +0 -14
  50. nextrec-0.1.11.dist-info/RECORD +0 -51
  51. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/WHEEL +0 -0
  52. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/licenses/LICENSE +0 -0
nextrec/data/dataloader.py (new file)
@@ -0,0 +1,513 @@
+ """
+ Dataloader definitions
+
+ Date: create on 27/10/2025
+ Author: Yang Zhou,zyaztec@gmail.com
+ """
+ import os
+ import tqdm
+ import torch
+ import logging
+ import numpy as np
+ import pandas as pd
+ import pyarrow.parquet as pq
+
+ from pathlib import Path
+ from typing import Iterator, Literal, Union, Optional
+
+ from torch.utils.data import DataLoader, TensorDataset, IterableDataset
+ from nextrec.data.preprocessor import DataProcessor
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureConfig
+
+ from nextrec.basic.loggers import colorize
+ from nextrec.data import (
+     get_column_data,
+     collate_fn,
+     resolve_file_paths,
+     read_table,
+ )
+
+
+ class FileDataset(FeatureConfig, IterableDataset):
+     """
+     Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
+
+     :param file_paths: Absolute or relative paths to CSV/Parquet files.
+     :param dense_features: Dense feature definitions (float tensors).
+     :param sparse_features: Sparse/categorical feature definitions (int tensors).
+     :param sequence_features: Sequence feature definitions (padded int tensors).
+     :param target_columns: Label/target column names.
+     :param id_columns: Optional ID columns appended after targets.
+     :param chunk_size: Number of rows to read per chunk.
+     :param file_type: ``"csv"`` or ``"parquet"``.
+     :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for online transform.
+
+     Yields
+     ------
+     tuple
+         Tensors ordered as ``dense + sparse + sequence + targets (+ ids)``. Shape respects chunk size.
+     """
+
+     def __init__(self,
+                  file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+                  dense_features: list[DenseFeature],  # dense feature definitions
+                  sparse_features: list[SparseFeature],  # sparse feature definitions
+                  sequence_features: list[SequenceFeature],  # sequence feature definitions
+                  target_columns: list[str],  # target column names
+                  id_columns: list[str] | None = None,  # id columns to carry through (not used for model inputs)
+                  chunk_size: int = 10000,
+                  file_type: str = 'csv',
+                  processor: DataProcessor | None = None):  # optional DataProcessor for transformation
+         """
+         Initialize a streaming dataset backed by on-disk files.
+         """
+
+         self.file_paths = file_paths
+         self.chunk_size = chunk_size
+         self.file_type = file_type
+         self.processor = processor
+
+         self._set_feature_config(dense_features, sparse_features, sequence_features)
+         self._set_target_config(target_columns, id_columns or [])
+         self.current_file_index = 0
+         self.total_files = len(file_paths)
+
+     def __iter__(self) -> Iterator[tuple]:
+         """
+         Iterate over files and stream tensor tuples chunk by chunk.
+
+         Files are processed sequentially; each chunk is transformed (optionally via
+         ``processor``) and converted to tensors before being yielded to PyTorch ``DataLoader``.
+         """
+         self.current_file_index = 0
+         self._file_pbar = None
+
+         # Create progress bar for file processing when multiple files
+         if self.total_files > 1:
+             self._file_pbar = tqdm.tqdm(
+                 total=self.total_files,
+                 desc="Files",
+                 unit="file",
+                 position=0,
+                 leave=True,
+                 bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+             )
+
+         for file_path in self.file_paths:
+             self.current_file_index += 1
+
+             if self._file_pbar is not None:
+                 self._file_pbar.update(1)
+             elif self.total_files == 1:
+                 file_name = os.path.basename(file_path)
+                 logging.info(colorize(f"Processing file: {file_name}", color="cyan"))
+
+             if self.file_type == 'csv':
+                 yield from self._read_csv_chunks(file_path)
+             elif self.file_type == 'parquet':
+                 yield from self._read_parquet_chunks(file_path)
+
+         if self._file_pbar is not None:
+             self._file_pbar.close()
+
+     def _read_csv_chunks(self, file_path: str) -> Iterator[tuple]:
+         """
+         Stream a CSV file chunk by chunk.
+
+         :param file_path: Path to the CSV file.
+         :yields: Tensor tuples for each chunk.
+         """
+         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
+
+         for chunk in chunk_iterator:
+             tensors = self._dataframe_to_tensors(chunk)
+             if tensors:
+                 yield tensors
+
+     def _read_parquet_chunks(self, file_path: str) -> Iterator[tuple]:
+         """
+         Stream a Parquet file via ``pyarrow`` batch reading.
+
+         :param file_path: Path to the Parquet file.
+         :yields: Tensor tuples for each batch.
+         """
+
+         parquet_file = pq.ParquetFile(file_path)
+         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
+             chunk = batch.to_pandas()
+             tensors = self._dataframe_to_tensors(chunk)
+             if tensors:
+                 yield tensors
+             del chunk
+
+     def _dataframe_to_tensors(self, df: pd.DataFrame) -> tuple | None:
+         """
+         Convert a DataFrame chunk into a tuple of tensors respecting feature order.
+
+         :param df: DataFrame chunk.
+         :returns: Tuple of tensors (features + targets + ids) or ``None`` if no tensors created.
+         """
+         if self.processor is not None:
+             if not self.processor.is_fitted:
+                 raise ValueError("DataProcessor must be fitted before using in streaming mode")
+             transformed_data = self.processor.transform(df, return_dict=True)
+         else:
+             transformed_data = df
+
+         return build_tensors_from_data(
+             data=transformed_data,
+             raw_data=df,
+             features=self.all_features,
+             target_columns=self.target_columns,
+             id_columns=self.id_columns,
+             on_missing_feature="raise",
+         )
+
+
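
A rough usage sketch for the class above (file names are illustrative, and dense_features / sparse_features / sequence_features are assumed to be already-built feature definition lists): FileDataset plugs into a plain PyTorch DataLoader with batch_size=1, because every yielded item is already a whole chunk of tensors.

    from torch.utils.data import DataLoader
    from nextrec.data import collate_fn
    from nextrec.data.dataloader import FileDataset

    dataset = FileDataset(
        file_paths=["part-000.parquet", "part-001.parquet"],  # hypothetical files
        dense_features=dense_features,
        sparse_features=sparse_features,
        sequence_features=sequence_features,
        target_columns=["label"],
        chunk_size=20000,
        file_type="parquet",
    )
    # batch_size=1 because each item is already a chunk; collate_fn unwraps it,
    # mirroring what RecDataLoader._load_files_streaming does below.
    dl = DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
    for batch in dl:
        ...  # tensors ordered dense + sparse + sequence + target
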
+ class RecDataLoader(FeatureConfig):
+     """
+     Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
+
+     :param dense_features: Dense feature definitions (float tensors).
+     :param sparse_features: Sparse/categorical feature definitions (int tensors).
+     :param sequence_features: Sequence feature definitions (padded int tensors).
+     :param target: Target column name(s); string or list.
+     :param id_columns: Optional ID column name(s) appended after targets.
+     :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for preprocessing.
+
+     Examples
+     --------
+     >>> loader = RecDataLoader(
+     ...     dense_features=dense_features,
+     ...     sparse_features=sparse_features,
+     ...     sequence_features=sequence_features,
+     ...     target=['label'],
+     ...     processor=processor,
+     ... )
+     >>> dataloader = loader.create_dataloader(
+     ...     data="/path/to/data.csv",
+     ...     batch_size=1024,
+     ...     load_full=False,
+     ...     chunk_size=20000,
+     ... )
+     """
+
+     def __init__(self,
+                  dense_features: list[DenseFeature] | None = None,
+                  sparse_features: list[SparseFeature] | None = None,
+                  sequence_features: list[SequenceFeature] | None = None,
+                  target: list[str] | None | str = None,
+                  id_columns: str | list[str] | None = None,
+                  processor: Optional['DataProcessor'] = None):
+         """
+         Initialize the loader with feature/target definitions.
+
+         :param dense_features: Dense feature definitions (float).
+         :param sparse_features: Sparse feature definitions (int).
+         :param sequence_features: Sequence feature definitions (int, padded).
+         :param target: Single target name or list of names.
+         :param id_columns: Optional ID columns to append in output.
+         :param processor: Optional fitted ``DataProcessor`` for preprocessing.
+         """
+
+         self.processor = processor
+         self._set_feature_config(dense_features, sparse_features, sequence_features)
+         self._set_target_config(target, id_columns)
+
+     def create_dataloader(self,
+                           data: Union[dict, pd.DataFrame, str, DataLoader],
+                           batch_size: int = 32,
+                           shuffle: bool = True,
+                           load_full: bool = True,
+                           chunk_size: int = 10000) -> DataLoader:
+         """
+         Build a ``DataLoader`` from in-memory data, file path, or an existing loader.
+
+         :param data: Dict/DataFrame (in-memory), path to CSV/Parquet file/dir, or an existing ``DataLoader``.
+         :param batch_size: Batch size for the returned ``DataLoader``.
+         :param shuffle: Shuffle flag passed to PyTorch ``DataLoader`` (for in-memory and streaming batches).
+         :param load_full: If ``True``, load all files into memory; if ``False``, stream with chunks.
+         :param chunk_size: Number of rows per chunk when ``load_full=False``.
+         :returns: A configured PyTorch ``DataLoader``.
+         """
+         if isinstance(data, DataLoader):
+             return data
+
+         if isinstance(data, (str, os.PathLike)):
+             return self._create_from_path(data, batch_size, shuffle, load_full, chunk_size)
+
+         if isinstance(data, (dict, pd.DataFrame)):
+             return self._create_from_memory(data, batch_size, shuffle)
+
+         raise ValueError(f"Unsupported data type: {type(data)}")
+
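
A minimal sketch of the three dispatch branches above, reusing the ``loader`` from the Examples block (file and directory names are illustrative):

    train_df = pd.read_csv("train.csv")                           # hypothetical file
    dl_mem = loader.create_dataloader(train_df, batch_size=512)   # dict/DataFrame -> in-memory TensorDataset
    dl_stream = loader.create_dataloader("data_dir/", load_full=False,
                                         chunk_size=8192)         # path -> streamed FileDataset
    dl_same = loader.create_dataloader(dl_mem)                    # an existing DataLoader is returned as-is
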
+     def _create_from_memory(self,
+                             data: Union[dict, pd.DataFrame],
+                             batch_size: int,
+                             shuffle: bool) -> DataLoader:
+         """
+         Convert in-memory data (dict/DataFrame) into tensors and wrap with ``DataLoader``.
+
+         :param data: Dict or DataFrame containing feature/target columns.
+         :param batch_size: Batch size.
+         :param shuffle: Whether to shuffle batches.
+         :returns: A ``DataLoader`` backed by ``TensorDataset``.
+         """
+
+         raw_data = data
+
+         if self.processor is not None:
+             assert self.processor.is_fitted, "DataProcessor must be fitted before using in RecDataLoader"
+             data = self.processor.transform(data, return_dict=True)
+
+         tensors = build_tensors_from_data(
+             data=data,
+             raw_data=raw_data,
+             features=self.all_features,
+             target_columns=self.target_columns,
+             id_columns=self.id_columns,
+             on_missing_feature="raise",
+         )
+
+         assert tensors is not None, "No tensors were created from provided data."
+
+         dataset = TensorDataset(*tensors)
+         return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+
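
In-memory dicts follow the same path; a sketch assuming ``loader`` was configured with a dense ``price`` feature and a sparse ``item_id`` feature (hypothetical names) and that any attached processor is already fitted:

    data = {
        "price": np.array([0.2, 0.5, 0.1, 0.9]),   # dense -> float32 tensor
        "item_id": np.array([3, 7, 7, 1]),         # sparse -> int64 tensor
        "label": np.array([1, 0, 0, 1]),
    }
    dl = loader.create_dataloader(data, batch_size=2, shuffle=False)
    price_b, item_b, label_b = next(iter(dl))      # feature order (dense, sparse, sequence), label last
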
+     def _create_from_path(self,
+                           path: str,
+                           batch_size: int,
+                           shuffle: bool,
+                           load_full: bool,
+                           chunk_size: int) -> DataLoader:
+         """
+         Build a ``DataLoader`` from a CSV/Parquet file or directory.
+
+         :param path: File path or directory containing homogeneous CSV/Parquet files.
+         :param batch_size: Batch size.
+         :param shuffle: Shuffle flag.
+         :param load_full: If ``True``, load all rows into memory; otherwise stream.
+         :param chunk_size: Chunk rows when streaming.
+         :returns: A ``DataLoader`` (in-memory or streaming).
+         """
+
+         file_paths, file_type = resolve_file_paths(str(Path(path)))
+
+         # Load full data into memory
+         if load_full:
+             dfs = []
+             total_bytes = 0
+             for file_path in file_paths:
+                 try:
+                     total_bytes += os.path.getsize(file_path)
+                 except OSError:
+                     pass
+                 try:
+                     df = read_table(file_path, file_type)
+                     dfs.append(df)
+                 except MemoryError as exc:
+                     raise MemoryError(
+                         f"Out of memory while reading {file_path}. "
+                         f"Consider using load_full=False with streaming."
+                     ) from exc
+
+             try:
+                 combined_df = pd.concat(dfs, ignore_index=True)
+             except MemoryError as exc:
+                 raise MemoryError(
+                     f"Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). "
+                     f"Use load_full=False to stream or reduce chunk_size."
+                 ) from exc
+
+             return self._create_from_memory(combined_df, batch_size, shuffle)
+         else:
+             return self._load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
+
+     def _load_files_streaming(self,
+                               file_paths: list[str],
+                               file_type: str,
+                               batch_size: int,
+                               chunk_size: int,
+                               shuffle: bool) -> DataLoader:
+         """
+         Create a streaming ``DataLoader`` that yields chunked tensors from files.
+
+         :param file_paths: Ordered list of file paths.
+         :param file_type: ``"csv"`` or ``"parquet"``.
+         :param batch_size: Batch size for the outer ``DataLoader``.
+         :param chunk_size: Number of rows per chunk when reading files.
+         :returns: Streaming ``DataLoader`` with custom ``collate_fn``.
+         """
+
+         if shuffle:
+             logging.warning(colorize("Shuffle is ignored in streaming mode (IterableDataset).", "yellow"))
+
+         if batch_size != 1:
+             logging.warning(colorize(
+                 "Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.",
+                 "yellow",
+             ))
+         effective_batch_size = 1
+
+         dataset = FileDataset(
+             file_paths=file_paths,
+             dense_features=self.dense_features,
+             sparse_features=self.sparse_features,
+             sequence_features=self.sequence_features,
+             target_columns=self.target_columns,
+             id_columns=self.id_columns,
+             chunk_size=chunk_size,
+             file_type=file_type,
+             processor=self.processor
+         )
+
+         return DataLoader(dataset, batch_size=effective_batch_size, collate_fn=collate_fn)
+
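
Because streaming mode pins the outer batch_size to 1, the number of rows per yielded batch is controlled entirely by chunk_size; a sketch (directory name illustrative):

    stream_dl = loader.create_dataloader(
        data="parquet_dir/",
        load_full=False,
        chunk_size=4096,   # each yielded batch holds up to 4096 rows
    )
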
+ def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+     """
+     Normalize a raw sequence column into a padded int64 ``ndarray``.
+
+     :param column: Sequence column from DataFrame/dict; can be Series, list, or ndarray.
+     :param feature: Sequence feature definition providing ``max_len`` and optional ``padding_idx``.
+     :returns: 2-D numpy array (batch, seq_len) with dtype ``int64``.
+     """
+     if isinstance(column, pd.Series):
+         column = column.tolist()
+
+     if isinstance(column, (list, tuple)):
+         column = np.array(column, dtype=object)
+
+     if not isinstance(column, np.ndarray):
+         column = np.array([column], dtype=object)
+
+     if column.ndim == 0:
+         column = column.reshape(1)
+
+     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
+         raise TypeError(
+             f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
+         )
+
+     if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
+         sequences = []
+         for seq in column:
+             if isinstance(seq, str):
+                 raise TypeError(
+                     f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                 )
+             if isinstance(seq, (list, tuple, np.ndarray)):
+                 arr = np.asarray(seq, dtype=np.int64)
+             else:
+                 arr = np.asarray([seq], dtype=np.int64)
+             sequences.append(arr)
+
+         max_len = getattr(feature, "max_len", 0)
+         if max_len <= 0:
+             max_len = max((len(seq) for seq in sequences), default=1)
+
+         pad_value = getattr(feature, "padding_idx", 0)
+         padded = []
+         for seq in sequences:
+             if len(seq) > max_len:
+                 padded.append(seq[:max_len])
+             else:
+                 padded.append(np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value))
+         column = np.stack(padded)
+     elif column.ndim == 1:
+         column = column.reshape(-1, 1)
+
+     return np.asarray(column, dtype=np.int64)
+
+
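
A worked example of the padding/truncation behaviour above, using a stand-in object for the feature definition (only the attributes the helper actually reads):

    from types import SimpleNamespace
    from nextrec.data.dataloader import _normalize_sequence_column

    feat = SimpleNamespace(name="hist_items", max_len=3, padding_idx=0)
    out = _normalize_sequence_column([[1, 2], [3, 4, 5, 6]], feat)
    # out == [[1, 2, 0],
    #         [3, 4, 5]]   (int64): short rows padded with padding_idx, long rows truncated to max_len
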
+ def build_tensors_from_data(  # noqa: C901
+     data: dict | pd.DataFrame,
+     raw_data: dict | pd.DataFrame,
+     features: list,
+     target_columns: list[str],
+     id_columns: list[str],
+     on_missing_feature: str = "raise",
+ ) -> tuple | None:
+     """
+     Shared routine to convert structured data into a tuple of tensors.
+
+     :param data: Preprocessed data (dict or DataFrame) used to fetch model inputs/labels.
+     :param raw_data: Original data, used for untouched ID columns.
+     :param features: Ordered list of feature definitions.
+     :param target_columns: Target/label column names.
+     :param id_columns: Extra ID column names to append at the end of the tensor tuple.
+     :param on_missing_feature: ``"warn"`` to skip missing feature with warning, ``"raise"`` to error.
+     :returns: Tuple of tensors following the order of ``features`` + targets (+ ids) or ``None`` if empty.
+     """
+     tensors: list[torch.Tensor] = []
+
+     for feature in features:
+         column = get_column_data(data, feature.name)
+         if column is None:
+             if on_missing_feature == "warn":
+                 logging.warning(colorize(f"Feature column '{feature.name}' not found in data", "yellow"))
+                 continue
+             raise AssertionError(f"Feature column {feature.name} not found in data.")
+
+         if isinstance(feature, SequenceFeature):
+             tensor = torch.from_numpy(_normalize_sequence_column(column, feature))
+         elif isinstance(feature, DenseFeature):
+             tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+         else:
+             tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
+
+         tensors.append(tensor)
+
+     label_tensors = []
+     if target_columns:
+         for target_name in target_columns:
+             column = get_column_data(data, target_name)
+             assert column is not None, f"Target column '{target_name}' not found in data."
+
+             label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+
+             if label_tensor.dim() == 1:
+                 label_tensor = label_tensor.view(-1, 1)
+             elif label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+                 label_tensor = label_tensor.t()
+
+             label_tensors.append(label_tensor)
+
+     if label_tensors:
+         if len(label_tensors) == 1 and label_tensors[0].shape[1] > 1:
+             y_tensor = label_tensors[0]
+         else:
+             y_tensor = torch.cat(label_tensors, dim=1)
+
+         if y_tensor.shape[1] == 1:
+             y_tensor = y_tensor.squeeze(1)
+
+         tensors.append(y_tensor)
+
+     if id_columns:
+         id_arrays = []
+         for id_col in id_columns:
+             column = get_column_data(raw_data, id_col)
+             if column is None:
+                 column = get_column_data(data, id_col)
+             if column is None:
+                 raise KeyError(f"ID column '{id_col}' not found in provided data.")
+             try:
+                 id_arr = np.asarray(column, dtype=np.int64)
+             except Exception as exc:
+                 raise TypeError(
+                     f"ID column '{id_col}' must contain numeric values. "
+                     f"Received dtype={np.asarray(column).dtype}, error: {exc}"
+                 ) from exc
+             id_arrays.append(id_arr)
+
+         combined_ids = np.column_stack(id_arrays)
+         tensors.append(torch.from_numpy(combined_ids))
+
+     if not tensors:
+         return None
+
+     return tuple(tensors)
+
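
To summarise the ordering contract implemented above, a hedged sketch (``features``, ``batch_df``, ``raw_df``, and the column names are assumed to exist):

    tensors = build_tensors_from_data(
        data=batch_df,              # preprocessed DataFrame or dict
        raw_data=raw_df,            # original data, consulted only for id columns
        features=features,          # ordered feature definitions
        target_columns=["label"],
        id_columns=["request_id"],
    )
    # tensors = (one tensor per feature: float32 dense, int64 sparse, padded int64 sequences,
    #            then a float32 label tensor (1-D for a single target),
    #            then an int64 (N, 1) tensor carrying request_id)
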
+
+ # Backward compatible alias
+ _build_tensors_from_data = build_tensors_from_data