nextrec-0.4.1-py3-none-any.whl → nextrec-0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +220 -106
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1082 -400
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +51 -45
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +272 -95
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +103 -38
  23. nextrec/models/match/dssm.py +82 -68
  24. nextrec/models/match/dssm_v2.py +72 -57
  25. nextrec/models/match/mind.py +175 -107
  26. nextrec/models/match/sdm.py +104 -87
  27. nextrec/models/match/youtube_dnn.py +73 -59
  28. nextrec/models/multi_task/esmm.py +53 -37
  29. nextrec/models/multi_task/mmoe.py +64 -45
  30. nextrec/models/multi_task/ple.py +101 -48
  31. nextrec/models/multi_task/poso.py +113 -36
  32. nextrec/models/multi_task/share_bottom.py +48 -35
  33. nextrec/models/ranking/afm.py +72 -37
  34. nextrec/models/ranking/autoint.py +72 -55
  35. nextrec/models/ranking/dcn.py +55 -35
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +32 -22
  38. nextrec/models/ranking/dien.py +155 -99
  39. nextrec/models/ranking/din.py +85 -57
  40. nextrec/models/ranking/fibinet.py +52 -32
  41. nextrec/models/ranking/fm.py +29 -23
  42. nextrec/models/ranking/masknet.py +91 -29
  43. nextrec/models/ranking/pnn.py +31 -28
  44. nextrec/models/ranking/widedeep.py +34 -26
  45. nextrec/models/ranking/xdeepfm.py +60 -38
  46. nextrec/utils/__init__.py +59 -34
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +30 -20
  49. nextrec/utils/distributed.py +36 -9
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +283 -165
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/METADATA +4 -4
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.4.1.dist-info/RECORD +0 -66
  61. {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/data/dataloader.py

@@ -5,6 +5,7 @@ Date: create on 27/10/2025
 Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
+
 import os
 import torch
 import logging
@@ -15,8 +16,12 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import cast
 
-from nextrec.basic.loggers import colorize
-from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
+from nextrec.basic.features import (
+    DenseFeature,
+    SparseFeature,
+    SequenceFeature,
+    FeatureSet,
+)
 from nextrec.data.preprocessor import DataProcessor
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 
@@ -25,49 +30,75 @@ from nextrec.utils.file import resolve_file_paths, read_table
 from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 
+
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
+
     def __init__(self, tensors: dict):
         self.features = tensors.get("features", {})
         self.labels = tensors.get("labels")
         self.ids = tensors.get("ids")
         if not self.features:
-            raise ValueError("[TensorDictDataset Error] Dataset requires at least one feature tensor.")
+            raise ValueError(
+                "[TensorDictDataset Error] Dataset requires at least one feature tensor."
+            )
         lengths = [tensor.shape[0] for tensor in self.features.values()]
         if not lengths:
             raise ValueError("[TensorDictDataset Error] Feature tensors are empty.")
         self.length = lengths[0]
         for length in lengths[1:]:
             if length != self.length:
-                raise ValueError("[TensorDictDataset Error] All feature tensors must have the same length.")
+                raise ValueError(
+                    "[TensorDictDataset Error] All feature tensors must have the same length."
+                )
+
     def __len__(self) -> int:
         return self.length
 
     def __getitem__(self, idx: int) -> dict:
         sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
-        sample_labels = {name: tensor[idx] for name, tensor in self.labels.items()} if self.labels else None
-        sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
+        sample_labels = (
+            {name: tensor[idx] for name, tensor in self.labels.items()}
+            if self.labels
+            else None
+        )
+        sample_ids = (
+            {name: tensor[idx] for name, tensor in self.ids.items()}
+            if self.ids
+            else None
+        )
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
+
 class FileDataset(FeatureSet, IterableDataset):
-    def __init__(self,
-                 file_paths: list[str], # file paths to read, containing CSV or Parquet files
-                 dense_features: list[DenseFeature], # dense feature definitions
-                 sparse_features: list[SparseFeature], # sparse feature definitions
-                 sequence_features: list[SequenceFeature], # sequence feature definitions
-                 target_columns: list[str], # target column names
-                 id_columns: list[str] | None = None, # id columns to carry through (not used for model inputs)
-                 chunk_size: int = 10000,
-                 file_type: str = 'csv',
-                 processor: DataProcessor | None = None): # optional DataProcessor for transformation
+    def __init__(
+        self,
+        file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+        dense_features: list[DenseFeature],  # dense feature definitions
+        sparse_features: list[SparseFeature],  # sparse feature definitions
+        sequence_features: list[SequenceFeature],  # sequence feature definitions
+        target_columns: list[str],  # target column names
+        id_columns: (
+            list[str] | None
+        ) = None,  # id columns to carry through (not used for model inputs)
+        chunk_size: int = 10000,
+        file_type: str = "csv",
+        processor: DataProcessor | None = None,
+    ):  # optional DataProcessor for transformation
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.set_all_features(dense_features, sparse_features, sequence_features, target_columns, id_columns)
+        self.set_all_features(
+            dense_features,
+            sparse_features,
+            sequence_features,
+            target_columns,
+            id_columns,
+        )
         self.current_file_index = 0
         self.total_files = len(file_paths)
-
+
     def __iter__(self):
         self.current_file_index = 0
         for file_path in self.file_paths:
@@ -75,54 +106,66 @@ class FileDataset(FeatureSet, IterableDataset):
             if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
-            if self.file_type == 'csv':
+            if self.file_type == "csv":
                 yield from self.read_csv_chunks(file_path)
-            elif self.file_type == 'parquet':
+            elif self.file_type == "parquet":
                 yield from self.read_parquet_chunks(file_path)
-
+
     def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
-
+
     def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
+            chunk = batch.to_pandas()
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk
-
+
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError("[DataLoader Error] DataProcessor must be fitted before using in streaming mode")
+                raise ValueError(
+                    "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+                )
             transformed_data = self.processor.transform(df, return_dict=True)
         else:
             transformed_data = df
         if isinstance(transformed_data, list):
-            raise TypeError("[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming.")
+            raise TypeError(
+                "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+            )
         safe_data = cast(dict | pd.DataFrame, transformed_data)
-        batch = build_tensors_from_data(data=safe_data, raw_data=df, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns)
+        batch = build_tensors_from_data(
+            data=safe_data,
+            raw_data=df,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if batch is not None:
             batch["_already_batched"] = True
         return batch
 
 
 class RecDataLoader(FeatureSet):
-    def __init__(self,
-                 dense_features: list[DenseFeature] | None = None,
-                 sparse_features: list[SparseFeature] | None = None,
-                 sequence_features: list[SequenceFeature] | None = None,
-                 target: list[str] | None | str = None,
-                 id_columns: str | list[str] | None = None,
-                 processor: DataProcessor | None = None):
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | None | str = None,
+        id_columns: str | list[str] | None = None,
+        processor: DataProcessor | None = None,
+    ):
         """
         RecDataLoader is a unified dataloader for supporting in-memory and streaming data.
         Basemodel will accept RecDataLoader to create dataloaders for training/evaluation/prediction.
 
-        Args:
+        Args:
            dense_features: list of DenseFeature definitions
            sparse_features: list of SparseFeature definitions
            sequence_features: list of SequenceFeature definitions
@@ -131,16 +174,28 @@ class RecDataLoader(FeatureSet):
            processor: an instance of DataProcessor, if provided, will be used to transform data before creating tensors.
         """
         self.processor = processor
-        self.set_all_features(dense_features, sparse_features, sequence_features, target, id_columns)
+        self.set_all_features(
+            dense_features, sparse_features, sequence_features, target, id_columns
+        )
 
-    def create_dataloader(self,
-                          data: dict | pd.DataFrame | str | DataLoader,
-                          batch_size: int = 32,
-                          shuffle: bool = True,
-                          load_full: bool = True,
-                          chunk_size: int = 10000,
-                          num_workers: int = 0,
-                          sampler = None) -> DataLoader:
+    def create_dataloader(
+        self,
+        data: (
+            dict
+            | pd.DataFrame
+            | str
+            | os.PathLike
+            | list[str]
+            | list[os.PathLike]
+            | DataLoader
+        ),
+        batch_size: int = 32,
+        shuffle: bool = True,
+        load_full: bool = True,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
         """
         Create a DataLoader from various data sources.
 
@@ -159,39 +214,104 @@ class RecDataLoader(FeatureSet):
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size, num_workers=num_workers)
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
+        elif (
+            isinstance(data, list)
+            and data
+            and all(isinstance(p, (str, os.PathLike)) for p in data)
+        ):
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, sampler=sampler)
+            return self.create_from_memory(
+                data=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                num_workers=num_workers,
+                sampler=sampler,
+            )
         else:
-            raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
-
-    def create_from_memory(self,
-                           data: dict | pd.DataFrame,
-                           batch_size: int,
-                           shuffle: bool,
-                           num_workers: int = 0,
-                           sampler=None) -> DataLoader:
+            raise ValueError(
+                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
+            )
+
+    def create_from_memory(
+        self,
+        data: dict | pd.DataFrame,
+        batch_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
 
         raw_data = data
 
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError("[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory")
-            data = self.processor.transform(data, return_dict=True) # type: ignore
-        tensors = build_tensors_from_data(data=data,raw_data=raw_data, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns,)
+                raise ValueError(
+                    "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
+                )
+            data = self.processor.transform(data, return_dict=True)  # type: ignore
+        tensors = build_tensors_from_data(
+            data=data,
+            raw_data=raw_data,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if tensors is None:
-            raise ValueError("[RecDataLoader Error] No valid tensors could be built from the provided data.")
+            raise ValueError(
+                "[RecDataLoader Error] No valid tensors could be built from the provided data."
+            )
         dataset = TensorDictDataset(tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=False if sampler is not None else shuffle, sampler=sampler, collate_fn=collate_fn, num_workers=num_workers)
-
-    def create_from_path(self,
-                         path: str,
-                         batch_size: int,
-                         shuffle: bool,
-                         load_full: bool,
-                         chunk_size: int = 10000,
-                         num_workers: int = 0) -> DataLoader:
-        file_paths, file_type = resolve_file_paths(str(Path(path)))
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False if sampler is not None else shuffle,
+            sampler=sampler,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+        )
+
+    def create_from_path(
+        self,
+        path: str | os.PathLike | list[str] | list[os.PathLike],
+        batch_size: int,
+        shuffle: bool,
+        load_full: bool,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+    ) -> DataLoader:
+        if isinstance(path, (str, os.PathLike)):
+            file_paths, file_type = resolve_file_paths(str(Path(path)))
+        else:
+            file_paths = [str(Path(p)) for p in path]
+            if not file_paths:
+                raise ValueError("[RecDataLoader Error] Empty file path list provided.")
+            suffixes = {Path(p).suffix.lower() for p in file_paths}
+            if len(suffixes) != 1:
+                raise ValueError(
+                    "[RecDataLoader Error] Mixed file types in provided list; please use only CSV or only Parquet."
+                )
+            suffix = suffixes.pop()
+            if suffix not in {".csv", ".parquet"}:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
+                )
+            file_type = "csv" if suffix == ".csv" else "parquet"
         # Load full data into memory
         if load_full:
             dfs = []
@@ -205,28 +325,60 @@ class RecDataLoader(FeatureSet):
                     df = read_table(file_path, file_type=file_type)
                     dfs.append(df)
                 except MemoryError as exc:
-                    raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
+                    raise MemoryError(
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming."
+                    ) from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
-                raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-            return self.create_from_memory(combined_df, batch_size, shuffle, num_workers=num_workers)
+                raise MemoryError(
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size."
+                ) from exc
+            return self.create_from_memory(
+                combined_df, batch_size, shuffle, num_workers=num_workers
+            )
         else:
-            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle, num_workers=num_workers)
+            return self.load_files_streaming(
+                file_paths,
+                file_type,
+                batch_size,
+                chunk_size,
+                shuffle,
+                num_workers=num_workers,
+            )
 
-    def load_files_streaming(self,
-                             file_paths: list[str],
-                             file_type: str,
-                             batch_size: int,
-                             chunk_size: int,
-                             shuffle: bool,
-                             num_workers: int = 0) -> DataLoader:
+    def load_files_streaming(
+        self,
+        file_paths: list[str],
+        file_type: str,
+        batch_size: int,
+        chunk_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+    ) -> DataLoader:
         if shuffle:
-            logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
+            logging.info(
+                "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
+            )
         if batch_size != 1:
-            logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
-        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
-        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers)
+            logging.info(
+                "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
+            )
+        dataset = FileDataset(
+            file_paths=file_paths,
+            dense_features=self.dense_features,
+            sparse_features=self.sparse_features,
+            sequence_features=self.sequence_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            chunk_size=chunk_size,
+            file_type=file_type,
+            processor=self.processor,
+        )
+        return DataLoader(
+            dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers
+        )
+
 
 def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
@@ -238,12 +390,20 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if column.ndim == 0:
         column = column.reshape(1)
     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values.")
-    if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
+        raise TypeError(
+            f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+        )
+    if (
+        column.dtype == object
+        and len(column) > 0
+        and isinstance(column[0], (list, tuple, np.ndarray))
+    ):
         sequences = []
         for seq in column:
             if isinstance(seq, str):
-                raise TypeError(f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values.")
+                raise TypeError(
+                    f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                )
             if isinstance(seq, (list, tuple, np.ndarray)):
                 arr = np.asarray(seq, dtype=np.int64)
             else:
@@ -258,25 +418,32 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
             if len(seq) > max_len:
                 padded.append(seq[:max_len])
             else:
-                padded.append(np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value))
+                padded.append(
+                    np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+                )
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)
 
-def build_tensors_from_data(
+
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
-    id_columns: list[str]
+    id_columns: list[str],
 ) -> dict | None:
     feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
-            raise ValueError(f"[RecDataLoader Error] Feature column '{feature.name}' not found in data")
-        if isinstance(feature, SequenceFeature): # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+            raise ValueError(
+                f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
+            )
+        if isinstance(
+            feature, SequenceFeature
+        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
             arr = normalize_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
@@ -293,8 +460,14 @@ def build_tensors_from_data(
         column = get_column_data(data, target_name)
         if column is None:
             continue
-        label_tensor = to_tensor(np.asarray(column, dtype=np.float32), dtype=torch.float32)
-        if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+        label_tensor = to_tensor(
+            np.asarray(column, dtype=np.float32), dtype=torch.float32
+        )
+        if (
+            label_tensor.dim() == 2
+            and label_tensor.shape[0] == 1
+            and label_tensor.shape[1] > 1
+        ):
             label_tensor = label_tensor.t()
         if label_tensor.shape[1:] == (1,):
             label_tensor = label_tensor.squeeze(1)
@@ -309,11 +482,15 @@ def build_tensors_from_data(
         if column is None:
             column = get_column_data(data, id_col)
         if column is None:
-            raise KeyError(f"[RecDataLoader Error] ID column '{id_col}' not found in provided data.")
+            raise KeyError(
+                f"[RecDataLoader Error] ID column '{id_col}' not found in provided data."
+            )
         try:
             id_arr = np.asarray(column, dtype=np.int64)
         except Exception as exc:
-            raise TypeError( f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
+            raise TypeError(
+                f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}"
+            ) from exc
         id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None
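
For orientation, the sketch below shows how the reworked dataloader in 0.4.2 might be used, based only on the signatures visible in the hunks above (RecDataLoader.__init__, create_dataloader, and the new list-of-paths branch in create_from_path). It is not part of the diff; the DenseFeature/SparseFeature/SequenceFeature constructor arguments, column names, and file names are illustrative assumptions, not taken from the package.

# Hypothetical usage sketch for nextrec.data.dataloader.RecDataLoader (0.4.2).
# Feature constructor arguments below are assumed; see nextrec/basic/features.py.
import pandas as pd

from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature
from nextrec.data.dataloader import RecDataLoader

loader = RecDataLoader(
    dense_features=[DenseFeature(name="price")],  # assumed args
    sparse_features=[SparseFeature(name="user_id", vocab_size=100)],  # assumed args
    sequence_features=[SequenceFeature(name="click_seq", vocab_size=100, max_len=4)],  # assumed args
    target="label",  # str or list[str], per the __init__ signature above
    id_columns=["request_id"],
)

# In-memory data (dict or DataFrame) is routed to create_from_memory.
df = pd.DataFrame(
    {
        "price": [9.9, 3.5],
        "user_id": [1, 2],
        "click_seq": [[3, 4], [5]],
        "label": [1, 0],
        "request_id": [100, 101],
    }
)
train_dl = loader.create_dataloader(df, batch_size=256, shuffle=True)

# A single path, or (new in 0.4.2) a list of CSV/Parquet paths, is routed to
# create_from_path. With load_full=False the files are streamed chunk by chunk:
# shuffle is ignored and the DataLoader batch_size is forced to 1, so tune
# chunk_size to control memory and throughput instead.
stream_dl = loader.create_dataloader(
    ["part-0001.parquet", "part-0002.parquet"],
    load_full=False,
    chunk_size=50_000,
)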