nextrec 0.3.6-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +244 -113
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1373 -443
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +42 -24
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +303 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +106 -40
  23. nextrec/models/match/dssm.py +82 -69
  24. nextrec/models/match/dssm_v2.py +72 -58
  25. nextrec/models/match/mind.py +175 -108
  26. nextrec/models/match/sdm.py +104 -88
  27. nextrec/models/match/youtube_dnn.py +73 -60
  28. nextrec/models/multi_task/esmm.py +53 -39
  29. nextrec/models/multi_task/mmoe.py +70 -47
  30. nextrec/models/multi_task/ple.py +107 -50
  31. nextrec/models/multi_task/poso.py +121 -41
  32. nextrec/models/multi_task/share_bottom.py +54 -38
  33. nextrec/models/ranking/afm.py +172 -45
  34. nextrec/models/ranking/autoint.py +84 -61
  35. nextrec/models/ranking/dcn.py +59 -42
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +36 -26
  38. nextrec/models/ranking/dien.py +158 -102
  39. nextrec/models/ranking/din.py +88 -60
  40. nextrec/models/ranking/fibinet.py +55 -35
  41. nextrec/models/ranking/fm.py +32 -26
  42. nextrec/models/ranking/masknet.py +95 -34
  43. nextrec/models/ranking/pnn.py +34 -31
  44. nextrec/models/ranking/widedeep.py +37 -29
  45. nextrec/models/ranking/xdeepfm.py +63 -41
  46. nextrec/utils/__init__.py +61 -32
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +52 -12
  49. nextrec/utils/distributed.py +141 -0
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +531 -0
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.3.6.dist-info/RECORD +0 -64
  61. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/data/dataloader.py
@@ -5,6 +5,7 @@ Date: create on 27/10/2025
 Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
+
 import os
 import torch
 import logging
@@ -15,59 +16,89 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import cast
 
-from torch.utils.data import DataLoader, Dataset, IterableDataset
+from nextrec.basic.features import (
+    DenseFeature,
+    SparseFeature,
+    SequenceFeature,
+    FeatureSet,
+)
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
+from torch.utils.data import DataLoader, Dataset, IterableDataset
 
-from nextrec.basic.loggers import colorize
-from nextrec.data.data_processing import get_column_data
-from nextrec.data.batch_utils import collate_fn
-from nextrec.utils.file import resolve_file_paths, read_table
 from nextrec.utils.tensor import to_tensor
+from nextrec.utils.file import resolve_file_paths, read_table
+from nextrec.data.batch_utils import collate_fn
+from nextrec.data.data_processing import get_column_data
+
 
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
+
     def __init__(self, tensors: dict):
         self.features = tensors.get("features", {})
         self.labels = tensors.get("labels")
         self.ids = tensors.get("ids")
         if not self.features:
-            raise ValueError("[TensorDictDataset Error] Dataset requires at least one feature tensor.")
+            raise ValueError(
+                "[TensorDictDataset Error] Dataset requires at least one feature tensor."
+            )
         lengths = [tensor.shape[0] for tensor in self.features.values()]
         if not lengths:
             raise ValueError("[TensorDictDataset Error] Feature tensors are empty.")
         self.length = lengths[0]
         for length in lengths[1:]:
             if length != self.length:
-                raise ValueError("[TensorDictDataset Error] All feature tensors must have the same length.")
+                raise ValueError(
+                    "[TensorDictDataset Error] All feature tensors must have the same length."
+                )
+
     def __len__(self) -> int:
         return self.length
 
     def __getitem__(self, idx: int) -> dict:
         sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
-        sample_labels = {name: tensor[idx] for name, tensor in self.labels.items()} if self.labels else None
-        sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
+        sample_labels = (
+            {name: tensor[idx] for name, tensor in self.labels.items()}
+            if self.labels
+            else None
+        )
+        sample_ids = (
+            {name: tensor[idx] for name, tensor in self.ids.items()}
+            if self.ids
+            else None
+        )
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
+
 class FileDataset(FeatureSet, IterableDataset):
-    def __init__(self,
-                 file_paths: list[str], # file paths to read, containing CSV or Parquet files
-                 dense_features: list[DenseFeature], # dense feature definitions
-                 sparse_features: list[SparseFeature], # sparse feature definitions
-                 sequence_features: list[SequenceFeature], # sequence feature definitions
-                 target_columns: list[str], # target column names
-                 id_columns: list[str] | None = None, # id columns to carry through (not used for model inputs)
-                 chunk_size: int = 10000,
-                 file_type: str = 'csv',
-                 processor: DataProcessor | None = None): # optional DataProcessor for transformation
+    def __init__(
+        self,
+        file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+        dense_features: list[DenseFeature],  # dense feature definitions
+        sparse_features: list[SparseFeature],  # sparse feature definitions
+        sequence_features: list[SequenceFeature],  # sequence feature definitions
+        target_columns: list[str],  # target column names
+        id_columns: (
+            list[str] | None
+        ) = None,  # id columns to carry through (not used for model inputs)
+        chunk_size: int = 10000,
+        file_type: str = "csv",
+        processor: DataProcessor | None = None,
+    ):  # optional DataProcessor for transformation
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.set_all_features(dense_features, sparse_features, sequence_features, target_columns, id_columns)
+        self.set_all_features(
+            dense_features,
+            sparse_features,
+            sequence_features,
+            target_columns,
+            id_columns,
+        )
         self.current_file_index = 0
         self.total_files = len(file_paths)
-
+
     def __iter__(self):
         self.current_file_index = 0
         for file_path in self.file_paths:
@@ -75,93 +106,212 @@ class FileDataset(FeatureSet, IterableDataset):
             if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
-            if self.file_type == 'csv':
+            if self.file_type == "csv":
                 yield from self.read_csv_chunks(file_path)
-            elif self.file_type == 'parquet':
+            elif self.file_type == "parquet":
                 yield from self.read_parquet_chunks(file_path)
-
+
     def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
-
+
     def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
+            chunk = batch.to_pandas()
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk
-
+
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError("[DataLoader Error] DataProcessor must be fitted before using in streaming mode")
+                raise ValueError(
+                    "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+                )
             transformed_data = self.processor.transform(df, return_dict=True)
         else:
             transformed_data = df
         if isinstance(transformed_data, list):
-            raise TypeError("[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming.")
+            raise TypeError(
+                "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+            )
         safe_data = cast(dict | pd.DataFrame, transformed_data)
-        batch = build_tensors_from_data(data=safe_data, raw_data=df, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns)
+        batch = build_tensors_from_data(
+            data=safe_data,
+            raw_data=df,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if batch is not None:
             batch["_already_batched"] = True
         return batch
 
 
 class RecDataLoader(FeatureSet):
-    def __init__(self,
-                 dense_features: list[DenseFeature] | None = None,
-                 sparse_features: list[SparseFeature] | None = None,
-                 sequence_features: list[SequenceFeature] | None = None,
-                 target: list[str] | None | str = None,
-                 id_columns: str | list[str] | None = None,
-                 processor: DataProcessor | None = None):
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | None | str = None,
+        id_columns: str | list[str] | None = None,
+        processor: DataProcessor | None = None,
+    ):
+        """
+        RecDataLoader is a unified dataloader for supporting in-memory and streaming data.
+        Basemodel will accept RecDataLoader to create dataloaders for training/evaluation/prediction.
+
+        Args:
+            dense_features: list of DenseFeature definitions
+            sparse_features: list of SparseFeature definitions
+            sequence_features: list of SequenceFeature definitions
+            target: target column name(s), e.g. 'label' or ['ctr', 'ctcvr']
+            id_columns: id column name(s) to carry through (not used for model inputs), e.g. 'user_id' or ['user_id', 'item_id']
+            processor: an instance of DataProcessor, if provided, will be used to transform data before creating tensors.
+        """
         self.processor = processor
-        self.set_all_features(dense_features, sparse_features, sequence_features, target, id_columns)
+        self.set_all_features(
+            dense_features, sparse_features, sequence_features, target, id_columns
+        )
+
+    def create_dataloader(
+        self,
+        data: (
+            dict
+            | pd.DataFrame
+            | str
+            | os.PathLike
+            | list[str]
+            | list[os.PathLike]
+            | DataLoader
+        ),
+        batch_size: int = 32,
+        shuffle: bool = True,
+        load_full: bool = True,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
+        """
+        Create a DataLoader from various data sources.
+
+        Args:
+            data: Data source, can be a dict, pd.DataFrame, file path (str), or existing DataLoader.
+            batch_size: Batch size for DataLoader.
+            shuffle: Whether to shuffle the data (ignored in streaming mode).
+            load_full: If True, load full data into memory; if False, use streaming mode for large files.
+            chunk_size: Chunk size for streaming mode (number of rows per chunk).
+            num_workers: Number of worker processes for data loading.
+            sampler: Optional sampler for DataLoader, only used for distributed training.
+        Returns:
+            DataLoader instance.
+        """
 
-    def create_dataloader(self,
-                          data: dict | pd.DataFrame | str | DataLoader,
-                          batch_size: int = 32,
-                          shuffle: bool = True,
-                          load_full: bool = True,
-                          chunk_size: int = 10000,
-                          num_workers: int = 0) -> DataLoader:
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size, num_workers=num_workers)
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
+        elif (
+            isinstance(data, list)
+            and data
+            and all(isinstance(p, (str, os.PathLike)) for p in data)
+        ):
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
+            return self.create_from_memory(
+                data=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                num_workers=num_workers,
+                sampler=sampler,
+            )
         else:
-            raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
-
-    def create_from_memory(self,
-                           data: dict | pd.DataFrame,
-                           batch_size: int,
-                           shuffle: bool,
-                           num_workers: int = 0) -> DataLoader:
+            raise ValueError(
+                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
+            )
+
+    def create_from_memory(
+        self,
+        data: dict | pd.DataFrame,
+        batch_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
+
         raw_data = data
 
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError("[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory")
-            data = self.processor.transform(data, return_dict=True) # type: ignore
-        tensors = build_tensors_from_data(data=data,raw_data=raw_data, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns,)
+                raise ValueError(
+                    "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
+                )
+            data = self.processor.transform(data, return_dict=True)  # type: ignore
+        tensors = build_tensors_from_data(
+            data=data,
+            raw_data=raw_data,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if tensors is None:
-            raise ValueError("[RecDataLoader Error] No valid tensors could be built from the provided data.")
+            raise ValueError(
+                "[RecDataLoader Error] No valid tensors could be built from the provided data."
+            )
         dataset = TensorDictDataset(tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, num_workers=num_workers)
-
-    def create_from_path(self,
-                         path: str,
-                         batch_size: int,
-                         shuffle: bool,
-                         load_full: bool,
-                         chunk_size: int = 10000,
-                         num_workers: int = 0) -> DataLoader:
-        file_paths, file_type = resolve_file_paths(str(Path(path)))
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False if sampler is not None else shuffle,
+            sampler=sampler,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+        )
+
+    def create_from_path(
+        self,
+        path: str | os.PathLike | list[str] | list[os.PathLike],
+        batch_size: int,
+        shuffle: bool,
+        load_full: bool,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+    ) -> DataLoader:
+        if isinstance(path, (str, os.PathLike)):
+            file_paths, file_type = resolve_file_paths(str(Path(path)))
+        else:
+            file_paths = [str(Path(p)) for p in path]
+            if not file_paths:
+                raise ValueError("[RecDataLoader Error] Empty file path list provided.")
+            suffixes = {Path(p).suffix.lower() for p in file_paths}
+            if len(suffixes) != 1:
+                raise ValueError(
+                    "[RecDataLoader Error] Mixed file types in provided list; please use only CSV or only Parquet."
+                )
+            suffix = suffixes.pop()
+            if suffix not in {".csv", ".parquet"}:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
+                )
+            file_type = "csv" if suffix == ".csv" else "parquet"
         # Load full data into memory
         if load_full:
             dfs = []
@@ -175,28 +325,60 @@ class RecDataLoader(FeatureSet):
                     df = read_table(file_path, file_type=file_type)
                     dfs.append(df)
                 except MemoryError as exc:
-                    raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
+                    raise MemoryError(
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming."
+                    ) from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
-                raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-            return self.create_from_memory(combined_df, batch_size, shuffle, num_workers=num_workers)
+                raise MemoryError(
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size."
+                ) from exc
+            return self.create_from_memory(
+                combined_df, batch_size, shuffle, num_workers=num_workers
+            )
         else:
-            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle, num_workers=num_workers)
+            return self.load_files_streaming(
+                file_paths,
+                file_type,
+                batch_size,
+                chunk_size,
+                shuffle,
+                num_workers=num_workers,
+            )
 
-    def load_files_streaming(self,
-                             file_paths: list[str],
-                             file_type: str,
-                             batch_size: int,
-                             chunk_size: int,
-                             shuffle: bool,
-                             num_workers: int = 0) -> DataLoader:
+    def load_files_streaming(
+        self,
+        file_paths: list[str],
+        file_type: str,
+        batch_size: int,
+        chunk_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+    ) -> DataLoader:
         if shuffle:
-            logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
+            logging.info(
+                "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
+            )
         if batch_size != 1:
-            logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
-        dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
-        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers)
+            logging.info(
+                "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
+            )
+        dataset = FileDataset(
+            file_paths=file_paths,
+            dense_features=self.dense_features,
+            sparse_features=self.sparse_features,
+            sequence_features=self.sequence_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            chunk_size=chunk_size,
+            file_type=file_type,
+            processor=self.processor,
+        )
+        return DataLoader(
+            dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers
+        )
+
 
 def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
@@ -208,12 +390,20 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if column.ndim == 0:
         column = column.reshape(1)
     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values.")
-    if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
+        raise TypeError(
+            f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+        )
+    if (
+        column.dtype == object
+        and len(column) > 0
+        and isinstance(column[0], (list, tuple, np.ndarray))
+    ):
         sequences = []
         for seq in column:
             if isinstance(seq, str):
-                raise TypeError(f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values.")
+                raise TypeError(
+                    f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                )
             if isinstance(seq, (list, tuple, np.ndarray)):
                 arr = np.asarray(seq, dtype=np.int64)
             else:
@@ -228,25 +418,32 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
             if len(seq) > max_len:
                 padded.append(seq[:max_len])
             else:
-                padded.append(np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value))
+                padded.append(
+                    np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+                )
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)
 
-def build_tensors_from_data(
+
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
-    id_columns: list[str]
+    id_columns: list[str],
 ) -> dict | None:
     feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
-            raise ValueError(f"[RecDataLoader Error] Feature column '{feature.name}' not found in data")
-        if isinstance(feature, SequenceFeature): # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+            raise ValueError(
+                f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
+            )
+        if isinstance(
+            feature, SequenceFeature
+        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
             arr = normalize_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
@@ -263,8 +460,14 @@ def build_tensors_from_data(
         column = get_column_data(data, target_name)
         if column is None:
            continue
-        label_tensor = to_tensor(np.asarray(column, dtype=np.float32), dtype=torch.float32)
-        if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+        label_tensor = to_tensor(
+            np.asarray(column, dtype=np.float32), dtype=torch.float32
+        )
+        if (
+            label_tensor.dim() == 2
+            and label_tensor.shape[0] == 1
+            and label_tensor.shape[1] > 1
+        ):
             label_tensor = label_tensor.t()
         if label_tensor.shape[1:] == (1,):
             label_tensor = label_tensor.squeeze(1)
@@ -279,11 +482,15 @@ def build_tensors_from_data(
         if column is None:
             column = get_column_data(data, id_col)
         if column is None:
-            raise KeyError(f"[RecDataLoader Error] ID column '{id_col}' not found in provided data.")
+            raise KeyError(
+                f"[RecDataLoader Error] ID column '{id_col}' not found in provided data."
+            )
         try:
             id_arr = np.asarray(column, dtype=np.int64)
         except Exception as exc:
-            raise TypeError( f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
+            raise TypeError(
+                f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}"
+            ) from exc
         id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None
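
For orientation only, below is a minimal usage sketch of the 0.4.2 RecDataLoader API, based solely on the signatures visible in this diff. The DenseFeature/SparseFeature/SequenceFeature constructors live in nextrec.basic.features and are not part of this diff, so the feature lists are passed in as pre-built arguments; the file names ("train.csv", the Parquet shards) are hypothetical.

import pandas as pd
from torch.utils.data import DataLoader

from nextrec.data.dataloader import RecDataLoader


def make_loaders(dense_features, sparse_features, sequence_features) -> tuple[DataLoader, DataLoader]:
    # create_dataloader dispatches on the input type: an existing DataLoader is
    # returned as-is, a dict/DataFrame goes through create_from_memory, and a
    # path or (new in 0.4.2) a list of CSV/Parquet paths goes through create_from_path.
    rec = RecDataLoader(
        dense_features=dense_features,
        sparse_features=sparse_features,
        sequence_features=sequence_features,
        target="label",          # single target column
        id_columns=["user_id"],  # carried through, not used as model input
    )

    # In-memory: the whole DataFrame is turned into a TensorDictDataset.
    train_df = pd.read_csv("train.csv")
    in_memory_loader = rec.create_dataloader(train_df, batch_size=256, shuffle=True)

    # Streaming: load_full=False wraps the files in a FileDataset (IterableDataset);
    # batch_size is forced to 1 internally and chunk_size controls rows per chunk.
    streaming_loader = rec.create_dataloader(
        ["part-0001.parquet", "part-0002.parquet"],
        load_full=False,
        chunk_size=50_000,
    )
    return in_memory_loader, streaming_loader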