nextrec 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,32 +6,64 @@ import pandas as pd
 import pyarrow.parquet as pq
 from pathlib import Path
 
+
+def _stack_section(batch: list[dict], section: str):
+    """Stack one section of the batch (features/labels/ids)."""
+    entries = [item.get(section) for item in batch if item.get(section) is not None]
+    if not entries:
+        return None
+    merged: dict = {}
+    for name in entries[0]:
+        tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
+        merged[name] = torch.stack(tensors, dim=0)
+    return merged
+
+
 def collate_fn(batch):
-    """Collate a list of tensor tuples from ``FileDataset`` into batched tensors."""
+    """
+    Collate a list of sample dicts into the unified batch format:
+        {
+            "features": {name: Tensor(B, ...)},
+            "labels": {target: Tensor(B, ...)} or None,
+            "ids": {id_name: Tensor(B, ...)} or None,
+        }
+    """
     if not batch:
-        return tuple()
-
-    num_tensors = len(batch[0])
+        return {"features": {}, "labels": None, "ids": None}
+
+    first = batch[0]
+    if isinstance(first, dict) and "features" in first:
+        # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
+        if first.get("_already_batched") and len(batch) == 1:
+            return {
+                "features": first.get("features", {}),
+                "labels": first.get("labels"),
+                "ids": first.get("ids"),
+            }
+        return {
+            "features": _stack_section(batch, "features") or {},
+            "labels": _stack_section(batch, "labels"),
+            "ids": _stack_section(batch, "ids"),
+        }
+
+    # Fallback: stack tuples/lists of tensors
+    num_tensors = len(first)
     result = []
-
     for i in range(num_tensors):
         tensor_list = [item[i] for item in batch]
-        first = tensor_list[0]
-
-        if isinstance(first, torch.Tensor):
+        first_item = tensor_list[0]
+        if isinstance(first_item, torch.Tensor):
             stacked = torch.cat(tensor_list, dim=0)
-        elif isinstance(first, np.ndarray):
+        elif isinstance(first_item, np.ndarray):
            stacked = np.concatenate(tensor_list, axis=0)
-        elif isinstance(first, list):
+        elif isinstance(first_item, list):
             combined = []
             for entry in tensor_list:
                 combined.extend(entry)
             stacked = combined
         else:
             stacked = tensor_list
-
         result.append(stacked)
-
     return tuple(result)
 
 
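The new collate_fn returns a dict rather than a tuple. A minimal sketch of the per-sample path it handles (the feature and label names below are illustrative, not taken from the package):

    import torch
    batch = [
        {"features": {"user_id": torch.tensor(1)}, "labels": {"label": torch.tensor(1.0)}, "ids": None},
        {"features": {"user_id": torch.tensor(2)}, "labels": {"label": torch.tensor(0.0)}, "ids": None},
    ]
    out = collate_fn(batch)
    # out["features"]["user_id"] -> shape (2,); out["labels"]["label"] -> shape (2,); out["ids"] -> None
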
@@ -2,6 +2,7 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
+Update: 25/11/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 import os
@@ -15,7 +16,7 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import Iterator, Literal, Union, Optional
 
-from torch.utils.data import DataLoader, TensorDataset, IterableDataset
+from torch.utils.data import DataLoader, Dataset, IterableDataset
 from nextrec.data.preprocessor import DataProcessor
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
 
@@ -27,27 +28,33 @@ from nextrec.data import (
 read_table,
 )
 
+class TensorDictDataset(Dataset):
+    """Dataset returning sample-level dicts matching the unified batch schema."""
+    def __init__(self, tensors: dict):
+        self.features = tensors.get("features", {})
+        self.labels = tensors.get("labels")
+        self.ids = tensors.get("ids")
+        if not self.features:
+            raise ValueError("Dataset requires at least one feature tensor.")
+        lengths = [tensor.shape[0] for tensor in self.features.values()]
+        if not lengths:
+            raise ValueError("Feature tensors are empty.")
+        self.length = lengths[0]
+        for length in lengths[1:]:
+            if length != self.length:
+                raise ValueError("All feature tensors must have the same length.")
+
+    def __len__(self) -> int:
+        return self.length
+
+    def __getitem__(self, idx: int) -> dict:
+        sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
+        sample_labels = {name: tensor[idx] for name, tensor in self.labels.items()} if self.labels else None
+        sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
+        return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
-class FileDataset(FeatureSpecMixin, IterableDataset):
-    """
-    Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
-
-    :param file_paths: Absolute or relative paths to CSV/Parquet files.
-    :param dense_features: Dense feature definitions (float tensors).
-    :param sparse_features: Sparse/categorical feature definitions (int tensors).
-    :param sequence_features: Sequence feature definitions (padded int tensors).
-    :param target_columns: Label/target column names.
-    :param id_columns: Optional ID columns appended after targets.
-    :param chunk_size: Number of rows to read per chunk.
-    :param file_type: ``"csv"`` or ``"parquet"``.
-    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for online transform.
 
-    Yields
-    ------
-    tuple
-        Tensors ordered as ``dense + sparse + sequence + targets (+ ids)``. Shape respects chunk size.
-    """
-
+class FileDataset(FeatureSpecMixin, IterableDataset):
     def __init__(self,
                  file_paths: list[str],  # file paths to read, containing CSV or Parquet files
                  dense_features: list[DenseFeature],  # dense feature definitions
@@ -58,95 +65,48 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
                  chunk_size: int = 10000,
                  file_type: str = 'csv',
                  processor: DataProcessor | None = None):  # optional DataProcessor for transformation
-        """
-        Initialize a streaming dataset backed by on-disk files.
-        """
-
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-
-        self._set_feature_config(dense_features, sparse_features, sequence_features)
-        self._set_target_config(target_columns, id_columns or [])
+        self._set_feature_config(dense_features, sparse_features, sequence_features, target_columns, id_columns)
         self.current_file_index = 0
         self.total_files = len(file_paths)
 
-    def __iter__(self) -> Iterator[tuple]:
-        """
-        Iterate over files and stream tensor tuples chunk by chunk.
-
-        Files are processed sequentially; each chunk is transformed (optionally via
-        ``processor``) and converted to tensors before being yielded to PyTorch ``DataLoader``.
-        """
+    def __iter__(self):
         self.current_file_index = 0
         self._file_pbar = None
-
-        # Create progress bar for file processing when multiple files
         if self.total_files > 1:
-            self._file_pbar = tqdm.tqdm(
-                total=self.total_files,
-                desc="Files",
-                unit="file",
-                position=0,
-                leave=True,
-                bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
-            )
-
+            self._file_pbar = tqdm.tqdm(total=self.total_files, desc="Files", unit="file", position=0, leave=True, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
         for file_path in self.file_paths:
             self.current_file_index += 1
-
             if self._file_pbar is not None:
                 self._file_pbar.update(1)
             elif self.total_files == 1:
                 file_name = os.path.basename(file_path)
-                logging.info(colorize(f"Processing file: {file_name}", color="cyan"))
-
+                logging.info(f"Processing file: {file_name}")
             if self.file_type == 'csv':
                 yield from self._read_csv_chunks(file_path)
             elif self.file_type == 'parquet':
                 yield from self._read_parquet_chunks(file_path)
-
         if self._file_pbar is not None:
             self._file_pbar.close()
 
-    def _read_csv_chunks(self, file_path: str) -> Iterator[tuple]:
-        """
-        Stream a CSV file chunk by chunk.
-
-        :param file_path: Path to the CSV file.
-        :yields: Tensor tuples for each chunk.
-        """
+    def _read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-
         for chunk in chunk_iterator:
             tensors = self._dataframe_to_tensors(chunk)
-            if tensors:
-                yield tensors
+            yield tensors
 
-    def _read_parquet_chunks(self, file_path: str) -> Iterator[tuple]:
-        """
-        Stream a Parquet file via ``pyarrow`` batch reading.
-
-        :param file_path: Path to the Parquet file.
-        :yields: Tensor tuples for each batch.
-        """
-
+    def _read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
             chunk = batch.to_pandas()
             tensors = self._dataframe_to_tensors(chunk)
-            if tensors:
-                yield tensors
+            yield tensors
             del chunk
 
-    def _dataframe_to_tensors(self, df: pd.DataFrame) -> tuple | None:
-        """
-        Convert a DataFrame chunk into a tuple of tensors respecting feature order.
-
-        :param df: DataFrame chunk.
-        :returns: Tuple of tensors (features + targets + ids) or ``None`` if no tensors created.
-        """
+    def _dataframe_to_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
                 raise ValueError("DataProcessor must be fitted before using in streaming mode")
@@ -154,44 +114,19 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
         else:
             transformed_data = df
 
-        return build_tensors_from_data(
+        batch = build_tensors_from_data(
             data=transformed_data,
             raw_data=df,
             features=self.all_features,
             target_columns=self.target_columns,
             id_columns=self.id_columns,
-            on_missing_feature="raise",
         )
+        if batch is not None:
+            batch["_already_batched"] = True
+        return batch
 
 
 class RecDataLoader(FeatureSpecMixin):
-    """
-    Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
-
-    :param dense_features: Dense feature definitions (float tensors).
-    :param sparse_features: Sparse/categorical feature definitions (int tensors).
-    :param sequence_features: Sequence feature definitions (padded int tensors).
-    :param target: Target column name(s); string or list.
-    :param id_columns: Optional ID column name(s) appended after targets.
-    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for preprocessing.
-
-    Examples
-    --------
-    >>> loader = RecDataLoader(
-    ...     dense_features=dense_features,
-    ...     sparse_features=sparse_features,
-    ...     sequence_features=sequence_features,
-    ...     target=['label'],
-    ...     processor=processor,
-    ... )
-    >>> dataloader = loader.create_dataloader(
-    ...     data="/path/to/data.csv",
-    ...     batch_size=1024,
-    ...     load_full=False,
-    ...     chunk_size=20000,
-    ... )
-    """
-
     def __init__(self,
                  dense_features: list[DenseFeature] | None = None,
                  sparse_features: list[SparseFeature] | None = None,
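The Examples block removed from the RecDataLoader docstring above still matches the public call pattern; as a sketch (feature lists, processor, and the path are placeholders):

    loader = RecDataLoader(
        dense_features=dense_features,
        sparse_features=sparse_features,
        sequence_features=sequence_features,
        target=['label'],
        processor=processor,
    )
    dataloader = loader.create_dataloader(
        data="/path/to/data.csv",
        batch_size=1024,
        load_full=False,    # stream the file in chunks via FileDataset
        chunk_size=20000,
    )
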
@@ -199,20 +134,8 @@ class RecDataLoader(FeatureSpecMixin):
                 target: list[str] | None | str = None,
                 id_columns: str | list[str] | None = None,
                 processor: Optional['DataProcessor'] = None):
-        """
-        Initialize the loader with feature/target definitions.
-
-        :param dense_features: Dense feature definitions (float).
-        :param sparse_features: Sparse feature definitions (int).
-        :param sequence_features: Sequence feature definitions (int, padded).
-        :param target: Single target name or list of names.
-        :param id_columns: Optional ID columns to append in output.
-        :param processor: Optional fitted ``DataProcessor`` for preprocessing.
-        """
-
         self.processor = processor
-        self._set_feature_config(dense_features, sparse_features, sequence_features)
-        self._set_target_config(target, id_columns)
+        self._set_feature_config(dense_features, sparse_features, sequence_features, target, id_columns)
 
     def create_dataloader(self,
                           data: Union[dict, pd.DataFrame, str, DataLoader],
@@ -220,79 +143,38 @@ class RecDataLoader(FeatureSpecMixin):
                           shuffle: bool = True,
                           load_full: bool = True,
                           chunk_size: int = 10000) -> DataLoader:
-        """
-        Build a ``DataLoader`` from in-memory data, file path, or an existing loader.
-
-        :param data: Dict/DataFrame (in-memory), path to CSV/Parquet file/dir, or an existing ``DataLoader``.
-        :param batch_size: Batch size for the returned ``DataLoader``.
-        :param shuffle: Shuffle flag passed to PyTorch ``DataLoader`` (for in-memory and streaming batches).
-        :param load_full: If ``True``, load all files into memory; if ``False``, stream with chunks.
-        :param chunk_size: Number of rows per chunk when ``load_full=False``.
-        :returns: A configured PyTorch ``DataLoader``.
-        """
         if isinstance(data, DataLoader):
             return data
-
-        if isinstance(data, (str, os.PathLike)):
-            return self._create_from_path(data, batch_size, shuffle, load_full, chunk_size)
-
-        if isinstance(data, (dict, pd.DataFrame)):
-            return self._create_from_memory(data, batch_size, shuffle)
-
-        raise ValueError(f"Unsupported data type: {type(data)}")
+        elif isinstance(data, (str, os.PathLike)):
+            return self._create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
+        elif isinstance(data, (dict, pd.DataFrame)):
+            return self._create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
+        else:
+            raise ValueError(f"Unsupported data type: {type(data)}")
 
     def _create_from_memory(self,
                             data: Union[dict, pd.DataFrame],
                             batch_size: int,
                             shuffle: bool) -> DataLoader:
-        """
-        Convert in-memory data (dict/DataFrame) into tensors and wrap with ``DataLoader``.
-
-        :param data: Dict or DataFrame containing feature/target columns.
-        :param batch_size: Batch size.
-        :param shuffle: Whether to shuffle batches.
-        :returns: A ``DataLoader`` backed by ``TensorDataset``.
-        """
-
         raw_data = data
 
         if self.processor is not None:
-            assert self.processor.is_fitted, "DataProcessor must be fitted before using in RecDataLoader"
+            if not self.processor.is_fitted:
+                raise ValueError("DataProcessor must be fitted before transforming data in memory")
             data = self.processor.transform(data, return_dict=True)
-
-        tensors = build_tensors_from_data(
-            data=data,
-            raw_data=raw_data,
-            features=self.all_features,
-            target_columns=self.target_columns,
-            id_columns=self.id_columns,
-            on_missing_feature="raise",
-        )
-
-        assert tensors is not None, "No tensors were created from provided data."
-
-        dataset = TensorDataset(*tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+        tensors = build_tensors_from_data(data=data, raw_data=raw_data, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns)
+        if tensors is None:
+            raise ValueError("No valid tensors could be built from the provided data.")
+        dataset = TensorDictDataset(tensors)
+        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
 
     def _create_from_path(self,
                           path: str,
                           batch_size: int,
                           shuffle: bool,
                           load_full: bool,
-                          chunk_size: int) -> DataLoader:
-        """
-        Build a ``DataLoader`` from a CSV/Parquet file or directory.
-
-        :param path: File path or directory containing homogeneous CSV/Parquet files.
-        :param batch_size: Batch size.
-        :param shuffle: Shuffle flag.
-        :param load_full: If ``True``, load all rows into memory; otherwise stream.
-        :param chunk_size: Chunk rows when streaming.
-        :returns: A ``DataLoader`` (in-memory or streaming).
-        """
-
+                          chunk_size: int = 10000) -> DataLoader:
         file_paths, file_type = resolve_file_paths(str(Path(path)))
-
         # Load full data into memory
         if load_full:
             dfs = []
@@ -306,20 +188,12 @@ class RecDataLoader(FeatureSpecMixin):
                     df = read_table(file_path, file_type)
                     dfs.append(df)
                 except MemoryError as exc:
-                    raise MemoryError(
-                        f"Out of memory while reading {file_path}. "
-                        f"Consider using load_full=False with streaming."
-                    ) from exc
-
+                    raise MemoryError(f"Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
-                raise MemoryError(
-                    f"Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). "
-                    f"Use load_full=False to stream or reduce chunk_size."
-                ) from exc
-
-            return self._create_from_memory(combined_df, batch_size, shuffle)
+                raise MemoryError(f"Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
+            return self._create_from_memory(combined_df, batch_size, shuffle)
         else:
             return self._load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
 
@@ -329,26 +203,10 @@ class RecDataLoader(FeatureSpecMixin):
                               batch_size: int,
                               chunk_size: int,
                               shuffle: bool) -> DataLoader:
-        """
-        Create a streaming ``DataLoader`` that yields chunked tensors from files.
-
-        :param file_paths: Ordered list of file paths.
-        :param file_type: ``"csv"`` or ``"parquet"``.
-        :param batch_size: Batch size for the outer ``DataLoader``.
-        :param chunk_size: Number of rows per chunk when reading files.
-        :returns: Streaming ``DataLoader`` with custom ``collate_fn``.
-        """
-
         if shuffle:
-            logging.warning(colorize("Shuffle is ignored in streaming mode (IterableDataset).", "yellow"))
-
+            logging.warning("Shuffle is ignored in streaming mode (IterableDataset).")
         if batch_size != 1:
-            logging.warning(colorize(
-                "Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.",
-                "yellow",
-            ))
-            effective_batch_size = 1
-
+            logging.warning("Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
         dataset = FileDataset(
             file_paths=file_paths,
             dense_features=self.dense_features,
@@ -360,41 +218,24 @@ class RecDataLoader(FeatureSpecMixin):
             file_type=file_type,
             processor=self.processor
         )
-
-        return DataLoader(dataset, batch_size=effective_batch_size, collate_fn=collate_fn)
+        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
 
 def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
-    """
-    Normalize a raw sequence column into a padded int64 ``ndarray``.
-
-    :param column: Sequence column from DataFrame/dict; can be Series, list, or ndarray.
-    :param feature: Sequence feature definition providing ``max_len`` and optional ``padding_idx``.
-    :returns: 2-D numpy array (batch, seq_len) with dtype ``int64``.
-    """
     if isinstance(column, pd.Series):
         column = column.tolist()
-
     if isinstance(column, (list, tuple)):
         column = np.array(column, dtype=object)
-
     if not isinstance(column, np.ndarray):
         column = np.array([column], dtype=object)
-
     if column.ndim == 0:
         column = column.reshape(1)
-
     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(
-            f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
-        )
-
+        raise TypeError(f"Sequence feature '{feature.name}' expects numeric sequences; found string values.")
     if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
         sequences = []
         for seq in column:
             if isinstance(seq, str):
-                raise TypeError(
-                    f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
-                )
+                raise TypeError(f"Sequence feature '{feature.name}' expects numeric sequences; found string values.")
             if isinstance(seq, (list, tuple, np.ndarray)):
                 arr = np.asarray(seq, dtype=np.int64)
             else:
@@ -415,7 +256,6 @@ def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
-
     return np.asarray(column, dtype=np.int64)
 
 
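A small sketch of the normalization behaviour visible in the two hunks above; the padding of short sequences happens in code between these hunks and is assumed rather than shown (seq_feature stands for any SequenceFeature):

    # numeric nested sequences are padded to a common length and stacked
    out = _normalize_sequence_column([[1, 2], [3, 4, 5]], seq_feature)
    # out.dtype == np.int64 and out.ndim == 2
    # a flat numeric column is reshaped to one element per row
    flat = _normalize_sequence_column([7, 8, 9], seq_feature)   # expected shape (3, 1)
    # string values raise TypeError
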
@@ -424,67 +264,38 @@ def build_tensors_from_data( # noqa: C901
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
-    id_columns: list[str],
-    on_missing_feature: str = "raise",
-    ) -> tuple | None:
-    """
-    Shared routine to convert structured data into a tuple of tensors.
-
-    :param data: Preprocessed data (dict or DataFrame) used to fetch model inputs/labels.
-    :param raw_data: Original data, used for untouched ID columns.
-    :param features: Ordered list of feature definitions.
-    :param target_columns: Target/label column names.
-    :param id_columns: Extra ID column names to append at the end of the tensor tuple.
-    :param on_missing_feature: ``"warn"`` to skip missing feature with warning, ``"raise"`` to error.
-    :returns: Tuple of tensors following the order of ``features`` + targets (+ ids) or ``None`` if empty.
-    """
-    tensors: list[torch.Tensor] = []
-
+    id_columns: list[str]
+    ) -> dict | None:
+    feature_tensors: dict[str, torch.Tensor] = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
-            if on_missing_feature == "warn":
-                logging.warning(colorize(f"Feature column '{feature.name}' not found in data", "yellow"))
-                continue
-            raise AssertionError(f"Feature column {feature.name} not found in data.")
-
+            raise ValueError(f"Feature column '{feature.name}' not found in data")
         if isinstance(feature, SequenceFeature):
             tensor = torch.from_numpy(_normalize_sequence_column(column, feature))
         elif isinstance(feature, DenseFeature):
             tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
         else:
             tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
-
-        tensors.append(tensor)
-
-    label_tensors = []
+        feature_tensors[feature.name] = tensor
+    label_tensors = None
     if target_columns:
+        label_tensors = {}
         for target_name in target_columns:
             column = get_column_data(data, target_name)
-            assert column is not None, f"Target column '{target_name}' not found in data."
-
+            if column is None:
+                continue
             label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
-
-            if label_tensor.dim() == 1:
-                label_tensor = label_tensor.view(-1, 1)
-            elif label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+            if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                 label_tensor = label_tensor.t()
-
-            label_tensors.append(label_tensor)
-
-    if label_tensors:
-        if len(label_tensors) == 1 and label_tensors[0].shape[1] > 1:
-            y_tensor = label_tensors[0]
-        else:
-            y_tensor = torch.cat(label_tensors, dim=1)
-
-        if y_tensor.shape[1] == 1:
-            y_tensor = y_tensor.squeeze(1)
-
-        tensors.append(y_tensor)
-
+            if label_tensor.shape[1:] == (1,):
+                label_tensor = label_tensor.squeeze(1)
+            label_tensors[target_name] = label_tensor
+        if not label_tensors:
+            label_tensors = None
+    id_tensors = None
     if id_columns:
-        id_arrays = []
+        id_tensors = {}
         for id_col in id_columns:
             column = get_column_data(raw_data, id_col)
             if column is None:
@@ -494,20 +305,8 @@ def build_tensors_from_data( # noqa: C901
             try:
                 id_arr = np.asarray(column, dtype=np.int64)
             except Exception as exc:
-                raise TypeError(
-                    f"ID column '{id_col}' must contain numeric values. "
-                    f"Received dtype={np.asarray(column).dtype}, error: {exc}"
-                ) from exc
-            id_arrays.append(id_arr)
-
-        combined_ids = np.column_stack(id_arrays)
-        tensors.append(torch.from_numpy(combined_ids))
-
-    if not tensors:
+                raise TypeError(f"ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
+            id_tensors[id_col] = torch.from_numpy(id_arr)
+    if not feature_tensors:
         return None
-
-    return tuple(tensors)
-
-
-# Backward compatible alias
-_build_tensors_from_data = build_tensors_from_data
+    return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}
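
A sketch of how the new return value is consumed downstream, assuming a DataFrame df, a feature list features, and a single target column "label" configured as in RecDataLoader (all names illustrative):

    tensors = build_tensors_from_data(data=df, raw_data=df, features=features,
                                      target_columns=["label"], id_columns=[])
    # tensors == {
    #     "features": {feature.name: Tensor(N, ...) for each feature},
    #     "labels": {"label": Tensor(N)},   # None when no target column is present
    #     "ids": None,                      # dict of per-column tensors when id_columns are given
    # }
    dataset = TensorDictDataset(tensors)                                  # yields per-sample dicts
    loader = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)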