nextrec-0.2.4-py3-none-any.whl → nextrec-0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/features.py +5 -1
- nextrec/basic/layers.py +3 -7
- nextrec/basic/model.py +495 -664
- nextrec/data/data_utils.py +44 -12
- nextrec/data/dataloader.py +84 -285
- nextrec/data/preprocessor.py +91 -213
- nextrec/loss/__init__.py +0 -1
- nextrec/loss/loss_utils.py +51 -120
- nextrec/models/multi_task/esmm.py +1 -1
- nextrec/models/ranking/masknet.py +1 -1
- nextrec/utils/__init__.py +4 -1
- nextrec/utils/common.py +16 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/METADATA +2 -2
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/RECORD +17 -16
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/WHEEL +0 -0
- {nextrec-0.2.4.dist-info → nextrec-0.2.5.dist-info}/licenses/LICENSE +0 -0
nextrec/data/data_utils.py
CHANGED
@@ -6,32 +6,64 @@ import pandas as pd
 import pyarrow.parquet as pq
 from pathlib import Path
 
+
+def _stack_section(batch: list[dict], section: str):
+    """Stack one section of the batch (features/labels/ids)."""
+    entries = [item.get(section) for item in batch if item.get(section) is not None]
+    if not entries:
+        return None
+    merged: dict = {}
+    for name in entries[0]:
+        tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
+        merged[name] = torch.stack(tensors, dim=0)
+    return merged
+
+
 def collate_fn(batch):
-    """
+    """
+    Collate a list of sample dicts into the unified batch format:
+    {
+        "features": {name: Tensor(B, ...)},
+        "labels": {target: Tensor(B, ...)} or None,
+        "ids": {id_name: Tensor(B, ...)} or None,
+    }
+    """
     if not batch:
-        return
-
-
+        return {"features": {}, "labels": None, "ids": None}
+
+    first = batch[0]
+    if isinstance(first, dict) and "features" in first:
+        # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
+        if first.get("_already_batched") and len(batch) == 1:
+            return {
+                "features": first.get("features", {}),
+                "labels": first.get("labels"),
+                "ids": first.get("ids"),
+            }
+        return {
+            "features": _stack_section(batch, "features") or {},
+            "labels": _stack_section(batch, "labels"),
+            "ids": _stack_section(batch, "ids"),
+        }
+
+    # Fallback: stack tuples/lists of tensors
+    num_tensors = len(first)
     result = []
-
     for i in range(num_tensors):
         tensor_list = [item[i] for item in batch]
-
-
-        if isinstance(first, torch.Tensor):
+        first_item = tensor_list[0]
+        if isinstance(first_item, torch.Tensor):
            stacked = torch.cat(tensor_list, dim=0)
-        elif isinstance(
+        elif isinstance(first_item, np.ndarray):
            stacked = np.concatenate(tensor_list, axis=0)
-        elif isinstance(
+        elif isinstance(first_item, list):
            combined = []
            for entry in tensor_list:
                combined.extend(entry)
            stacked = combined
        else:
            stacked = tensor_list
-
        result.append(stacked)
-
     return tuple(result)
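The reworked `collate_fn` now consumes per-sample dicts and returns the `features` / `labels` / `ids` batch schema described in its new docstring. Below is a minimal sketch of that contract; it assumes `collate_fn` is importable from `nextrec.data.data_utils`, and the feature names and values are made up for illustration.

```python
import torch
from nextrec.data.data_utils import collate_fn  # assumed import path

# Two per-sample dicts in the unified schema; "age"/"item_id"/"label" are illustrative names.
samples = [
    {
        "features": {"age": torch.tensor(0.5), "item_id": torch.tensor(42)},
        "labels": {"label": torch.tensor(1.0)},
        "ids": None,
    },
    {
        "features": {"age": torch.tensor(0.1), "item_id": torch.tensor(7)},
        "labels": {"label": torch.tensor(0.0)},
        "ids": None,
    },
]

batch = collate_fn(samples)
print(batch["features"]["item_id"].shape)  # torch.Size([2]) -- per-sample tensors stacked along dim 0
print(batch["labels"]["label"])            # tensor([1., 0.])
print(batch["ids"])                        # None when no id columns are configured
```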
nextrec/data/dataloader.py
CHANGED
@@ -2,6 +2,7 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
+Update: 25/11/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 import os
@@ -15,7 +16,7 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import Iterator, Literal, Union, Optional
 
-from torch.utils.data import DataLoader,
+from torch.utils.data import DataLoader, Dataset, IterableDataset
 from nextrec.data.preprocessor import DataProcessor
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
 
@@ -27,27 +28,33 @@ from nextrec.data import (
     read_table,
 )
 
+class TensorDictDataset(Dataset):
+    """Dataset returning sample-level dicts matching the unified batch schema."""
+    def __init__(self, tensors: dict):
+        self.features = tensors.get("features", {})
+        self.labels = tensors.get("labels")
+        self.ids = tensors.get("ids")
+        if not self.features:
+            raise ValueError("Dataset requires at least one feature tensor.")
+        lengths = [tensor.shape[0] for tensor in self.features.values()]
+        if not lengths:
+            raise ValueError("Feature tensors are empty.")
+        self.length = lengths[0]
+        for length in lengths[1:]:
+            if length != self.length:
+                raise ValueError("All feature tensors must have the same length.")
+
+    def __len__(self) -> int:
+        return self.length
+
+    def __getitem__(self, idx: int) -> dict:
+        sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
+        sample_labels = {name: tensor[idx] for name, tensor in self.labels.items()} if self.labels else None
+        sample_ids = {name: tensor[idx] for name, tensor in self.ids.items()} if self.ids else None
+        return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
-class FileDataset(FeatureSpecMixin, IterableDataset):
-    """
-    Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
-
-    :param file_paths: Absolute or relative paths to CSV/Parquet files.
-    :param dense_features: Dense feature definitions (float tensors).
-    :param sparse_features: Sparse/categorical feature definitions (int tensors).
-    :param sequence_features: Sequence feature definitions (padded int tensors).
-    :param target_columns: Label/target column names.
-    :param id_columns: Optional ID columns appended after targets.
-    :param chunk_size: Number of rows to read per chunk.
-    :param file_type: ``"csv"`` or ``"parquet"``.
-    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for online transform.
 
-
-    ------
-    tuple
-        Tensors ordered as ``dense + sparse + sequence + targets (+ ids)``. Shape respects chunk size.
-    """
-
+class FileDataset(FeatureSpecMixin, IterableDataset):
     def __init__(self,
                  file_paths: list[str], # file paths to read, containing CSV or Parquet files
                  dense_features: list[DenseFeature], # dense feature definitions
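The `TensorDictDataset` added in the hunk above pairs with the new `collate_fn`: each `__getitem__` returns one sample in the same dict schema, so a standard PyTorch `DataLoader` can batch it. A hedged sketch of that flow; the import path `nextrec.data.dataloader` and the toy tensors are assumptions.

```python
import torch
from nextrec.data.dataloader import TensorDictDataset  # assumed import path

# Toy tensors in the dict layout produced by build_tensors_from_data.
tensors = {
    "features": {"user_id": torch.arange(4), "price": torch.rand(4)},
    "labels": {"click": torch.tensor([0.0, 1.0, 0.0, 1.0])},
    "ids": None,
}

dataset = TensorDictDataset(tensors)
assert len(dataset) == 4

sample = dataset[2]
# Each sample mirrors the batch schema, but holds per-row tensors.
print(sample["features"]["user_id"])  # tensor(2)
print(sample["labels"]["click"])      # tensor(0.)
print(sample["ids"])                  # None
```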
@@ -58,95 +65,48 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
                 chunk_size: int = 10000,
                 file_type: str = 'csv',
                 processor: DataProcessor | None = None): # optional DataProcessor for transformation
-        """
-        Initialize a streaming dataset backed by on-disk files.
-        """
-
        self.file_paths = file_paths
        self.chunk_size = chunk_size
        self.file_type = file_type
        self.processor = processor
-
-        self._set_feature_config(dense_features, sparse_features, sequence_features)
-        self._set_target_config(target_columns, id_columns or [])
+        self._set_feature_config(dense_features, sparse_features, sequence_features, target_columns, id_columns)
        self.current_file_index = 0
        self.total_files = len(file_paths)
 
-    def __iter__(self)
-        """
-        Iterate over files and stream tensor tuples chunk by chunk.
-
-        Files are processed sequentially; each chunk is transformed (optionally via
-        ``processor``) and converted to tensors before being yielded to PyTorch ``DataLoader``.
-        """
+    def __iter__(self):
        self.current_file_index = 0
        self._file_pbar = None
-
-        # Create progress bar for file processing when multiple files
        if self.total_files > 1:
-            self._file_pbar = tqdm.tqdm(
-                total=self.total_files,
-                desc="Files",
-                unit="file",
-                position=0,
-                leave=True,
-                bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
-            )
-
+            self._file_pbar = tqdm.tqdm(total=self.total_files, desc="Files", unit="file", position=0, leave=True, bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')
        for file_path in self.file_paths:
            self.current_file_index += 1
-
            if self._file_pbar is not None:
                self._file_pbar.update(1)
            elif self.total_files == 1:
                file_name = os.path.basename(file_path)
-                logging.info(
-
+                logging.info(f"Processing file: {file_name}")
            if self.file_type == 'csv':
                yield from self._read_csv_chunks(file_path)
            elif self.file_type == 'parquet':
                yield from self._read_parquet_chunks(file_path)
-
        if self._file_pbar is not None:
            self._file_pbar.close()
 
-    def _read_csv_chunks(self, file_path: str)
-        """
-        Stream a CSV file chunk by chunk.
-
-        :param file_path: Path to the CSV file.
-        :yields: Tensor tuples for each chunk.
-        """
+    def _read_csv_chunks(self, file_path: str):
        chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-
        for chunk in chunk_iterator:
            tensors = self._dataframe_to_tensors(chunk)
-
-            yield tensors
+            yield tensors
 
-    def _read_parquet_chunks(self, file_path: str)
-        """
-        Stream a Parquet file via ``pyarrow`` batch reading.
-
-        :param file_path: Path to the Parquet file.
-        :yields: Tensor tuples for each batch.
-        """
-
+    def _read_parquet_chunks(self, file_path: str):
        parquet_file = pq.ParquetFile(file_path)
        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
            chunk = batch.to_pandas()
            tensors = self._dataframe_to_tensors(chunk)
-
-            yield tensors
+            yield tensors
            del chunk
 
-    def _dataframe_to_tensors(self, df: pd.DataFrame) ->
-        """
-        Convert a DataFrame chunk into a tuple of tensors respecting feature order.
-
-        :param df: DataFrame chunk.
-        :returns: Tuple of tensors (features + targets + ids) or ``None`` if no tensors created.
-        """
+    def _dataframe_to_tensors(self, df: pd.DataFrame) -> dict | None:
        if self.processor is not None:
            if not self.processor.is_fitted:
                raise ValueError("DataProcessor must be fitted before using in streaming mode")
@@ -154,44 +114,19 @@ class FileDataset(FeatureSpecMixin, IterableDataset):
        else:
            transformed_data = df
 
-
+        batch = build_tensors_from_data(
            data=transformed_data,
            raw_data=df,
            features=self.all_features,
            target_columns=self.target_columns,
            id_columns=self.id_columns,
-            on_missing_feature="raise",
        )
+        if batch is not None:
+            batch["_already_batched"] = True
+        return batch
 
 
 class RecDataLoader(FeatureSpecMixin):
-    """
-    Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
-
-    :param dense_features: Dense feature definitions (float tensors).
-    :param sparse_features: Sparse/categorical feature definitions (int tensors).
-    :param sequence_features: Sequence feature definitions (padded int tensors).
-    :param target: Target column name(s); string or list.
-    :param id_columns: Optional ID column name(s) appended after targets.
-    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for preprocessing.
-
-    Examples
-    --------
-    >>> loader = RecDataLoader(
-    ...     dense_features=dense_features,
-    ...     sparse_features=sparse_features,
-    ...     sequence_features=sequence_features,
-    ...     target=['label'],
-    ...     processor=processor,
-    ... )
-    >>> dataloader = loader.create_dataloader(
-    ...     data="/path/to/data.csv",
-    ...     batch_size=1024,
-    ...     load_full=False,
-    ...     chunk_size=20000,
-    ... )
-    """
-
    def __init__(self,
                 dense_features: list[DenseFeature] | None = None,
                 sparse_features: list[SparseFeature] | None = None,
@@ -199,20 +134,8 @@ class RecDataLoader(FeatureSpecMixin):
                 target: list[str] | None | str = None,
                 id_columns: str | list[str] | None = None,
                 processor: Optional['DataProcessor'] = None):
-        """
-        Initialize the loader with feature/target definitions.
-
-        :param dense_features: Dense feature definitions (float).
-        :param sparse_features: Sparse feature definitions (int).
-        :param sequence_features: Sequence feature definitions (int, padded).
-        :param target: Single target name or list of names.
-        :param id_columns: Optional ID columns to append in output.
-        :param processor: Optional fitted ``DataProcessor`` for preprocessing.
-        """
-
        self.processor = processor
-        self._set_feature_config(dense_features, sparse_features, sequence_features)
-        self._set_target_config(target, id_columns)
+        self._set_feature_config(dense_features, sparse_features, sequence_features, target, id_columns)
 
    def create_dataloader(self,
                          data: Union[dict, pd.DataFrame, str, DataLoader],
@@ -220,79 +143,38 @@ class RecDataLoader(FeatureSpecMixin):
                          shuffle: bool = True,
                          load_full: bool = True,
                          chunk_size: int = 10000) -> DataLoader:
-        """
-        Build a ``DataLoader`` from in-memory data, file path, or an existing loader.
-
-        :param data: Dict/DataFrame (in-memory), path to CSV/Parquet file/dir, or an existing ``DataLoader``.
-        :param batch_size: Batch size for the returned ``DataLoader``.
-        :param shuffle: Shuffle flag passed to PyTorch ``DataLoader`` (for in-memory and streaming batches).
-        :param load_full: If ``True``, load all files into memory; if ``False``, stream with chunks.
-        :param chunk_size: Number of rows per chunk when ``load_full=False``.
-        :returns: A configured PyTorch ``DataLoader``.
-        """
        if isinstance(data, DataLoader):
            return data
-
-
-
-
-
-
-
-        raise ValueError(f"Unsupported data type: {type(data)}")
+        elif isinstance(data, (str, os.PathLike)):
+            return self._create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
+        elif isinstance(data, (dict, pd.DataFrame)):
+            return self._create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
+        else:
+            raise ValueError(f"Unsupported data type: {type(data)}")
 
    def _create_from_memory(self,
                            data: Union[dict, pd.DataFrame],
                            batch_size: int,
                            shuffle: bool) -> DataLoader:
-        """
-        Convert in-memory data (dict/DataFrame) into tensors and wrap with ``DataLoader``.
-
-        :param data: Dict or DataFrame containing feature/target columns.
-        :param batch_size: Batch size.
-        :param shuffle: Whether to shuffle batches.
-        :returns: A ``DataLoader`` backed by ``TensorDataset``.
-        """
-
        raw_data = data
 
        if self.processor is not None:
-
+            if not self.processor.is_fitted:
+                raise ValueError("DataProcessor must be fitted before transforming data in memory")
            data = self.processor.transform(data, return_dict=True)
-
-        tensors
-            data
-
-
-            target_columns=self.target_columns,
-            id_columns=self.id_columns,
-            on_missing_feature="raise",
-        )
-
-        assert tensors is not None, "No tensors were created from provided data."
-
-        dataset = TensorDataset(*tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+        tensors = build_tensors_from_data(data=data,raw_data=raw_data, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns,)
+        if tensors is None:
+            raise ValueError("No valid tensors could be built from the provided data.")
+        dataset = TensorDictDataset(tensors)
+        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
 
    def _create_from_path(self,
                          path: str,
                          batch_size: int,
                          shuffle: bool,
                          load_full: bool,
-                          chunk_size: int) -> DataLoader:
-        """
-        Build a ``DataLoader`` from a CSV/Parquet file or directory.
-
-        :param path: File path or directory containing homogeneous CSV/Parquet files.
-        :param batch_size: Batch size.
-        :param shuffle: Shuffle flag.
-        :param load_full: If ``True``, load all rows into memory; otherwise stream.
-        :param chunk_size: Chunk rows when streaming.
-        :returns: A ``DataLoader`` (in-memory or streaming).
-        """
-
+                          chunk_size: int = 10000) -> DataLoader:
        file_paths, file_type = resolve_file_paths(str(Path(path)))
-
        # Load full data into memory
        if load_full:
            dfs = []
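The constructor and `create_dataloader` dispatch above keep the same call pattern as the example that was removed from the old docstring; what changes is that every batch now arrives as a dict. A usage sketch based on that removed example; `dense_features`, `sparse_features`, `sequence_features`, `processor`, and the file path are placeholders assumed to be defined elsewhere.

```python
from nextrec.data.dataloader import RecDataLoader  # assumed import path

# dense_features / sparse_features / sequence_features / processor are placeholders,
# as in the example from the previous docstring.
loader = RecDataLoader(
    dense_features=dense_features,
    sparse_features=sparse_features,
    sequence_features=sequence_features,
    target=['label'],
    processor=processor,
)
dataloader = loader.create_dataloader(
    data="/path/to/data.csv",
    batch_size=1024,
    load_full=False,   # stream the file in chunks instead of loading it fully
    chunk_size=20000,
)
for batch in dataloader:
    # Batches now follow the unified dict schema instead of positional tuples.
    features, labels = batch["features"], batch["labels"]
    break
```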
@@ -306,20 +188,12 @@ class RecDataLoader(FeatureSpecMixin):
                    df = read_table(file_path, file_type)
                    dfs.append(df)
                except MemoryError as exc:
-                    raise MemoryError(
-                        f"Out of memory while reading {file_path}. "
-                        f"Consider using load_full=False with streaming."
-                    ) from exc
-
+                    raise MemoryError(f"Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
            try:
                combined_df = pd.concat(dfs, ignore_index=True)
            except MemoryError as exc:
-                raise MemoryError(
-
-                    f"Use load_full=False to stream or reduce chunk_size."
-                ) from exc
-
-            return self._create_from_memory(combined_df, batch_size, shuffle)
+                raise MemoryError(f"Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
+            return self._create_from_memory(combined_df, batch_size, shuffle,)
        else:
            return self._load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
 
@@ -329,26 +203,10 @@ class RecDataLoader(FeatureSpecMixin):
                              batch_size: int,
                              chunk_size: int,
                              shuffle: bool) -> DataLoader:
-        """
-        Create a streaming ``DataLoader`` that yields chunked tensors from files.
-
-        :param file_paths: Ordered list of file paths.
-        :param file_type: ``"csv"`` or ``"parquet"``.
-        :param batch_size: Batch size for the outer ``DataLoader``.
-        :param chunk_size: Number of rows per chunk when reading files.
-        :returns: Streaming ``DataLoader`` with custom ``collate_fn``.
-        """
-
        if shuffle:
-            logging.warning(
-
+            logging.warning("Shuffle is ignored in streaming mode (IterableDataset).")
        if batch_size != 1:
-            logging.warning(
-                "Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.",
-                "yellow",
-            ))
-            effective_batch_size = 1
-
+            logging.warning("Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
        dataset = FileDataset(
            file_paths=file_paths,
            dense_features=self.dense_features,
@@ -360,41 +218,24 @@ class RecDataLoader(FeatureSpecMixin):
            file_type=file_type,
            processor=self.processor
        )
-
-        return DataLoader(dataset, batch_size=effective_batch_size, collate_fn=collate_fn)
+        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
 
 def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
-    """
-    Normalize a raw sequence column into a padded int64 ``ndarray``.
-
-    :param column: Sequence column from DataFrame/dict; can be Series, list, or ndarray.
-    :param feature: Sequence feature definition providing ``max_len`` and optional ``padding_idx``.
-    :returns: 2-D numpy array (batch, seq_len) with dtype ``int64``.
-    """
    if isinstance(column, pd.Series):
        column = column.tolist()
-
    if isinstance(column, (list, tuple)):
        column = np.array(column, dtype=object)
-
    if not isinstance(column, np.ndarray):
        column = np.array([column], dtype=object)
-
    if column.ndim == 0:
        column = column.reshape(1)
-
    if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(
-            f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
-        )
-
+        raise TypeError(f"Sequence feature '{feature.name}' expects numeric sequences; found string values.")
    if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
        sequences = []
        for seq in column:
            if isinstance(seq, str):
-                raise TypeError(
-                    f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
-                )
+                raise TypeError(f"Sequence feature '{feature.name}' expects numeric sequences; found string values.")
            if isinstance(seq, (list, tuple, np.ndarray)):
                arr = np.asarray(seq, dtype=np.int64)
            else:
@@ -415,7 +256,6 @@ def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
        column = np.stack(padded)
    elif column.ndim == 1:
        column = column.reshape(-1, 1)
-
    return np.asarray(column, dtype=np.int64)
 
 
@@ -424,67 +264,38 @@ def build_tensors_from_data( # noqa: C901
        raw_data: dict | pd.DataFrame,
        features: list,
        target_columns: list[str],
-        id_columns: list[str]
-
-
-    """
-    Shared routine to convert structured data into a tuple of tensors.
-
-    :param data: Preprocessed data (dict or DataFrame) used to fetch model inputs/labels.
-    :param raw_data: Original data, used for untouched ID columns.
-    :param features: Ordered list of feature definitions.
-    :param target_columns: Target/label column names.
-    :param id_columns: Extra ID column names to append at the end of the tensor tuple.
-    :param on_missing_feature: ``"warn"`` to skip missing feature with warning, ``"raise"`` to error.
-    :returns: Tuple of tensors following the order of ``features`` + targets (+ ids) or ``None`` if empty.
-    """
-    tensors: list[torch.Tensor] = []
-
+        id_columns: list[str]
+) -> dict | None:
+    feature_tensors: dict[str, torch.Tensor] = {}
    for feature in features:
        column = get_column_data(data, feature.name)
        if column is None:
-
-            logging.warning(colorize(f"Feature column '{feature.name}' not found in data", "yellow"))
-            continue
-            raise AssertionError(f"Feature column {feature.name} not found in data.")
-
+            raise ValueError(f"Feature column '{feature.name}' not found in data")
        if isinstance(feature, SequenceFeature):
            tensor = torch.from_numpy(_normalize_sequence_column(column, feature))
        elif isinstance(feature, DenseFeature):
            tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
        else:
            tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
-
-
-
-    label_tensors = []
+        feature_tensors[feature.name] = tensor
+    label_tensors = None
    if target_columns:
+        label_tensors = {}
        for target_name in target_columns:
            column = get_column_data(data, target_name)
-
-
+            if column is None:
+                continue
            label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
-
-            if label_tensor.dim() == 1:
-                label_tensor = label_tensor.view(-1, 1)
-            elif label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+            if label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
                label_tensor = label_tensor.t()
-
-
-
-
-
-
-        else:
-            y_tensor = torch.cat(label_tensors, dim=1)
-
-            if y_tensor.shape[1] == 1:
-                y_tensor = y_tensor.squeeze(1)
-
-            tensors.append(y_tensor)
-
+            if label_tensor.shape[1:] == (1,):
+                label_tensor = label_tensor.squeeze(1)
+            label_tensors[target_name] = label_tensor
+        if not label_tensors:
+            label_tensors = None
+    id_tensors = None
    if id_columns:
-
+        id_tensors = {}
        for id_col in id_columns:
            column = get_column_data(raw_data, id_col)
            if column is None:
@@ -494,20 +305,8 @@ def build_tensors_from_data( # noqa: C901
            try:
                id_arr = np.asarray(column, dtype=np.int64)
            except Exception as exc:
-                raise TypeError(
-
-
-                ) from exc
-                id_arrays.append(id_arr)
-
-        combined_ids = np.column_stack(id_arrays)
-        tensors.append(torch.from_numpy(combined_ids))
-
-    if not tensors:
+                raise TypeError( f"ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}") from exc
+            id_tensors[id_col] = torch.from_numpy(id_arr)
+    if not feature_tensors:
        return None
-
-    return tuple(tensors)
-
-
-# Backward compatible alias
-_build_tensors_from_data = build_tensors_from_data
+    return {"features": feature_tensors, "labels": label_tensors, "ids": id_tensors}