nextrec 0.3.6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +244 -113
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1373 -443
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +42 -24
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +303 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +106 -40
- nextrec/models/match/dssm.py +82 -69
- nextrec/models/match/dssm_v2.py +72 -58
- nextrec/models/match/mind.py +175 -108
- nextrec/models/match/sdm.py +104 -88
- nextrec/models/match/youtube_dnn.py +73 -60
- nextrec/models/multi_task/esmm.py +53 -39
- nextrec/models/multi_task/mmoe.py +70 -47
- nextrec/models/multi_task/ple.py +107 -50
- nextrec/models/multi_task/poso.py +121 -41
- nextrec/models/multi_task/share_bottom.py +54 -38
- nextrec/models/ranking/afm.py +172 -45
- nextrec/models/ranking/autoint.py +84 -61
- nextrec/models/ranking/dcn.py +59 -42
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +36 -26
- nextrec/models/ranking/dien.py +158 -102
- nextrec/models/ranking/din.py +88 -60
- nextrec/models/ranking/fibinet.py +55 -35
- nextrec/models/ranking/fm.py +32 -26
- nextrec/models/ranking/masknet.py +95 -34
- nextrec/models/ranking/pnn.py +34 -31
- nextrec/models/ranking/widedeep.py +37 -29
- nextrec/models/ranking/xdeepfm.py +63 -41
- nextrec/utils/__init__.py +61 -32
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +52 -12
- nextrec/utils/distributed.py +141 -0
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +531 -0
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.3.6.dist-info/RECORD +0 -64
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/data/dataloader.py
CHANGED
```diff
@@ -5,6 +5,7 @@ Date: create on 27/10/2025
 Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
+
 import os
 import torch
 import logging
@@ -15,59 +16,89 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import cast
 
-from
+from nextrec.basic.features import (
+    DenseFeature,
+    SparseFeature,
+    SequenceFeature,
+    FeatureSet,
+)
 from nextrec.data.preprocessor import DataProcessor
-from
+from torch.utils.data import DataLoader, Dataset, IterableDataset
 
-from nextrec.basic.loggers import colorize
-from nextrec.data.data_processing import get_column_data
-from nextrec.data.batch_utils import collate_fn
-from nextrec.utils.file import resolve_file_paths, read_table
 from nextrec.utils.tensor import to_tensor
+from nextrec.utils.file import resolve_file_paths, read_table
+from nextrec.data.batch_utils import collate_fn
+from nextrec.data.data_processing import get_column_data
+
 
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
+
     def __init__(self, tensors: dict):
         self.features = tensors.get("features", {})
         self.labels = tensors.get("labels")
         self.ids = tensors.get("ids")
         if not self.features:
-            raise ValueError(
+            raise ValueError(
+                "[TensorDictDataset Error] Dataset requires at least one feature tensor."
+            )
         lengths = [tensor.shape[0] for tensor in self.features.values()]
         if not lengths:
             raise ValueError("[TensorDictDataset Error] Feature tensors are empty.")
         self.length = lengths[0]
         for length in lengths[1:]:
             if length != self.length:
-                raise ValueError(
+                raise ValueError(
+                    "[TensorDictDataset Error] All feature tensors must have the same length."
+                )
+
     def __len__(self) -> int:
         return self.length
 
     def __getitem__(self, idx: int) -> dict:
         sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
-        sample_labels =
-
+        sample_labels = (
+            {name: tensor[idx] for name, tensor in self.labels.items()}
+            if self.labels
+            else None
+        )
+        sample_ids = (
+            {name: tensor[idx] for name, tensor in self.ids.items()}
+            if self.ids
+            else None
+        )
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
+
 class FileDataset(FeatureSet, IterableDataset):
-    def __init__(
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+        dense_features: list[DenseFeature],  # dense feature definitions
+        sparse_features: list[SparseFeature],  # sparse feature definitions
+        sequence_features: list[SequenceFeature],  # sequence feature definitions
+        target_columns: list[str],  # target column names
+        id_columns: (
+            list[str] | None
+        ) = None,  # id columns to carry through (not used for model inputs)
+        chunk_size: int = 10000,
+        file_type: str = "csv",
+        processor: DataProcessor | None = None,
+    ):  # optional DataProcessor for transformation
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.set_all_features(
+        self.set_all_features(
+            dense_features,
+            sparse_features,
+            sequence_features,
+            target_columns,
+            id_columns,
+        )
         self.current_file_index = 0
         self.total_files = len(file_paths)
-
+
     def __iter__(self):
         self.current_file_index = 0
         for file_path in self.file_paths:
@@ -75,93 +106,212 @@ class FileDataset(FeatureSet, IterableDataset):
             if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
-            if self.file_type ==
+            if self.file_type == "csv":
                 yield from self.read_csv_chunks(file_path)
-            elif self.file_type ==
+            elif self.file_type == "parquet":
                 yield from self.read_parquet_chunks(file_path)
-
+
     def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
-
+
     def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
+            chunk = batch.to_pandas()
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk
-
+
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
+                raise ValueError(
+                    "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+                )
             transformed_data = self.processor.transform(df, return_dict=True)
         else:
             transformed_data = df
         if isinstance(transformed_data, list):
-            raise TypeError(
+            raise TypeError(
+                "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+            )
         safe_data = cast(dict | pd.DataFrame, transformed_data)
-        batch = build_tensors_from_data(
+        batch = build_tensors_from_data(
+            data=safe_data,
+            raw_data=df,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if batch is not None:
             batch["_already_batched"] = True
         return batch
 
 
 class RecDataLoader(FeatureSet):
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | None | str = None,
+        id_columns: str | list[str] | None = None,
+        processor: DataProcessor | None = None,
+    ):
+        """
+        RecDataLoader is a unified dataloader for supporting in-memory and streaming data.
+        Basemodel will accept RecDataLoader to create dataloaders for training/evaluation/prediction.
+
+        Args:
+            dense_features: list of DenseFeature definitions
+            sparse_features: list of SparseFeature definitions
+            sequence_features: list of SequenceFeature definitions
+            target: target column name(s), e.g. 'label' or ['ctr', 'ctcvr']
+            id_columns: id column name(s) to carry through (not used for model inputs), e.g. 'user_id' or ['user_id', 'item_id']
+            processor: an instance of DataProcessor, if provided, will be used to transform data before creating tensors.
+        """
         self.processor = processor
-        self.set_all_features(
+        self.set_all_features(
+            dense_features, sparse_features, sequence_features, target, id_columns
+        )
+
+    def create_dataloader(
+        self,
+        data: (
+            dict
+            | pd.DataFrame
+            | str
+            | os.PathLike
+            | list[str]
+            | list[os.PathLike]
+            | DataLoader
+        ),
+        batch_size: int = 32,
+        shuffle: bool = True,
+        load_full: bool = True,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
+        """
+        Create a DataLoader from various data sources.
+
+        Args:
+            data: Data source, can be a dict, pd.DataFrame, file path (str), or existing DataLoader.
+            batch_size: Batch size for DataLoader.
+            shuffle: Whether to shuffle the data (ignored in streaming mode).
+            load_full: If True, load full data into memory; if False, use streaming mode for large files.
+            chunk_size: Chunk size for streaming mode (number of rows per chunk).
+            num_workers: Number of worker processes for data loading.
+            sampler: Optional sampler for DataLoader, only used for distributed training.
+        Returns:
+            DataLoader instance.
+        """
 
-    def create_dataloader(self,
-                          data: dict | pd.DataFrame | str | DataLoader,
-                          batch_size: int = 32,
-                          shuffle: bool = True,
-                          load_full: bool = True,
-                          chunk_size: int = 10000,
-                          num_workers: int = 0) -> DataLoader:
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
+        elif (
+            isinstance(data, list)
+            and data
+            and all(isinstance(p, (str, os.PathLike)) for p in data)
+        ):
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.create_from_memory(
+            return self.create_from_memory(
+                data=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                num_workers=num_workers,
+                sampler=sampler,
+            )
         else:
-            raise ValueError(
-
-
-
-
-
-
+            raise ValueError(
+                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
+            )
+
+    def create_from_memory(
+        self,
+        data: dict | pd.DataFrame,
+        batch_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
+
         raw_data = data
 
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
-
-
+                raise ValueError(
+                    "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
+                )
+            data = self.processor.transform(data, return_dict=True)  # type: ignore
+        tensors = build_tensors_from_data(
+            data=data,
+            raw_data=raw_data,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if tensors is None:
-            raise ValueError(
+            raise ValueError(
+                "[RecDataLoader Error] No valid tensors could be built from the provided data."
+            )
         dataset = TensorDictDataset(tensors)
-        return DataLoader(
-
-
-
-
-
-
-
-
-
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False if sampler is not None else shuffle,
+            sampler=sampler,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+        )
+
+    def create_from_path(
+        self,
+        path: str | os.PathLike | list[str] | list[os.PathLike],
+        batch_size: int,
+        shuffle: bool,
+        load_full: bool,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+    ) -> DataLoader:
+        if isinstance(path, (str, os.PathLike)):
+            file_paths, file_type = resolve_file_paths(str(Path(path)))
+        else:
+            file_paths = [str(Path(p)) for p in path]
+            if not file_paths:
+                raise ValueError("[RecDataLoader Error] Empty file path list provided.")
+            suffixes = {Path(p).suffix.lower() for p in file_paths}
+            if len(suffixes) != 1:
+                raise ValueError(
+                    "[RecDataLoader Error] Mixed file types in provided list; please use only CSV or only Parquet."
+                )
+            suffix = suffixes.pop()
+            if suffix not in {".csv", ".parquet"}:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
+                )
+            file_type = "csv" if suffix == ".csv" else "parquet"
         # Load full data into memory
         if load_full:
             dfs = []
@@ -175,28 +325,60 @@ class RecDataLoader(FeatureSet):
                     df = read_table(file_path, file_type=file_type)
                     dfs.append(df)
                 except MemoryError as exc:
-                    raise MemoryError(
+                    raise MemoryError(
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming."
+                    ) from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
-                raise MemoryError(
-
+                raise MemoryError(
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size."
+                ) from exc
+            return self.create_from_memory(
+                combined_df, batch_size, shuffle, num_workers=num_workers
+            )
         else:
-            return self.load_files_streaming(
+            return self.load_files_streaming(
+                file_paths,
+                file_type,
+                batch_size,
+                chunk_size,
+                shuffle,
+                num_workers=num_workers,
+            )
 
-    def load_files_streaming(
-
-
-
-
-
-
+    def load_files_streaming(
+        self,
+        file_paths: list[str],
+        file_type: str,
+        batch_size: int,
+        chunk_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+    ) -> DataLoader:
         if shuffle:
-            logging.info(
+            logging.info(
+                "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
+            )
         if batch_size != 1:
-            logging.info(
-
-
+            logging.info(
+                "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
+            )
+        dataset = FileDataset(
+            file_paths=file_paths,
+            dense_features=self.dense_features,
+            sparse_features=self.sparse_features,
+            sequence_features=self.sequence_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            chunk_size=chunk_size,
+            file_type=file_type,
+            processor=self.processor,
+        )
+        return DataLoader(
+            dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers
+        )
+
 
 def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
@@ -208,12 +390,20 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if column.ndim == 0:
         column = column.reshape(1)
     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(
-
+        raise TypeError(
+            f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+        )
+    if (
+        column.dtype == object
+        and len(column) > 0
+        and isinstance(column[0], (list, tuple, np.ndarray))
+    ):
         sequences = []
         for seq in column:
             if isinstance(seq, str):
-                raise TypeError(
+                raise TypeError(
+                    f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                )
             if isinstance(seq, (list, tuple, np.ndarray)):
                 arr = np.asarray(seq, dtype=np.int64)
             else:
@@ -228,25 +418,32 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
             if len(seq) > max_len:
                 padded.append(seq[:max_len])
             else:
-                padded.append(
+                padded.append(
+                    np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+                )
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)
 
-
+
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
    features: list,
     target_columns: list[str],
-    id_columns: list[str]
+    id_columns: list[str],
 ) -> dict | None:
     feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
-            raise ValueError(
-
+            raise ValueError(
+                f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
+            )
+        if isinstance(
+            feature, SequenceFeature
+        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
            arr = normalize_sequence_column(column, feature)
            tensor = to_tensor(arr, dtype=torch.long)
        elif isinstance(feature, DenseFeature):
@@ -263,8 +460,14 @@ def build_tensors_from_data(
         column = get_column_data(data, target_name)
         if column is None:
             continue
-        label_tensor = to_tensor(
-
+        label_tensor = to_tensor(
+            np.asarray(column, dtype=np.float32), dtype=torch.float32
+        )
+        if (
+            label_tensor.dim() == 2
+            and label_tensor.shape[0] == 1
+            and label_tensor.shape[1] > 1
+        ):
             label_tensor = label_tensor.t()
         if label_tensor.shape[1:] == (1,):
             label_tensor = label_tensor.squeeze(1)
@@ -279,11 +482,15 @@ def build_tensors_from_data(
         if column is None:
             column = get_column_data(data, id_col)
         if column is None:
-            raise KeyError(
+            raise KeyError(
+                f"[RecDataLoader Error] ID column '{id_col}' not found in provided data."
+            )
         try:
             id_arr = np.asarray(column, dtype=np.int64)
         except Exception as exc:
-            raise TypeError(
+            raise TypeError(
+                f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}"
+            ) from exc
         id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None
```