nextrec-0.4.1-py3-none-any.whl → nextrec-0.4.3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry, and is provided for informational purposes only.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +250 -112
- nextrec/basic/loggers.py +63 -44
- nextrec/basic/metrics.py +270 -120
- nextrec/basic/model.py +1084 -402
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +492 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +273 -96
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +69 -46
- nextrec/models/multi_task/mmoe.py +91 -53
- nextrec/models/multi_task/ple.py +117 -58
- nextrec/models/multi_task/poso.py +163 -55
- nextrec/models/multi_task/share_bottom.py +63 -36
- nextrec/models/ranking/afm.py +80 -45
- nextrec/models/ranking/autoint.py +74 -57
- nextrec/models/ranking/dcn.py +110 -48
- nextrec/models/ranking/dcn_v2.py +265 -45
- nextrec/models/ranking/deepfm.py +39 -24
- nextrec/models/ranking/dien.py +335 -146
- nextrec/models/ranking/din.py +158 -92
- nextrec/models/ranking/fibinet.py +134 -52
- nextrec/models/ranking/fm.py +68 -26
- nextrec/models/ranking/masknet.py +95 -33
- nextrec/models/ranking/pnn.py +128 -58
- nextrec/models/ranking/widedeep.py +40 -28
- nextrec/models/ranking/xdeepfm.py +67 -40
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +496 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +33 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/model.py +22 -0
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/METADATA +53 -24
- nextrec-0.4.3.dist-info/RECORD +69 -0
- nextrec-0.4.3.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.3.dist-info}/licenses/LICENSE +0 -0
nextrec/data/dataloader.py
CHANGED
@@ -5,6 +5,7 @@ Date: create on 27/10/2025
 Checkpoint: edit on 02/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
+
 import os
 import torch
 import logging
@@ -15,8 +16,12 @@ import pyarrow.parquet as pq
 from pathlib import Path
 from typing import cast
 
-from nextrec.basic.
-
+from nextrec.basic.features import (
+    DenseFeature,
+    SparseFeature,
+    SequenceFeature,
+    FeatureSet,
+)
 from nextrec.data.preprocessor import DataProcessor
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 
@@ -25,49 +30,75 @@ from nextrec.utils.file import resolve_file_paths, read_table
 from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 
+
 class TensorDictDataset(Dataset):
     """Dataset returning sample-level dicts matching the unified batch schema."""
+
     def __init__(self, tensors: dict):
         self.features = tensors.get("features", {})
         self.labels = tensors.get("labels")
         self.ids = tensors.get("ids")
         if not self.features:
-            raise ValueError(
+            raise ValueError(
+                "[TensorDictDataset Error] Dataset requires at least one feature tensor."
+            )
         lengths = [tensor.shape[0] for tensor in self.features.values()]
         if not lengths:
             raise ValueError("[TensorDictDataset Error] Feature tensors are empty.")
         self.length = lengths[0]
         for length in lengths[1:]:
             if length != self.length:
-                raise ValueError(
+                raise ValueError(
+                    "[TensorDictDataset Error] All feature tensors must have the same length."
+                )
+
     def __len__(self) -> int:
         return self.length
 
     def __getitem__(self, idx: int) -> dict:
         sample_features = {name: tensor[idx] for name, tensor in self.features.items()}
-        sample_labels =
-
+        sample_labels = (
+            {name: tensor[idx] for name, tensor in self.labels.items()}
+            if self.labels
+            else None
+        )
+        sample_ids = (
+            {name: tensor[idx] for name, tensor in self.ids.items()}
+            if self.ids
+            else None
+        )
         return {"features": sample_features, "labels": sample_labels, "ids": sample_ids}
 
+
 class FileDataset(FeatureSet, IterableDataset):
-    def __init__(
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+        dense_features: list[DenseFeature],  # dense feature definitions
+        sparse_features: list[SparseFeature],  # sparse feature definitions
+        sequence_features: list[SequenceFeature],  # sequence feature definitions
+        target_columns: list[str],  # target column names
+        id_columns: (
+            list[str] | None
+        ) = None,  # id columns to carry through (not used for model inputs)
+        chunk_size: int = 10000,
+        file_type: str = "csv",
+        processor: DataProcessor | None = None,
+    ):  # optional DataProcessor for transformation
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
-        self.set_all_features(
+        self.set_all_features(
+            dense_features,
+            sparse_features,
+            sequence_features,
+            target_columns,
+            id_columns,
+        )
         self.current_file_index = 0
         self.total_files = len(file_paths)
-
+
     def __iter__(self):
         self.current_file_index = 0
         for file_path in self.file_paths:
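Note: the unified batch schema returned by TensorDictDataset is easiest to see on toy data. A minimal sketch, assuming the import path shown in this diff; the feature, label, and id names are invented for illustration:

import torch
from nextrec.data.dataloader import TensorDictDataset  # path as shown in this diff

# Hypothetical batch: 4 samples, one feature, one label, one id column.
tensors = {
    "features": {"user_id": torch.tensor([1, 2, 3, 4])},
    "labels": {"click": torch.tensor([0.0, 1.0, 0.0, 1.0])},
    "ids": {"request_id": torch.tensor([10, 11, 12, 13])},
}
dataset = TensorDictDataset(tensors)
print(len(dataset))  # 4
print(dataset[0])    # {'features': {'user_id': ...}, 'labels': {'click': ...}, 'ids': {'request_id': ...}}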
@@ -75,54 +106,66 @@ class FileDataset(FeatureSet, IterableDataset):
             if self.total_files == 1:
                 file_name = os.path.basename(file_path)
                 logging.info(f"Processing file: {file_name}")
-            if self.file_type ==
+            if self.file_type == "csv":
                 yield from self.read_csv_chunks(file_path)
-            elif self.file_type ==
+            elif self.file_type == "parquet":
                 yield from self.read_parquet_chunks(file_path)
-
+
     def read_csv_chunks(self, file_path: str):
         chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
         for chunk in chunk_iterator:
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
-
+
     def read_parquet_chunks(self, file_path: str):
         parquet_file = pq.ParquetFile(file_path)
         for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
+            chunk = batch.to_pandas()
             tensors = self.dataframeto_tensors(chunk)
             yield tensors
             del chunk
-
+
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
+                raise ValueError(
+                    "[DataLoader Error] DataProcessor must be fitted before using in streaming mode"
+                )
             transformed_data = self.processor.transform(df, return_dict=True)
         else:
             transformed_data = df
         if isinstance(transformed_data, list):
-            raise TypeError(
+            raise TypeError(
+                "[DataLoader Error] DataProcessor.transform returned file paths; use return_dict=True with in-memory data for streaming."
+            )
         safe_data = cast(dict | pd.DataFrame, transformed_data)
-        batch = build_tensors_from_data(
+        batch = build_tensors_from_data(
+            data=safe_data,
+            raw_data=df,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if batch is not None:
             batch["_already_batched"] = True
         return batch
 
 
 class RecDataLoader(FeatureSet):
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | None | str = None,
+        id_columns: str | list[str] | None = None,
+        processor: DataProcessor | None = None,
+    ):
         """
         RecDataLoader is a unified dataloader for supporting in-memory and streaming data.
         Basemodel will accept RecDataLoader to create dataloaders for training/evaluation/prediction.
 
-        Args:
+        Args:
             dense_features: list of DenseFeature definitions
             sparse_features: list of SparseFeature definitions
             sequence_features: list of SequenceFeature definitions
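Note: read_parquet_chunks bounds memory by iterating fixed-size record batches instead of loading whole files. A standalone sketch of the same pyarrow pattern, independent of nextrec (the file name is hypothetical):

import pyarrow.parquet as pq

def iter_parquet_chunks(path: str, chunk_size: int = 10000):
    # Stream fixed-size record batches and convert each to a DataFrame,
    # mirroring FileDataset.read_parquet_chunks above.
    parquet_file = pq.ParquetFile(path)
    for batch in parquet_file.iter_batches(batch_size=chunk_size):
        yield batch.to_pandas()

# for chunk in iter_parquet_chunks("interactions.parquet"):
#     ...  # hand each chunk to downstream processing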
@@ -131,16 +174,28 @@ class RecDataLoader(FeatureSet):
             processor: an instance of DataProcessor, if provided, will be used to transform data before creating tensors.
         """
         self.processor = processor
-        self.set_all_features(
+        self.set_all_features(
+            dense_features, sparse_features, sequence_features, target, id_columns
+        )
 
-    def create_dataloader(
-
-
-
-
-
-
-
+    def create_dataloader(
+        self,
+        data: (
+            dict
+            | pd.DataFrame
+            | str
+            | os.PathLike
+            | list[str]
+            | list[os.PathLike]
+            | DataLoader
+        ),
+        batch_size: int = 32,
+        shuffle: bool = True,
+        load_full: bool = True,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
         """
         Create a DataLoader from various data sources.
 
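Note: every argument of the new RecDataLoader constructor is optional, and create_dataloader passes an existing DataLoader through untouched (see the dispatch in the next hunk). A minimal sketch of that pass-through branch, assuming set_all_features tolerates the all-None defaults (not shown in this diff):

import torch
from torch.utils.data import DataLoader, TensorDataset
from nextrec.data.dataloader import RecDataLoader

factory = RecDataLoader()  # all feature arguments default to None
existing = DataLoader(TensorDataset(torch.arange(8)), batch_size=4)
# The DataLoader branch of create_dataloader returns the input unchanged.
assert factory.create_dataloader(existing) is existing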
@@ -159,39 +214,104 @@
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
+        elif (
+            isinstance(data, list)
+            and data
+            and all(isinstance(p, (str, os.PathLike)) for p in data)
+        ):
+            return self.create_from_path(
+                path=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                load_full=load_full,
+                chunk_size=chunk_size,
+                num_workers=num_workers,
+            )
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.create_from_memory(
+            return self.create_from_memory(
+                data=data,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                num_workers=num_workers,
+                sampler=sampler,
+            )
         else:
-            raise ValueError(
-
-
-
-
-
-
-
+            raise ValueError(
+                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
+            )
+
+    def create_from_memory(
+        self,
+        data: dict | pd.DataFrame,
+        batch_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+        sampler=None,
+    ) -> DataLoader:
 
         raw_data = data
 
         if self.processor is not None:
             if not self.processor.is_fitted:
-                raise ValueError(
-
-
+                raise ValueError(
+                    "[RecDataLoader Error] DataProcessor must be fitted before transforming data in memory"
+                )
+            data = self.processor.transform(data, return_dict=True)  # type: ignore
+        tensors = build_tensors_from_data(
+            data=data,
+            raw_data=raw_data,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+        )
         if tensors is None:
-            raise ValueError(
+            raise ValueError(
+                "[RecDataLoader Error] No valid tensors could be built from the provided data."
+            )
         dataset = TensorDictDataset(tensors)
-        return DataLoader(
-
-
-
-
-
-
-
-
-
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False if sampler is not None else shuffle,
+            sampler=sampler,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+        )
+
+    def create_from_path(
+        self,
+        path: str | os.PathLike | list[str] | list[os.PathLike],
+        batch_size: int,
+        shuffle: bool,
+        load_full: bool,
+        chunk_size: int = 10000,
+        num_workers: int = 0,
+    ) -> DataLoader:
+        if isinstance(path, (str, os.PathLike)):
+            file_paths, file_type = resolve_file_paths(str(Path(path)))
+        else:
+            file_paths = [str(Path(p)) for p in path]
+            if not file_paths:
+                raise ValueError("[RecDataLoader Error] Empty file path list provided.")
+            suffixes = {Path(p).suffix.lower() for p in file_paths}
+            if len(suffixes) != 1:
+                raise ValueError(
+                    "[RecDataLoader Error] Mixed file types in provided list; please use only CSV or only Parquet."
+                )
+            suffix = suffixes.pop()
+            if suffix not in {".csv", ".parquet"}:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
+                )
+            file_type = "csv" if suffix == ".csv" else "parquet"
         # Load full data into memory
         if load_full:
             dfs = []
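Note: when a list of paths is given, create_from_path requires a single homogeneous extension. A standalone restatement of that validation, runnable without nextrec:

from pathlib import Path

def infer_file_type(file_paths: list[str]) -> str:
    # Exactly one suffix is allowed, and it must be .csv or .parquet.
    suffixes = {Path(p).suffix.lower() for p in file_paths}
    if len(suffixes) != 1:
        raise ValueError("Mixed file types; use only CSV or only Parquet.")
    suffix = suffixes.pop()
    if suffix not in {".csv", ".parquet"}:
        raise ValueError(f"Unsupported file extension: {suffix}")
    return "csv" if suffix == ".csv" else "parquet"

print(infer_file_type(["a.parquet", "b.parquet"]))  # parquet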
@@ -202,31 +322,63 @@ class RecDataLoader(FeatureSet):
                 except OSError:
                     pass
                 try:
-                    df = read_table(file_path,
+                    df = read_table(file_path, data_format=file_type)
                     dfs.append(df)
                 except MemoryError as exc:
-                    raise MemoryError(
+                    raise MemoryError(
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming."
+                    ) from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
-                raise MemoryError(
-
+                raise MemoryError(
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size."
+                ) from exc
+            return self.create_from_memory(
+                combined_df, batch_size, shuffle, num_workers=num_workers
+            )
         else:
-            return self.load_files_streaming(
+            return self.load_files_streaming(
+                file_paths,
+                file_type,
+                batch_size,
+                chunk_size,
+                shuffle,
+                num_workers=num_workers,
+            )
 
-    def load_files_streaming(
-
-
-
-
-
-
+    def load_files_streaming(
+        self,
+        file_paths: list[str],
+        file_type: str,
+        batch_size: int,
+        chunk_size: int,
+        shuffle: bool,
+        num_workers: int = 0,
+    ) -> DataLoader:
         if shuffle:
-            logging.info(
+            logging.info(
+                "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
+            )
         if batch_size != 1:
-            logging.info(
-
-
+            logging.info(
+                "[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput."
+            )
+        dataset = FileDataset(
+            file_paths=file_paths,
+            dense_features=self.dense_features,
+            sparse_features=self.sparse_features,
+            sequence_features=self.sequence_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            chunk_size=chunk_size,
+            file_type=file_type,
+            processor=self.processor,
+        )
+        return DataLoader(
+            dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers
+        )
+
 
 def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
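Note: in streaming mode each item yielded by FileDataset is already a full chunk, so the outer DataLoader runs with batch_size=1 and shuffle has no effect. A minimal stand-in demonstrating that shape, using plain torch:

import torch
from torch.utils.data import DataLoader, IterableDataset

class ChunkStream(IterableDataset):
    # Each yielded item is a pre-batched chunk, like FileDataset above.
    def __iter__(self):
        for start in range(0, 6, 2):  # pretend chunk_size=2
            yield {"features": torch.arange(start, start + 2)}

# batch_size=1 plus an unwrapping collate_fn passes chunks through whole.
loader = DataLoader(ChunkStream(), batch_size=1, collate_fn=lambda items: items[0])
for batch in loader:
    print(batch["features"])  # tensor([0, 1]), tensor([2, 3]), tensor([4, 5])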
@@ -238,12 +390,20 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if column.ndim == 0:
         column = column.reshape(1)
     if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
-        raise TypeError(
-
+        raise TypeError(
+            f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+        )
+    if (
+        column.dtype == object
+        and len(column) > 0
+        and isinstance(column[0], (list, tuple, np.ndarray))
+    ):
         sequences = []
         for seq in column:
             if isinstance(seq, str):
-                raise TypeError(
+                raise TypeError(
+                    f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                )
             if isinstance(seq, (list, tuple, np.ndarray)):
                 arr = np.asarray(seq, dtype=np.int64)
             else:
@@ -258,25 +418,32 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
             if len(seq) > max_len:
                 padded.append(seq[:max_len])
             else:
-                padded.append(
+                padded.append(
+                    np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+                )
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
     return np.asarray(column, dtype=np.int64)
 
-
+
+def build_tensors_from_data(
     data: dict | pd.DataFrame,
     raw_data: dict | pd.DataFrame,
     features: list,
     target_columns: list[str],
-    id_columns: list[str]
+    id_columns: list[str],
 ) -> dict | None:
     feature_tensors = {}
     for feature in features:
         column = get_column_data(data, feature.name)
         if column is None:
-            raise ValueError(
-
+            raise ValueError(
+                f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
+            )
+        if isinstance(
+            feature, SequenceFeature
+        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
             arr = normalize_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
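Note: the padding branch above clips long sequences and right-pads short ones into a dense (n, max_len) int64 matrix. A standalone restatement with numpy only:

import numpy as np

def pad_or_truncate(seqs, max_len: int, pad_value: int = 0) -> np.ndarray:
    padded = []
    for seq in seqs:
        arr = np.asarray(seq, dtype=np.int64)
        if len(arr) > max_len:
            padded.append(arr[:max_len])  # truncate
        else:
            padded.append(np.pad(arr, (0, max_len - len(arr)), constant_values=pad_value))
    return np.stack(padded)

print(pad_or_truncate([[1, 2, 3, 4], [5]], max_len=3))
# [[1 2 3]
#  [5 0 0]]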
@@ -293,8 +460,14 @@ def build_tensors_from_data(
         column = get_column_data(data, target_name)
         if column is None:
             continue
-        label_tensor = to_tensor(
-
+        label_tensor = to_tensor(
+            np.asarray(column, dtype=np.float32), dtype=torch.float32
+        )
+        if (
+            label_tensor.dim() == 2
+            and label_tensor.shape[0] == 1
+            and label_tensor.shape[1] > 1
+        ):
             label_tensor = label_tensor.t()
         if label_tensor.shape[1:] == (1,):
             label_tensor = label_tensor.squeeze(1)
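Note: the new guard only transposes a (1, n) row vector of labels into (n, 1); square or already column-shaped tensors are left alone before the final squeeze. A quick check of the three shapes involved:

import torch

row = torch.tensor([[0.0, 1.0, 1.0]])  # shape (1, 3): one row holding n labels
if row.dim() == 2 and row.shape[0] == 1 and row.shape[1] > 1:
    row = row.t()                      # -> (3, 1), one label per sample
print(row.shape)                       # torch.Size([3, 1])
print(row.squeeze(1).shape)            # torch.Size([3])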
@@ -309,11 +482,15 @@ def build_tensors_from_data(
         if column is None:
             column = get_column_data(data, id_col)
         if column is None:
-            raise KeyError(
+            raise KeyError(
+                f"[RecDataLoader Error] ID column '{id_col}' not found in provided data."
+            )
         try:
             id_arr = np.asarray(column, dtype=np.int64)
         except Exception as exc:
-            raise TypeError(
+            raise TypeError(
+                f"[RecDataLoader Error] ID column '{id_col}' must contain numeric values. Received dtype={np.asarray(column).dtype}, error: {exc}"
+            ) from exc
         id_tensors[id_col] = to_tensor(id_arr, dtype=torch.long)
     if not feature_tensors:
         return None