nextrec 0.1.10__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +1 -2
- nextrec/basic/callback.py +1 -2
- nextrec/basic/features.py +39 -8
- nextrec/basic/layers.py +1 -2
- nextrec/basic/loggers.py +15 -10
- nextrec/basic/metrics.py +1 -2
- nextrec/basic/model.py +87 -84
- nextrec/basic/session.py +150 -0
- nextrec/data/__init__.py +13 -2
- nextrec/data/data_utils.py +74 -22
- nextrec/data/dataloader.py +513 -0
- nextrec/data/preprocessor.py +494 -134
- nextrec/loss/listwise.py +6 -0
- nextrec/loss/loss_utils.py +1 -2
- nextrec/loss/match_losses.py +4 -5
- nextrec/loss/pairwise.py +6 -0
- nextrec/loss/pointwise.py +6 -0
- nextrec/models/match/dssm.py +2 -2
- nextrec/models/match/dssm_v2.py +2 -2
- nextrec/models/match/mind.py +2 -2
- nextrec/models/match/sdm.py +2 -2
- nextrec/models/match/youtube_dnn.py +2 -2
- nextrec/models/multi_task/esmm.py +3 -3
- nextrec/models/multi_task/mmoe.py +3 -3
- nextrec/models/multi_task/ple.py +3 -3
- nextrec/models/multi_task/share_bottom.py +3 -3
- nextrec/models/ranking/afm.py +2 -3
- nextrec/models/ranking/autoint.py +3 -3
- nextrec/models/ranking/dcn.py +3 -3
- nextrec/models/ranking/deepfm.py +2 -3
- nextrec/models/ranking/dien.py +3 -3
- nextrec/models/ranking/din.py +3 -3
- nextrec/models/ranking/fibinet.py +3 -3
- nextrec/models/ranking/fm.py +3 -3
- nextrec/models/ranking/masknet.py +3 -3
- nextrec/models/ranking/pnn.py +3 -3
- nextrec/models/ranking/widedeep.py +3 -3
- nextrec/models/ranking/xdeepfm.py +3 -3
- nextrec/utils/__init__.py +4 -8
- nextrec/utils/embedding.py +2 -4
- nextrec/utils/initializer.py +1 -2
- nextrec/utils/optimizer.py +1 -2
- {nextrec-0.1.10.dist-info → nextrec-0.2.1.dist-info}/METADATA +4 -5
- nextrec-0.2.1.dist-info/RECORD +54 -0
- nextrec/basic/dataloader.py +0 -447
- nextrec/utils/common.py +0 -14
- nextrec-0.1.10.dist-info/RECORD +0 -51
- {nextrec-0.1.10.dist-info → nextrec-0.2.1.dist-info}/WHEEL +0 -0
- {nextrec-0.1.10.dist-info → nextrec-0.2.1.dist-info}/licenses/LICENSE +0 -0
nextrec/data/data_utils.py
CHANGED
@@ -1,30 +1,13 @@
-"""
-Data processing utilities for NextRec
-
-Date: create on 13/11/2025
-Author:
-Yang Zhou, zyaztec@gmail.com
-"""
+"""Data processing utilities for NextRec."""
 
 import torch
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq
+from pathlib import Path
 
 def collate_fn(batch):
-    """
-    Custom collate function for batching tuples of tensors.
-    Each element in batch is a tuple of tensors from FileDataset.
-
-    Examples:
-        # Single sample in batch
-        (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
-        # Batched output
-        (tensor([[1.0, 2.0], [3.0, 4.0]]),  # dense_features batch
-         tensor([[10, 20], [30, 40]]),  # sparse_features batch
-         tensor([[100, 200], [300, 400]]),  # sequence_features batch
-         tensor([1.0, 0.0])  # labels batch)
-    """
+    """Collate a list of tensor tuples from ``FileDataset`` into batched tensors."""
     if not batch:
         return tuple()
 
@@ -33,7 +16,20 @@ def collate_fn(batch):
 
     for i in range(num_tensors):
         tensor_list = [item[i] for item in batch]
-
+        first = tensor_list[0]
+
+        if isinstance(first, torch.Tensor):
+            stacked = torch.cat(tensor_list, dim=0)
+        elif isinstance(first, np.ndarray):
+            stacked = np.concatenate(tensor_list, axis=0)
+        elif isinstance(first, list):
+            combined = []
+            for entry in tensor_list:
+                combined.extend(entry)
+            stacked = combined
+        else:
+            stacked = tensor_list
+
         result.append(stacked)
 
     return tuple(result)
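For reference, a minimal sketch of what the reworked `collate_fn` now does with streamed chunks: each batch item is a tuple of chunk-level tensors, so tensors at the same position are concatenated along the first dimension rather than stacked per sample. The shapes below are invented for illustration, and the import path mirrors the one used by the new dataloader module.

```python
# Illustrative only: chunk-level tensors from two FileDataset chunks are
# merged by concatenation along dim 0 (shapes are made up for this sketch).
import torch
from nextrec.data import collate_fn

chunk_a = (torch.ones(2, 3), torch.zeros(2))   # (dense block, labels) from one chunk
chunk_b = (torch.ones(4, 3), torch.ones(4))    # a second, longer chunk

dense, labels = collate_fn([chunk_a, chunk_b])
print(dense.shape, labels.shape)  # torch.Size([6, 3]) torch.Size([6])
```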
@@ -53,6 +49,62 @@ def get_column_data(data: dict | pd.DataFrame, name: str):
     raise KeyError(f"Unsupported data type for extracting column {name}")
 
 
+def resolve_file_paths(path: str) -> tuple[list[str], str]:
+    """Resolve file or directory path into a sorted list of files and file type."""
+    path_obj = Path(path)
+
+    if path_obj.is_file():
+        file_type = path_obj.suffix.lower().lstrip(".")
+        assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+        return [str(path_obj)], file_type
+
+    if path_obj.is_dir():
+        collected_files = [p for p in path_obj.iterdir() if p.is_file()]
+        csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
+        parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+
+        if csv_files and parquet_files:
+            raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+        file_paths = csv_files if csv_files else parquet_files
+        if not file_paths:
+            raise ValueError(f"No CSV or Parquet files found in directory: {path}")
+        file_paths.sort()
+        file_type = "csv" if csv_files else "parquet"
+        return file_paths, file_type
+
+    raise ValueError(f"Invalid path: {path}")
+
+
+def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
+    """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
+    if file_type == "csv":
+        yield from pd.read_csv(file_path, chunksize=chunk_size)
+        return
+    parquet_file = pq.ParquetFile(file_path)
+    for batch in parquet_file.iter_batches(batch_size=chunk_size):
+        yield batch.to_pandas()
+
+
+def read_table(file_path: str, file_type: str) -> pd.DataFrame:
+    """Read a single CSV/Parquet file."""
+    if file_type == "csv":
+        return pd.read_csv(file_path)
+    return pd.read_parquet(file_path)
+
+
+def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
+    """Load multiple files of the same type into DataFrames."""
+    return [read_table(fp, file_type) for fp in file_paths]
+
+
+def default_output_dir(path: str) -> Path:
+    """Generate a default output directory path based on the input path."""
+    path_obj = Path(path)
+    if path_obj.is_file():
+        return path_obj.parent / f"{path_obj.stem}_preprocessed"
+    return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
 def split_dict_random(data_dict: dict, test_size: float=0.2, random_state:int|None=None):
     """Randomly split a dictionary of data into training and testing sets."""
     lengths = [len(v) for v in data_dict.values()]
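A minimal sketch of how the new file helpers compose; the `data/train/` directory is hypothetical and assumed to contain only CSV or only Parquet files.

```python
# Hypothetical usage of the helpers added to nextrec/data/data_utils.py.
from nextrec.data.data_utils import resolve_file_paths, iter_file_chunks

file_paths, file_type = resolve_file_paths("data/train/")  # made-up directory
for fp in file_paths:
    for chunk in iter_file_chunks(fp, file_type, chunk_size=10_000):
        # each chunk is a pandas DataFrame of at most chunk_size rows
        print(fp, len(chunk))
```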
nextrec/data/dataloader.py
ADDED
@@ -0,0 +1,513 @@
+"""
+Dataloader definitions
+
+Date: create on 27/10/2025
+Author: Yang Zhou,zyaztec@gmail.com
+"""
+import os
+import tqdm
+import torch
+import logging
+import numpy as np
+import pandas as pd
+import pyarrow.parquet as pq
+
+from pathlib import Path
+from typing import Iterator, Literal, Union, Optional
+
+from torch.utils.data import DataLoader, TensorDataset, IterableDataset
+from nextrec.data.preprocessor import DataProcessor
+from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureConfig
+
+from nextrec.basic.loggers import colorize
+from nextrec.data import (
+    get_column_data,
+    collate_fn,
+    resolve_file_paths,
+    read_table,
+)
+
+
+class FileDataset(FeatureConfig, IterableDataset):
+    """
+    Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
+
+    :param file_paths: Absolute or relative paths to CSV/Parquet files.
+    :param dense_features: Dense feature definitions (float tensors).
+    :param sparse_features: Sparse/categorical feature definitions (int tensors).
+    :param sequence_features: Sequence feature definitions (padded int tensors).
+    :param target_columns: Label/target column names.
+    :param id_columns: Optional ID columns appended after targets.
+    :param chunk_size: Number of rows to read per chunk.
+    :param file_type: ``"csv"`` or ``"parquet"``.
+    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for online transform.
+
+    Yields
+    ------
+    tuple
+        Tensors ordered as ``dense + sparse + sequence + targets (+ ids)``. Shape respects chunk size.
+    """
+
+    def __init__(self,
+                 file_paths: list[str],  # file paths to read, containing CSV or Parquet files
+                 dense_features: list[DenseFeature],  # dense feature definitions
+                 sparse_features: list[SparseFeature],  # sparse feature definitions
+                 sequence_features: list[SequenceFeature],  # sequence feature definitions
+                 target_columns: list[str],  # target column names
+                 id_columns: list[str] | None = None,  # id columns to carry through (not used for model inputs)
+                 chunk_size: int = 10000,
+                 file_type: str = 'csv',
+                 processor: DataProcessor | None = None):  # optional DataProcessor for transformation
+        """
+        Initialize a streaming dataset backed by on-disk files.
+        """
+
+        self.file_paths = file_paths
+        self.chunk_size = chunk_size
+        self.file_type = file_type
+        self.processor = processor
+
+        self._set_feature_config(dense_features, sparse_features, sequence_features)
+        self._set_target_config(target_columns, id_columns or [])
+        self.current_file_index = 0
+        self.total_files = len(file_paths)
+
+    def __iter__(self) -> Iterator[tuple]:
+        """
+        Iterate over files and stream tensor tuples chunk by chunk.
+
+        Files are processed sequentially; each chunk is transformed (optionally via
+        ``processor``) and converted to tensors before being yielded to PyTorch ``DataLoader``.
+        """
+        self.current_file_index = 0
+        self._file_pbar = None
+
+        # Create progress bar for file processing when multiple files
+        if self.total_files > 1:
+            self._file_pbar = tqdm.tqdm(
+                total=self.total_files,
+                desc="Files",
+                unit="file",
+                position=0,
+                leave=True,
+                bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
+            )
+
+        for file_path in self.file_paths:
+            self.current_file_index += 1
+
+            if self._file_pbar is not None:
+                self._file_pbar.update(1)
+            elif self.total_files == 1:
+                file_name = os.path.basename(file_path)
+                logging.info(colorize(f"Processing file: {file_name}", color="cyan"))
+
+            if self.file_type == 'csv':
+                yield from self._read_csv_chunks(file_path)
+            elif self.file_type == 'parquet':
+                yield from self._read_parquet_chunks(file_path)
+
+        if self._file_pbar is not None:
+            self._file_pbar.close()
+
+    def _read_csv_chunks(self, file_path: str) -> Iterator[tuple]:
+        """
+        Stream a CSV file chunk by chunk.
+
+        :param file_path: Path to the CSV file.
+        :yields: Tensor tuples for each chunk.
+        """
+        chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
+
+        for chunk in chunk_iterator:
+            tensors = self._dataframe_to_tensors(chunk)
+            if tensors:
+                yield tensors
+
+    def _read_parquet_chunks(self, file_path: str) -> Iterator[tuple]:
+        """
+        Stream a Parquet file via ``pyarrow`` batch reading.
+
+        :param file_path: Path to the Parquet file.
+        :yields: Tensor tuples for each batch.
+        """
+
+        parquet_file = pq.ParquetFile(file_path)
+        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
+            chunk = batch.to_pandas()
+            tensors = self._dataframe_to_tensors(chunk)
+            if tensors:
+                yield tensors
+            del chunk
+
+    def _dataframe_to_tensors(self, df: pd.DataFrame) -> tuple | None:
+        """
+        Convert a DataFrame chunk into a tuple of tensors respecting feature order.
+
+        :param df: DataFrame chunk.
+        :returns: Tuple of tensors (features + targets + ids) or ``None`` if no tensors created.
+        """
+        if self.processor is not None:
+            if not self.processor.is_fitted:
+                raise ValueError("DataProcessor must be fitted before using in streaming mode")
+            transformed_data = self.processor.transform(df, return_dict=True)
+        else:
+            transformed_data = df
+
+        return build_tensors_from_data(
+            data=transformed_data,
+            raw_data=df,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            on_missing_feature="raise",
+        )
+
+
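To make the streaming path concrete, here is a sketch of wiring ``FileDataset`` into a ``DataLoader`` the way ``RecDataLoader._load_files_streaming`` does. The file paths, feature lists, and ``fitted_processor`` are placeholders, not values from the package.

```python
# Sketch only: feature definitions and paths below are placeholders.
from torch.utils.data import DataLoader
from nextrec.data import collate_fn

dataset = FileDataset(
    file_paths=["part-0001.parquet", "part-0002.parquet"],  # hypothetical files
    dense_features=dense_features,        # list[DenseFeature], defined elsewhere
    sparse_features=sparse_features,      # list[SparseFeature]
    sequence_features=sequence_features,  # list[SequenceFeature]
    target_columns=["label"],
    chunk_size=20_000,
    file_type="parquet",
    processor=fitted_processor,           # a DataProcessor that is already fitted
)
loader = DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
for batch in loader:
    ...  # tuple ordered as dense + sparse + sequence + targets (+ ids)
```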
+class RecDataLoader(FeatureConfig):
+    """
+    Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
+
+    :param dense_features: Dense feature definitions (float tensors).
+    :param sparse_features: Sparse/categorical feature definitions (int tensors).
+    :param sequence_features: Sequence feature definitions (padded int tensors).
+    :param target: Target column name(s); string or list.
+    :param id_columns: Optional ID column name(s) appended after targets.
+    :param processor: Optional fitted :class:`~nextrec.data.preprocessor.DataProcessor` for preprocessing.
+
+    Examples
+    --------
+    >>> loader = RecDataLoader(
+    ...     dense_features=dense_features,
+    ...     sparse_features=sparse_features,
+    ...     sequence_features=sequence_features,
+    ...     target=['label'],
+    ...     processor=processor,
+    ... )
+    >>> dataloader = loader.create_dataloader(
+    ...     data="/path/to/data.csv",
+    ...     batch_size=1024,
+    ...     load_full=False,
+    ...     chunk_size=20000,
+    ... )
+    """
+
+    def __init__(self,
+                 dense_features: list[DenseFeature] | None = None,
+                 sparse_features: list[SparseFeature] | None = None,
+                 sequence_features: list[SequenceFeature] | None = None,
+                 target: list[str] | None | str = None,
+                 id_columns: str | list[str] | None = None,
+                 processor: Optional['DataProcessor'] = None):
+        """
+        Initialize the loader with feature/target definitions.
+
+        :param dense_features: Dense feature definitions (float).
+        :param sparse_features: Sparse feature definitions (int).
+        :param sequence_features: Sequence feature definitions (int, padded).
+        :param target: Single target name or list of names.
+        :param id_columns: Optional ID columns to append in output.
+        :param processor: Optional fitted ``DataProcessor`` for preprocessing.
+        """
+
+        self.processor = processor
+        self._set_feature_config(dense_features, sparse_features, sequence_features)
+        self._set_target_config(target, id_columns)
+
+    def create_dataloader(self,
+                          data: Union[dict, pd.DataFrame, str, DataLoader],
+                          batch_size: int = 32,
+                          shuffle: bool = True,
+                          load_full: bool = True,
+                          chunk_size: int = 10000) -> DataLoader:
+        """
+        Build a ``DataLoader`` from in-memory data, file path, or an existing loader.
+
+        :param data: Dict/DataFrame (in-memory), path to CSV/Parquet file/dir, or an existing ``DataLoader``.
+        :param batch_size: Batch size for the returned ``DataLoader``.
+        :param shuffle: Shuffle flag passed to PyTorch ``DataLoader`` (for in-memory and streaming batches).
+        :param load_full: If ``True``, load all files into memory; if ``False``, stream with chunks.
+        :param chunk_size: Number of rows per chunk when ``load_full=False``.
+        :returns: A configured PyTorch ``DataLoader``.
+        """
+        if isinstance(data, DataLoader):
+            return data
+
+        if isinstance(data, (str, os.PathLike)):
+            return self._create_from_path(data, batch_size, shuffle, load_full, chunk_size)
+
+        if isinstance(data, (dict, pd.DataFrame)):
+            return self._create_from_memory(data, batch_size, shuffle)
+
+        raise ValueError(f"Unsupported data type: {type(data)}")
+
+    def _create_from_memory(self,
+                            data: Union[dict, pd.DataFrame],
+                            batch_size: int,
+                            shuffle: bool) -> DataLoader:
+        """
+        Convert in-memory data (dict/DataFrame) into tensors and wrap with ``DataLoader``.
+
+        :param data: Dict or DataFrame containing feature/target columns.
+        :param batch_size: Batch size.
+        :param shuffle: Whether to shuffle batches.
+        :returns: A ``DataLoader`` backed by ``TensorDataset``.
+        """
+
+        raw_data = data
+
+        if self.processor is not None:
+            assert self.processor.is_fitted, "DataProcessor must be fitted before using in RecDataLoader"
+            data = self.processor.transform(data, return_dict=True)
+
+        tensors = build_tensors_from_data(
+            data=data,
+            raw_data=raw_data,
+            features=self.all_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            on_missing_feature="raise",
+        )
+
+        assert tensors is not None, "No tensors were created from provided data."
+
+        dataset = TensorDataset(*tensors)
+        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
+
+    def _create_from_path(self,
+                          path: str,
+                          batch_size: int,
+                          shuffle: bool,
+                          load_full: bool,
+                          chunk_size: int) -> DataLoader:
+        """
+        Build a ``DataLoader`` from a CSV/Parquet file or directory.
+
+        :param path: File path or directory containing homogeneous CSV/Parquet files.
+        :param batch_size: Batch size.
+        :param shuffle: Shuffle flag.
+        :param load_full: If ``True``, load all rows into memory; otherwise stream.
+        :param chunk_size: Chunk rows when streaming.
+        :returns: A ``DataLoader`` (in-memory or streaming).
+        """
+
+        file_paths, file_type = resolve_file_paths(str(Path(path)))
+
+        # Load full data into memory
+        if load_full:
+            dfs = []
+            total_bytes = 0
+            for file_path in file_paths:
+                try:
+                    total_bytes += os.path.getsize(file_path)
+                except OSError:
+                    pass
+                try:
+                    df = read_table(file_path, file_type)
+                    dfs.append(df)
+                except MemoryError as exc:
+                    raise MemoryError(
+                        f"Out of memory while reading {file_path}. "
+                        f"Consider using load_full=False with streaming."
+                    ) from exc
+
+            try:
+                combined_df = pd.concat(dfs, ignore_index=True)
+            except MemoryError as exc:
+                raise MemoryError(
+                    f"Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). "
+                    f"Use load_full=False to stream or reduce chunk_size."
+                ) from exc
+
+            return self._create_from_memory(combined_df, batch_size, shuffle)
+        else:
+            return self._load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
+
+    def _load_files_streaming(self,
+                              file_paths: list[str],
+                              file_type: str,
+                              batch_size: int,
+                              chunk_size: int,
+                              shuffle: bool) -> DataLoader:
+        """
+        Create a streaming ``DataLoader`` that yields chunked tensors from files.
+
+        :param file_paths: Ordered list of file paths.
+        :param file_type: ``"csv"`` or ``"parquet"``.
+        :param batch_size: Batch size for the outer ``DataLoader``.
+        :param chunk_size: Number of rows per chunk when reading files.
+        :returns: Streaming ``DataLoader`` with custom ``collate_fn``.
+        """
+
+        if shuffle:
+            logging.warning(colorize("Shuffle is ignored in streaming mode (IterableDataset).", "yellow"))
+
+        if batch_size != 1:
+            logging.warning(colorize(
+                "Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.",
+                "yellow",
+            ))
+        effective_batch_size = 1
+
+        dataset = FileDataset(
+            file_paths=file_paths,
+            dense_features=self.dense_features,
+            sparse_features=self.sparse_features,
+            sequence_features=self.sequence_features,
+            target_columns=self.target_columns,
+            id_columns=self.id_columns,
+            chunk_size=chunk_size,
+            file_type=file_type,
+            processor=self.processor
+        )
+
+        return DataLoader(dataset, batch_size=effective_batch_size, collate_fn=collate_fn)
+
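Besides the path-based example in the docstring above, ``create_dataloader`` also accepts in-memory data. A sketch with a toy DataFrame follows; the column names and feature objects are placeholders.

```python
# Sketch only: the DataFrame and feature definitions are toy placeholders.
import pandas as pd

df = pd.DataFrame({
    "age": [0.1, 0.5, 0.9],   # dense column
    "item_id": [3, 7, 1],     # sparse column
    "label": [1.0, 0.0, 1.0],
})
loader = RecDataLoader(
    dense_features=dense_features,    # e.g. a DenseFeature for "age"
    sparse_features=sparse_features,  # e.g. a SparseFeature for "item_id"
    target="label",
)
dl = loader.create_dataloader(df, batch_size=2, shuffle=True)
for batch in dl:
    ...  # tensors follow the configured feature order, then the label
```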
+def _normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
+    """
+    Normalize a raw sequence column into a padded int64 ``ndarray``.
+
+    :param column: Sequence column from DataFrame/dict; can be Series, list, or ndarray.
+    :param feature: Sequence feature definition providing ``max_len`` and optional ``padding_idx``.
+    :returns: 2-D numpy array (batch, seq_len) with dtype ``int64``.
+    """
+    if isinstance(column, pd.Series):
+        column = column.tolist()
+
+    if isinstance(column, (list, tuple)):
+        column = np.array(column, dtype=object)
+
+    if not isinstance(column, np.ndarray):
+        column = np.array([column], dtype=object)
+
+    if column.ndim == 0:
+        column = column.reshape(1)
+
+    if column.dtype == object and any(isinstance(v, str) for v in column.ravel()):
+        raise TypeError(
+            f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
+        )
+
+    if column.dtype == object and len(column) > 0 and isinstance(column[0], (list, tuple, np.ndarray)):
+        sequences = []
+        for seq in column:
+            if isinstance(seq, str):
+                raise TypeError(
+                    f"Sequence feature '{feature.name}' expects numeric sequences; found string values."
+                )
+            if isinstance(seq, (list, tuple, np.ndarray)):
+                arr = np.asarray(seq, dtype=np.int64)
+            else:
+                arr = np.asarray([seq], dtype=np.int64)
+            sequences.append(arr)
+
+        max_len = getattr(feature, "max_len", 0)
+        if max_len <= 0:
+            max_len = max((len(seq) for seq in sequences), default=1)
+
+        pad_value = getattr(feature, "padding_idx", 0)
+        padded = []
+        for seq in sequences:
+            if len(seq) > max_len:
+                padded.append(seq[:max_len])
+            else:
+                padded.append(np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value))
+        column = np.stack(padded)
+    elif column.ndim == 1:
+        column = column.reshape(-1, 1)
+
+    return np.asarray(column, dtype=np.int64)
+
+
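To illustrate the padding/truncation rule above: ``_normalize_sequence_column`` only reads ``name``, ``max_len``, and ``padding_idx`` from the feature object, so a stand-in namespace is enough for a quick check. This is a sketch of the internal helper's behaviour, not how the library expects to be called.

```python
# Sketch: SimpleNamespace stands in for a SequenceFeature; only the
# attributes the helper reads (name, max_len, padding_idx) are provided.
from types import SimpleNamespace

feat = SimpleNamespace(name="hist_items", max_len=4, padding_idx=0)
ragged = [[5, 9], [1, 2, 3, 4, 7]]            # two sequences of different length
out = _normalize_sequence_column(ragged, feat)
print(out)
# [[5 9 0 0]      <- padded to max_len with padding_idx
#  [1 2 3 4]]     <- truncated to max_len
```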
+def build_tensors_from_data(  # noqa: C901
+    data: dict | pd.DataFrame,
+    raw_data: dict | pd.DataFrame,
+    features: list,
+    target_columns: list[str],
+    id_columns: list[str],
+    on_missing_feature: str = "raise",
+) -> tuple | None:
+    """
+    Shared routine to convert structured data into a tuple of tensors.
+
+    :param data: Preprocessed data (dict or DataFrame) used to fetch model inputs/labels.
+    :param raw_data: Original data, used for untouched ID columns.
+    :param features: Ordered list of feature definitions.
+    :param target_columns: Target/label column names.
+    :param id_columns: Extra ID column names to append at the end of the tensor tuple.
+    :param on_missing_feature: ``"warn"`` to skip missing feature with warning, ``"raise"`` to error.
+    :returns: Tuple of tensors following the order of ``features`` + targets (+ ids) or ``None`` if empty.
+    """
+    tensors: list[torch.Tensor] = []
+
+    for feature in features:
+        column = get_column_data(data, feature.name)
+        if column is None:
+            if on_missing_feature == "warn":
+                logging.warning(colorize(f"Feature column '{feature.name}' not found in data", "yellow"))
+                continue
+            raise AssertionError(f"Feature column {feature.name} not found in data.")
+
+        if isinstance(feature, SequenceFeature):
+            tensor = torch.from_numpy(_normalize_sequence_column(column, feature))
+        elif isinstance(feature, DenseFeature):
+            tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+        else:
+            tensor = torch.from_numpy(np.asarray(column, dtype=np.int64))
+
+        tensors.append(tensor)
+
+    label_tensors = []
+    if target_columns:
+        for target_name in target_columns:
+            column = get_column_data(data, target_name)
+            assert column is not None, f"Target column '{target_name}' not found in data."
+
+            label_tensor = torch.from_numpy(np.asarray(column, dtype=np.float32))
+
+            if label_tensor.dim() == 1:
+                label_tensor = label_tensor.view(-1, 1)
+            elif label_tensor.dim() == 2 and label_tensor.shape[0] == 1 and label_tensor.shape[1] > 1:
+                label_tensor = label_tensor.t()
+
+            label_tensors.append(label_tensor)
+
+    if label_tensors:
+        if len(label_tensors) == 1 and label_tensors[0].shape[1] > 1:
+            y_tensor = label_tensors[0]
+        else:
+            y_tensor = torch.cat(label_tensors, dim=1)
+
+        if y_tensor.shape[1] == 1:
+            y_tensor = y_tensor.squeeze(1)
+
+        tensors.append(y_tensor)
+
+    if id_columns:
+        id_arrays = []
+        for id_col in id_columns:
+            column = get_column_data(raw_data, id_col)
+            if column is None:
+                column = get_column_data(data, id_col)
+            if column is None:
+                raise KeyError(f"ID column '{id_col}' not found in provided data.")
+            try:
+                id_arr = np.asarray(column, dtype=np.int64)
+            except Exception as exc:
+                raise TypeError(
+                    f"ID column '{id_col}' must contain numeric values. "
+                    f"Received dtype={np.asarray(column).dtype}, error: {exc}"
+                ) from exc
+            id_arrays.append(id_arr)
+
+        combined_ids = np.column_stack(id_arrays)
+        tensors.append(torch.from_numpy(combined_ids))
+
+    if not tensors:
+        return None
+
+    return tuple(tensors)
+
+
+# Backward compatible alias
+_build_tensors_from_data = build_tensors_from_data
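Finally, a toy call to ``build_tensors_from_data`` with no feature columns, just a label and an id column, to show the label reshaping and id stacking. The data values are invented, and ``get_column_data`` is assumed to pull keys straight from the dict, as it does elsewhere in ``nextrec.data``.

```python
# Toy sketch: only targets and ids, no feature definitions.
import numpy as np
from nextrec.data.dataloader import build_tensors_from_data

data = {"label": np.array([1.0, 0.0, 1.0]), "user_id": np.array([11, 12, 13])}
labels, ids = build_tensors_from_data(
    data=data,
    raw_data=data,
    features=[],
    target_columns=["label"],
    id_columns=["user_id"],
)
print(labels.shape, ids.shape)  # torch.Size([3]) torch.Size([3, 1])
```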