nextrec 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/heads.py +99 -0
- nextrec/basic/loggers.py +5 -5
- nextrec/basic/model.py +217 -88
- nextrec/cli.py +1 -1
- nextrec/data/dataloader.py +93 -95
- nextrec/data/preprocessor.py +108 -46
- nextrec/loss/grad_norm.py +13 -13
- nextrec/models/multi_task/esmm.py +10 -11
- nextrec/models/multi_task/mmoe.py +20 -19
- nextrec/models/multi_task/ple.py +35 -34
- nextrec/models/multi_task/poso.py +23 -21
- nextrec/models/multi_task/share_bottom.py +18 -17
- nextrec/models/ranking/afm.py +4 -3
- nextrec/models/ranking/autoint.py +4 -3
- nextrec/models/ranking/dcn.py +4 -3
- nextrec/models/ranking/dcn_v2.py +4 -3
- nextrec/models/ranking/deepfm.py +4 -3
- nextrec/models/ranking/dien.py +2 -2
- nextrec/models/ranking/din.py +2 -2
- nextrec/models/ranking/eulernet.py +4 -3
- nextrec/models/ranking/ffm.py +4 -3
- nextrec/models/ranking/fibinet.py +2 -2
- nextrec/models/ranking/fm.py +4 -3
- nextrec/models/ranking/lr.py +4 -3
- nextrec/models/ranking/masknet.py +4 -5
- nextrec/models/ranking/pnn.py +5 -4
- nextrec/models/ranking/widedeep.py +8 -8
- nextrec/models/ranking/xdeepfm.py +5 -4
- nextrec/utils/console.py +20 -6
- nextrec/utils/data.py +154 -32
- nextrec/utils/model.py +86 -1
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/METADATA +5 -6
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/RECORD +37 -36
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/WHEEL +0 -0
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/licenses/LICENSE +0 -0
nextrec/cli.py
CHANGED
nextrec/data/dataloader.py
CHANGED
@@ -2,7 +2,7 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
-Checkpoint: edit on
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 
@@ -13,7 +13,6 @@ from typing import cast
 
 import numpy as np
 import pandas as pd
-import pyarrow.parquet as pq
 import torch
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 
@@ -26,7 +25,12 @@ from nextrec.basic.features import (
 from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.utils.data import
+from nextrec.utils.data import (
+    check_streaming_support,
+    iter_file_chunks,
+    read_table,
+    resolve_file_paths,
+)
 from nextrec.utils.torch_utils import to_tensor
 
 
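All chunked file access now funnels through helpers in nextrec.utils.data (itself changed +154 -32 in this release). Their bodies are not shown in this diff, so the following is only a plausible sketch inferred from the call sites in dataloader.py and preprocessor.py; the exact FILE_FORMAT_CONFIG entries are assumptions, not the actual source.

from typing import Iterator

import pandas as pd

FILE_FORMAT_CONFIG = {
    # assumed shape, based on the FILE_FORMAT_CONFIG[fmt]["extension"][0]
    # lookups and the streaming checks seen later in this diff
    "csv": {"extension": [".csv"], "streaming": True},
    "parquet": {"extension": [".parquet"], "streaming": True},
    "feather": {"extension": [".feather"], "streaming": False},
    "excel": {"extension": [".xlsx", ".xls"], "streaming": False},
    "hdf5": {"extension": [".h5", ".hdf5"], "streaming": False},
}

def check_streaming_support(file_type: str) -> bool:
    cfg = FILE_FORMAT_CONFIG.get(file_type)
    return bool(cfg and cfg["streaming"])

def iter_file_chunks(file_path: str, file_type: str, chunk_size: int) -> Iterator[pd.DataFrame]:
    # unified replacement for the per-format read_csv_chunks/read_parquet_chunks
    # methods removed from FileDataset below
    if file_type == "csv":
        yield from pd.read_csv(file_path, chunksize=chunk_size)
    elif file_type == "parquet":
        import pyarrow.parquet as pq
        for batch in pq.ParquetFile(file_path).iter_batches(batch_size=chunk_size):
            yield batch.to_pandas()
    else:
        raise ValueError(f"Format '{file_type}' does not support streaming reads.")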
@@ -72,22 +76,34 @@ class TensorDictDataset(Dataset):
 class FileDataset(FeatureSet, IterableDataset):
     def __init__(
         self,
-        file_paths: list[str],
-        dense_features: list[DenseFeature],
-        sparse_features: list[SparseFeature],
-        sequence_features: list[SequenceFeature],
-        target_columns: list[str],
-        id_columns: (
-            list[str] | None
-        ) = None,  # id columns to carry through (not used for model inputs)
+        file_paths: list[str],
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        target_columns: list[str],
+        id_columns: list[str] | None = None,
         chunk_size: int = 10000,
         file_type: str = "csv",
         processor: DataProcessor | None = None,
-    ):
+    ):
+        """Streaming dataset for reading files in chunks.
+
+        Args:
+            file_paths: List of file paths to read
+            dense_features: Dense feature definitions
+            sparse_features: Sparse feature definitions
+            sequence_features: Sequence feature definitions
+            target_columns: Target column names
+            id_columns: ID columns to carry through
+            chunk_size: Number of rows per chunk
+            file_type: Format type (csv, parquet, etc.)
+            processor: Optional DataProcessor for transformation
+        """
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
+
         self.set_all_features(
             dense_features,
             sparse_features,
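A minimal usage sketch for the streaming dataset. The feature lists are assumed to be prebuilt elsewhere, and batch_size=None reflects an assumption (not visible in this diff) that the dataset already yields chunk-level tensor dicts:

from torch.utils.data import DataLoader

dataset = FileDataset(
    file_paths=["train_0.parquet", "train_1.parquet"],
    dense_features=dense_features,      # prebuilt DenseFeature definitions
    sparse_features=sparse_features,    # prebuilt SparseFeature definitions
    sequence_features=[],
    target_columns=["label"],
    chunk_size=10000,
    file_type="parquet",
)
for batch in DataLoader(dataset, batch_size=None, num_workers=0):
    ...  # each batch is one chunk's tensor dict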
@@ -102,26 +118,11 @@ class FileDataset(FeatureSet, IterableDataset):
         self.current_file_index = 0
         for file_path in self.file_paths:
             self.current_file_index += 1
-
-
-
-
-
-                yield from self.read_parquet_chunks(file_path)
-
-    def read_csv_chunks(self, file_path: str):
-        chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-        for chunk in chunk_iterator:
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-
-    def read_parquet_chunks(self, file_path: str):
-        parquet_file = pq.ParquetFile(file_path)
-        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-            del chunk
+            for chunk in iter_file_chunks(file_path, self.file_type, self.chunk_size):
+                tensors = self.dataframeto_tensors(chunk)
+                if tensors is not None:
+                    yield tensors
+                del chunk
 
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
@@ -209,8 +210,6 @@ class RecDataLoader(FeatureSet):
         Returns:
             DataLoader instance.
         """
-
-        # Enforce num_workers=0 for streaming mode to prevent data duplication
         if streaming and num_workers > 0:
             logging.warning(
                 f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming=True. "
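The warning exists because every DataLoader worker process re-runs __iter__ on its own copy of an IterableDataset, so num_workers > 0 would yield each chunk once per worker. A standard fix, not adopted in this diff, is to shard work by worker id:

import torch

class ShardedFileDataset(torch.utils.data.IterableDataset):
    """Sketch of worker-aware sharding for an iterable dataset."""

    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __iter__(self):
        info = torch.utils.data.get_worker_info()
        paths = self.file_paths
        if info is not None:
            # round-robin shard so each worker reads a disjoint subset
            paths = paths[info.id :: info.num_workers]
        for path in paths:
            yield path  # placeholder for the real chunk-reading logic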
@@ -221,20 +220,13 @@ class RecDataLoader(FeatureSet):
 
         if isinstance(data, DataLoader):
             return data
-        if isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(
-                path=data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                streaming=streaming,
-                chunk_size=chunk_size,
-                num_workers=num_workers,
-            )
-        elif (
+
+        is_path_list = (
             isinstance(data, list)
             and data
             and all(isinstance(p, (str, os.PathLike)) for p in data)
-        )
+        )
+        if isinstance(data, (str, os.PathLike)) or is_path_list:
             return self.create_from_path(
                 path=data,
                 batch_size=batch_size,
@@ -243,7 +235,8 @@ class RecDataLoader(FeatureSet):
                 chunk_size=chunk_size,
                 num_workers=num_workers,
             )
-        elif isinstance(data, (dict, pd.DataFrame)):
+
+        if isinstance(data, (dict, pd.DataFrame)):
             return self.create_from_memory(
                 data=data,
                 batch_size=batch_size,
@@ -251,10 +244,8 @@ class RecDataLoader(FeatureSet):
             num_workers=num_workers,
             sampler=sampler,
         )
-        else:
-            raise ValueError(
-                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
-            )
+
+        raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
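After the refactor the dispatcher falls through a flat sequence of checks instead of an if/elif chain. Illustrative calls; the dispatching method's name is not visible in these hunks and is assumed here to be create, and rec_loader is a hypothetical RecDataLoader instance:

loader = rec_loader.create(existing_dataloader)   # DataLoader -> returned unchanged
loader = rec_loader.create("train.parquet")       # str / os.PathLike -> create_from_path
loader = rec_loader.create(["a.csv", "b.csv"])    # non-empty list of paths -> create_from_path
loader = rec_loader.create(train_df)              # dict / pd.DataFrame -> create_from_memory
loader = rec_loader.create(12345)                 # anything else -> ValueError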
@@ -264,7 +255,6 @@ class RecDataLoader(FeatureSet):
         num_workers: int = 0,
         sampler=None,
     ) -> DataLoader:
-
         raw_data = data
 
         if self.processor is not None:
@@ -309,17 +299,24 @@ class RecDataLoader(FeatureSet):
         file_paths = [str(Path(p)) for p in path]
         if not file_paths:
             raise ValueError("[RecDataLoader Error] Empty file path list provided.")
-
-
-
-
-
-
-
+
+        from nextrec.utils.data import get_file_format_from_extension
+
+        file_formats = set()
+        for p in file_paths:
+            fmt = get_file_format_from_extension(Path(p).suffix)
+            if fmt is None:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension: {Path(p).suffix}"
+                )
+            file_formats.add(fmt)
+
+        if len(file_formats) != 1:
             raise ValueError(
-                f"[RecDataLoader Error]
+                f"[RecDataLoader Error] Mixed file types in provided list: {', '.join(file_formats)}. "
+                "Please use a single format per DataLoader."
             )
-        file_type =
+        file_type = file_formats.pop()
         if streaming:
             return self.load_files_streaming(
                 file_paths,
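A plausible sketch of get_file_format_from_extension, inferred purely from the call above (it takes a suffix such as ".parquet" and returns a format key or None); it reuses the FILE_FORMAT_CONFIG shape sketched earlier and is not the actual implementation:

def get_file_format_from_extension(suffix: str) -> str | None:
    suffix = suffix.lower()
    for fmt, cfg in FILE_FORMAT_CONFIG.items():
        if suffix in cfg["extension"]:
            return fmt
    return None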
@@ -329,31 +326,30 @@ class RecDataLoader(FeatureSet):
             shuffle,
             num_workers=num_workers,
         )
-
-
-
-
-
-
-
-
-            pass
-        try:
-            df = read_table(file_path, data_format=file_type)
-            dfs.append(df)
-        except MemoryError as exc:
-            raise MemoryError(
-                f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
-            ) from exc
+
+        dfs = []
+        total_bytes = 0
+        for file_path in file_paths:
+            try:
+                total_bytes += os.path.getsize(file_path)
+            except OSError:
+                pass
             try:
-
+                df = read_table(file_path, data_format=file_type)
+                dfs.append(df)
             except MemoryError as exc:
                 raise MemoryError(
-                    f"[RecDataLoader Error] Out of memory while
+                    f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
                 ) from exc
-
-
-
+        try:
+            combined_df = pd.concat(dfs, ignore_index=True)
+        except MemoryError as exc:
+            raise MemoryError(
+                f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
+            ) from exc
+        return self.create_from_memory(
+            combined_df, batch_size, shuffle, num_workers=num_workers
+        )
 
     def load_files_streaming(
         self,
@@ -364,6 +360,11 @@ class RecDataLoader(FeatureSet):
         shuffle: bool,
         num_workers: int = 0,
     ) -> DataLoader:
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[RecDataLoader Error] Format '{file_type}' does not support streaming reads. "
+                "Use streaming=False or convert data to csv/parquet."
+            )
         if shuffle:
             logging.info(
                 "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
@@ -420,22 +421,21 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
                     f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
                 )
             if isinstance(seq, (list, tuple, np.ndarray)):
-
+                sequences.append(np.asarray(seq, dtype=np.int64))
             else:
-
-                sequences.append(arr)
+                sequences.append(np.asarray([seq], dtype=np.int64))
         max_len = getattr(feature, "max_len", 0)
         if max_len <= 0:
             max_len = max((len(seq) for seq in sequences), default=1)
         pad_value = getattr(feature, "padding_idx", 0)
-        padded = [
-
-
-
-
-
-
-
+        padded = [
+            (
+                seq[:max_len]
+                if len(seq) > max_len
+                else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+            )
+            for seq in sequences
+        ]
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
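Concrete behavior of the rebuilt padding/truncation comprehension, with max_len=4 and pad_value=0 chosen for illustration:

import numpy as np

sequences = [np.array([5, 6]), np.array([1, 2, 3, 4, 9])]
max_len, pad_value = 4, 0
padded = [
    seq[:max_len]
    if len(seq) > max_len
    else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
    for seq in sequences
]
# padded -> [array([5, 6, 0, 0]), array([1, 2, 3, 4])]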
@@ -456,9 +456,7 @@ def build_tensors_from_data(
             raise ValueError(
                 f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
             )
-        if isinstance(
-            feature, SequenceFeature
-        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+        if isinstance(feature, SequenceFeature):
             arr = normalize_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
nextrec/data/preprocessor.py
CHANGED
@@ -2,7 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
 Date: create on 13/11/2025
-Checkpoint: edit on
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
@@ -34,6 +34,8 @@ from nextrec.basic.session import resolve_save_path
 from nextrec.data.data_processing import hash_md5_mod
 from nextrec.utils.console import progress
 from nextrec.utils.data import (
+    FILE_FORMAT_CONFIG,
+    check_streaming_support,
     default_output_dir,
     iter_file_chunks,
     load_dataframes,
@@ -239,17 +241,9 @@ class DataProcessor(FeatureSet):
                 dtype=np.int64,
                 count=sparse_series.size,
             )
-
-
-            raise ValueError(
-                f"[Data Processor Error] LabelEncoder for {name} not fitted"
-            )
-        cat = pd.Categorical(sparse_series, categories=le.classes_)
-        codes = cat.codes  # -1 indicates unknown category
-        unk_index = 0
-        if "<UNK>" in le.classes_:
-            unk_index = int(list(le.classes_).index("<UNK>"))
-        return np.where(codes < 0, unk_index, codes).astype(np.int64, copy=False)
+            raise ValueError(
+                f"[Data Processor Error] Token index for {name} not fitted"
+            )
 
         if encode_method == "hash":
             hash_size = config["hash_size"]
@@ -298,13 +292,11 @@ class DataProcessor(FeatureSet):
         split_fn = str.split
         is_nan = np.isnan
         if encode_method == "label":
-            class_to_idx = config.get("_token_to_idx")
+            class_to_idx = config.get("_token_to_idx")
             if class_to_idx is None:
-
-
-
-                class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
-                config["_class_to_idx"] = class_to_idx
+                raise ValueError(
+                    f"[Data Processor Error] Token index for {name} not fitted"
+                )
             unk_index = int(config.get("_unk_index", class_to_idx.get("<UNK>", 0)))
         else:
             class_to_idx = None  # type: ignore
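Both raise sites enforce the same contract: transform no longer rebuilds a label index on the fly (the removed pd.Categorical/LabelEncoder fallback), so config["_token_to_idx"] must be populated by a prior fit. A sketch of the resulting usage contract; method names other than those visible in this diff are assumptions:

processor = DataProcessor(...)        # feature definitions omitted
processor.fit(train_df)               # assumed fit entry point; builds "_token_to_idx"
out = processor.transform(train_df)   # fine after fitting
# calling transform before fit now raises:
# ValueError: [Data Processor Error] Token index for <feature> not fitted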
@@ -429,6 +421,12 @@ class DataProcessor(FeatureSet):
             )
         )
         file_paths, file_type = resolve_file_paths(path)
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[DataProcessor Error] Format '{file_type}' does not support streaming. "
+                "fit_from_path only supports streaming formats (csv, parquet) to avoid high memory usage. "
+                "Use fit(dataframe) with in-memory data or convert the data format."
+            )
 
         numeric_acc: Dict[str, Dict[str, float]] = {}
         for name in self.numeric_features.keys():
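Illustrative effect of the new guard (fit_from_path is named in the error text; the paths are made up):

processor.fit_from_path("data/train_parts/")   # csv/parquet -> streamed fit, bounded memory
processor.fit_from_path("data/train.xlsx")     # excel -> ValueError: does not support streaming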
@@ -607,17 +605,16 @@ class DataProcessor(FeatureSet):
         data: Union[pd.DataFrame, Dict[str, Any]],
         return_dict: bool,
         persist: bool,
-        save_format: Optional[
+        save_format: Optional[str],
         output_path: Optional[str],
         warn_missing: bool = True,
     ):
         logger = logging.getLogger()
-        is_dataframe = isinstance(data, pd.DataFrame)
         data_dict = data if isinstance(data, dict) else None
 
-        result_dict
-        if
-        df
+        result_dict = {}
+        if isinstance(data, pd.DataFrame):
+            df = data  # type: ignore[assignment]
             for col in df.columns:
                 result_dict[col] = df[col].to_numpy(copy=False)
         else:
@@ -631,7 +628,7 @@ class DataProcessor(FeatureSet):
             else:
                 result_dict[key] = np.asarray(value)
 
-        data_columns = data.columns if
+        data_columns = data.columns if isinstance(data, pd.DataFrame) else data_dict
         feature_groups = [
             ("Numeric", self.numeric_features, self.process_numeric_feature_transform),
             ("Sparse", self.sparse_features, self.process_sparse_feature_transform),
@@ -651,7 +648,7 @@ class DataProcessor(FeatureSet):
                 continue
             series_data = (
                 data[name]
-                if
+                if isinstance(data, pd.DataFrame)
                 else pd.Series(result_dict[name], name=name)
             )
             result_dict[name] = transform_fn(series_data, config)
@@ -666,8 +663,6 @@ class DataProcessor(FeatureSet):
                 columns_dict[key] = value
             return pd.DataFrame(columns_dict)
 
-        if save_format not in [None, "csv", "parquet"]:
-            raise ValueError("save_format must be either 'csv', 'parquet', or None")
         effective_format = save_format
         if persist:
             effective_format = save_format or "parquet"
@@ -675,6 +670,8 @@ class DataProcessor(FeatureSet):
         if (not return_dict) or persist:
             result_df = dict_to_dataframe(result_dict)
             if persist:
+                if effective_format not in FILE_FORMAT_CONFIG:
+                    raise ValueError(f"Unsupported save format: {effective_format}")
                 if output_path is None:
                     raise ValueError(
                         "[Data Processor Error] output_path must be provided when persisting transformed data."
@@ -683,12 +680,25 @@ class DataProcessor(FeatureSet):
                 if output_dir.suffix:
                     output_dir = output_dir.parent
                 output_dir.mkdir(parents=True, exist_ok=True)
-
+
+                suffix = FILE_FORMAT_CONFIG[effective_format]["extension"][0]
+                save_path = output_dir / f"transformed_data{suffix}"
                 assert result_df is not None, "DataFrame conversion failed"
-
+
+                # Save based on format
+                if effective_format == "csv":
+                    result_df.to_csv(save_path, index=False)
+                elif effective_format == "parquet":
                     result_df.to_parquet(save_path, index=False)
+                elif effective_format == "feather":
+                    result_df.to_feather(save_path)
+                elif effective_format == "excel":
+                    result_df.to_excel(save_path, index=False)
+                elif effective_format == "hdf5":
+                    result_df.to_hdf(save_path, key="data", mode="w")
                 else:
-
+                    raise ValueError(f"Unsupported save format: {effective_format}")
+
                 logger.info(
                     colorize(
                         f"Transformed data saved to: {save_path.resolve()}", color="green"
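The same dispatch could be table-driven; a sketch for comparison (the PANDAS_WRITERS table is not in the codebase, while the to_* calls are real pandas API):

PANDAS_WRITERS = {
    "csv": lambda df, path: df.to_csv(path, index=False),
    "parquet": lambda df, path: df.to_parquet(path, index=False),
    "feather": lambda df, path: df.to_feather(path),
    "excel": lambda df, path: df.to_excel(path, index=False),
    "hdf5": lambda df, path: df.to_hdf(path, key="data", mode="w"),
}
PANDAS_WRITERS[effective_format](result_df, save_path)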
@@ -703,7 +713,7 @@ class DataProcessor(FeatureSet):
         self,
         input_path: str,
         output_path: Optional[str],
-        save_format: Optional[
+        save_format: Optional[str],
         chunk_size: int = 200000,
     ):
         """Transform data from files under a path and save them to a new location.
@@ -713,8 +723,21 @@ class DataProcessor(FeatureSet):
         logger = logging.getLogger()
         file_paths, file_type = resolve_file_paths(input_path)
         target_format = save_format or file_type
-        if target_format not in
-            raise ValueError("
+        if target_format not in FILE_FORMAT_CONFIG:
+            raise ValueError(f"Unsupported format: {target_format}")
+        if chunk_size > 0 and not check_streaming_support(file_type):
+            raise ValueError(
+                f"Input format '{file_type}' does not support streaming reads. "
+                "Set chunk_size<=0 to use full-load transform."
+            )
+
+        # Warn about streaming support
+        if not check_streaming_support(target_format):
+            logger.warning(
+                f"[Data Processor Warning] Format '{target_format}' does not support streaming writes. "
+                "Large files may require more memory. Use csv or parquet for better streaming support."
+            )
+
         base_output_dir = (
             Path(output_path) if output_path else default_output_dir(input_path)
         )
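A hypothetical call exercising these guards; the method name is not visible in this hunk (only its docstring is), and the paths are invented:

processor.transform_from_path(
    input_path="data/raw/",
    output_path="data/processed/",
    save_format="parquet",   # or None to keep the input format
    chunk_size=200000,       # <=0 forces the full-load fallback below
)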
@@ -725,10 +748,10 @@ class DataProcessor(FeatureSet):
         saved_paths = []
         for file_path in progress(file_paths, description="Transforming files"):
             source_path = Path(file_path)
-
+            suffix = FILE_FORMAT_CONFIG[target_format]["extension"][0]
+            target_file = output_root / f"{source_path.stem}{suffix}"
 
             # Stream transform for large files
-
             if chunk_size <= 0:
                 # fallback to full load behavior
                 df = read_table(file_path, file_type)
@@ -743,16 +766,28 @@ class DataProcessor(FeatureSet):
                 assert isinstance(
                     transformed_df, pd.DataFrame
                 ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+
+                # Save based on format
                 if target_format == "csv":
                     transformed_df.to_csv(target_file, index=False)
-                else:
+                elif target_format == "parquet":
                     transformed_df.to_parquet(target_file, index=False)
+                elif target_format == "feather":
+                    transformed_df.to_feather(target_file)
+                elif target_format == "excel":
+                    transformed_df.to_excel(target_file, index=False)
+                elif target_format == "hdf5":
+                    transformed_df.to_hdf(target_file, key="data", mode="w")
+                else:
+                    raise ValueError(f"Unsupported format: {target_format}")
+
                 saved_paths.append(str(target_file.resolve()))
                 continue
 
             first_chunk = True
+            # Streaming write for supported formats
             if target_format == "parquet":
-
+                parquet_writer = None
                 try:
                     for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                         transformed_df = self.transform_in_memory(
@@ -769,16 +804,15 @@ class DataProcessor(FeatureSet):
                         table = pa.Table.from_pandas(
                             transformed_df, preserve_index=False
                         )
-                        if
-
-
+                        if parquet_writer is None:
+                            parquet_writer = pq.ParquetWriter(target_file, table.schema)
+                        parquet_writer.write_table(table)
                         first_chunk = False
                 finally:
-                    if
-
-
+                    if parquet_writer is not None:
+                        parquet_writer.close()
+            elif target_format == "csv":
                 # CSV: append chunks; header only once
-                # (truncate first to avoid mixing with existing files)
                 target_file.parent.mkdir(parents=True, exist_ok=True)
                 with open(target_file, "w", encoding="utf-8", newline="") as f:
                     f.write("")
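A minimal standalone version of the chunked parquet write pattern used above (pa/pq being pyarrow and pyarrow.parquet, as in the diff; "chunks" stands in for any iterable of DataFrames):

import pyarrow as pa
import pyarrow.parquet as pq

writer = None
try:
    for chunk_df in chunks:
        table = pa.Table.from_pandas(chunk_df, preserve_index=False)
        if writer is None:
            # the schema is fixed by the first chunk
            writer = pq.ParquetWriter("out.parquet", table.schema)
        writer.write_table(table)
finally:
    if writer is not None:
        writer.close()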
@@ -798,6 +832,34 @@ class DataProcessor(FeatureSet):
                         target_file, index=False, mode="a", header=first_chunk
                     )
                     first_chunk = False
+            else:
+                # Non-streaming formats: collect all chunks and save once
+                logger.warning(
+                    f"Format '{target_format}' doesn't support streaming writes. "
+                    f"Collecting all chunks in memory before saving."
+                )
+                all_chunks = []
+                for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+                    transformed_df = self.transform_in_memory(
+                        chunk,
+                        return_dict=False,
+                        persist=False,
+                        save_format=None,
+                        output_path=None,
+                        warn_missing=first_chunk,
+                    )
+                    assert isinstance(transformed_df, pd.DataFrame)
+                    all_chunks.append(transformed_df)
+                    first_chunk = False
+
+                if all_chunks:
+                    combined_df = pd.concat(all_chunks, ignore_index=True)
+                    if target_format == "feather":
+                        combined_df.to_feather(target_file)
+                    elif target_format == "excel":
+                        combined_df.to_excel(target_file, index=False)
+                    elif target_format == "hdf5":
+                        combined_df.to_hdf(target_file, key="data", mode="w")
 
             saved_paths.append(str(target_file.resolve()))
             logger.info(
@@ -849,7 +911,7 @@ class DataProcessor(FeatureSet):
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        save_format: Optional[
+        save_format: Optional[str] = None,
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):
@@ -877,7 +939,7 @@ class DataProcessor(FeatureSet):
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        save_format: Optional[
+        save_format: Optional[str] = None,
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):