nextrec-0.4.16-py3-none-any.whl → nextrec-0.4.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/heads.py +99 -0
  3. nextrec/basic/loggers.py +5 -5
  4. nextrec/basic/model.py +217 -88
  5. nextrec/cli.py +1 -1
  6. nextrec/data/dataloader.py +93 -95
  7. nextrec/data/preprocessor.py +108 -46
  8. nextrec/loss/grad_norm.py +13 -13
  9. nextrec/models/multi_task/esmm.py +10 -11
  10. nextrec/models/multi_task/mmoe.py +20 -19
  11. nextrec/models/multi_task/ple.py +35 -34
  12. nextrec/models/multi_task/poso.py +23 -21
  13. nextrec/models/multi_task/share_bottom.py +18 -17
  14. nextrec/models/ranking/afm.py +4 -3
  15. nextrec/models/ranking/autoint.py +4 -3
  16. nextrec/models/ranking/dcn.py +4 -3
  17. nextrec/models/ranking/dcn_v2.py +4 -3
  18. nextrec/models/ranking/deepfm.py +4 -3
  19. nextrec/models/ranking/dien.py +2 -2
  20. nextrec/models/ranking/din.py +2 -2
  21. nextrec/models/ranking/eulernet.py +4 -3
  22. nextrec/models/ranking/ffm.py +4 -3
  23. nextrec/models/ranking/fibinet.py +2 -2
  24. nextrec/models/ranking/fm.py +4 -3
  25. nextrec/models/ranking/lr.py +4 -3
  26. nextrec/models/ranking/masknet.py +4 -5
  27. nextrec/models/ranking/pnn.py +5 -4
  28. nextrec/models/ranking/widedeep.py +8 -8
  29. nextrec/models/ranking/xdeepfm.py +5 -4
  30. nextrec/utils/console.py +20 -6
  31. nextrec/utils/data.py +154 -32
  32. nextrec/utils/model.py +86 -1
  33. {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/METADATA +5 -6
  34. {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/RECORD +37 -36
  35. {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/WHEEL +0 -0
  36. {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/entry_points.txt +0 -0
  37. {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/licenses/LICENSE +0 -0
nextrec/cli.py CHANGED
@@ -395,7 +395,7 @@ def train_model(train_config_path: str) -> None:
         shuffle=train_cfg.get("shuffle", True),
         num_workers=dataloader_cfg.get("num_workers", 0),
         user_id_column=id_column,
-        tensorboard=False,
+        use_tensorboard=False,
     )


nextrec/data/dataloader.py CHANGED
@@ -2,7 +2,7 @@
 Dataloader definitions

 Date: create on 27/10/2025
-Checkpoint: edit on 19/12/2025
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """

@@ -13,7 +13,6 @@ from typing import cast

 import numpy as np
 import pandas as pd
-import pyarrow.parquet as pq
 import torch
 from torch.utils.data import DataLoader, Dataset, IterableDataset

@@ -26,7 +25,12 @@ from nextrec.basic.features import (
 from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.utils.data import read_table, resolve_file_paths
+from nextrec.utils.data import (
+    check_streaming_support,
+    iter_file_chunks,
+    read_table,
+    resolve_file_paths,
+)
 from nextrec.utils.torch_utils import to_tensor


@@ -72,22 +76,34 @@ class TensorDictDataset(Dataset):
 class FileDataset(FeatureSet, IterableDataset):
     def __init__(
         self,
-        file_paths: list[str],  # file paths to read, containing CSV or Parquet files
-        dense_features: list[DenseFeature],  # dense feature definitions
-        sparse_features: list[SparseFeature],  # sparse feature definitions
-        sequence_features: list[SequenceFeature],  # sequence feature definitions
-        target_columns: list[str],  # target column names
-        id_columns: (
-            list[str] | None
-        ) = None,  # id columns to carry through (not used for model inputs)
+        file_paths: list[str],
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        target_columns: list[str],
+        id_columns: list[str] | None = None,
         chunk_size: int = 10000,
         file_type: str = "csv",
         processor: DataProcessor | None = None,
-    ):  # optional DataProcessor for transformation
+    ):
+        """Streaming dataset for reading files in chunks.
+
+        Args:
+            file_paths: List of file paths to read
+            dense_features: Dense feature definitions
+            sparse_features: Sparse feature definitions
+            sequence_features: Sequence feature definitions
+            target_columns: Target column names
+            id_columns: ID columns to carry through
+            chunk_size: Number of rows per chunk
+            file_type: Format type (csv, parquet, etc.)
+            processor: Optional DataProcessor for transformation
+        """
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
+
         self.set_all_features(
             dense_features,
             sparse_features,
@@ -102,26 +118,11 @@ class FileDataset(FeatureSet, IterableDataset):
         self.current_file_index = 0
         for file_path in self.file_paths:
             self.current_file_index += 1
-            # Don't log file processing here to avoid interrupting progress bars
-            # File information is already displayed in the CLI data section
-            if self.file_type == "csv":
-                yield from self.read_csv_chunks(file_path)
-            elif self.file_type == "parquet":
-                yield from self.read_parquet_chunks(file_path)
-
-    def read_csv_chunks(self, file_path: str):
-        chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-        for chunk in chunk_iterator:
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-
-    def read_parquet_chunks(self, file_path: str):
-        parquet_file = pq.ParquetFile(file_path)
-        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-            del chunk
+            for chunk in iter_file_chunks(file_path, self.file_type, self.chunk_size):
+                tensors = self.dataframeto_tensors(chunk)
+                if tensors is not None:
+                    yield tensors
+                del chunk

     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
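Note: the per-format readers removed above are consolidated into iter_file_chunks in nextrec/utils/data.py, whose implementation is not part of this diff. A minimal sketch of what such a helper presumably does, inferred from the read_csv_chunks/read_parquet_chunks code it replaces (the signature and error handling are assumptions):

import pandas as pd
import pyarrow.parquet as pq


def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
    # Sketch only: yield pandas DataFrame chunks from a CSV or Parquet file.
    if file_type == "csv":
        # pandas streams CSV natively when chunksize is given
        yield from pd.read_csv(file_path, chunksize=chunk_size)
    elif file_type == "parquet":
        # pyarrow reads Parquet in bounded record batches
        parquet_file = pq.ParquetFile(file_path)
        for batch in parquet_file.iter_batches(batch_size=chunk_size):
            yield batch.to_pandas()
    else:
        raise ValueError(f"Unsupported streaming format: {file_type}")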
@@ -209,8 +210,6 @@ class RecDataLoader(FeatureSet):
         Returns:
             DataLoader instance.
         """
-
-        # Enforce num_workers=0 for streaming mode to prevent data duplication
         if streaming and num_workers > 0:
             logging.warning(
                 f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming=True. "
@@ -221,20 +220,13 @@ class RecDataLoader(FeatureSet):

         if isinstance(data, DataLoader):
             return data
-        elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(
-                path=data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                streaming=streaming,
-                chunk_size=chunk_size,
-                num_workers=num_workers,
-            )
-        elif (
+
+        is_path_list = (
             isinstance(data, list)
             and data
             and all(isinstance(p, (str, os.PathLike)) for p in data)
-        ):
+        )
+        if isinstance(data, (str, os.PathLike)) or is_path_list:
             return self.create_from_path(
                 path=data,
                 batch_size=batch_size,
@@ -243,7 +235,8 @@ class RecDataLoader(FeatureSet):
                 chunk_size=chunk_size,
                 num_workers=num_workers,
             )
-        elif isinstance(data, (dict, pd.DataFrame)):
+
+        if isinstance(data, (dict, pd.DataFrame)):
             return self.create_from_memory(
                 data=data,
                 batch_size=batch_size,
@@ -251,10 +244,8 @@ class RecDataLoader(FeatureSet):
                 num_workers=num_workers,
                 sampler=sampler,
             )
-        else:
-            raise ValueError(
-                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
-            )
+
+        raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")

     def create_from_memory(
         self,
@@ -264,7 +255,6 @@ class RecDataLoader(FeatureSet):
         num_workers: int = 0,
         sampler=None,
     ) -> DataLoader:
-
         raw_data = data

         if self.processor is not None:
@@ -309,17 +299,24 @@ class RecDataLoader(FeatureSet):
             file_paths = [str(Path(p)) for p in path]
             if not file_paths:
                 raise ValueError("[RecDataLoader Error] Empty file path list provided.")
-            suffixes = {Path(p).suffix.lower() for p in file_paths}
-            if len(suffixes) != 1:
-                raise ValueError(
-                    "[RecDataLoader Error] Mixed file types in provided list; please use only CSV or only Parquet."
-                )
-            suffix = suffixes.pop()
-            if suffix not in {".csv", ".parquet"}:
+
+            from nextrec.utils.data import get_file_format_from_extension
+
+            file_formats = set()
+            for p in file_paths:
+                fmt = get_file_format_from_extension(Path(p).suffix)
+                if fmt is None:
+                    raise ValueError(
+                        f"[RecDataLoader Error] Unsupported file extension: {Path(p).suffix}"
+                    )
+                file_formats.add(fmt)
+
+            if len(file_formats) != 1:
                 raise ValueError(
-                    f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
+                    f"[RecDataLoader Error] Mixed file types in provided list: {', '.join(file_formats)}. "
+                    "Please use a single format per DataLoader."
                 )
-            file_type = "csv" if suffix == ".csv" else "parquet"
+            file_type = file_formats.pop()
             if streaming:
                 return self.load_files_streaming(
                     file_paths,
@@ -329,31 +326,30 @@ class RecDataLoader(FeatureSet):
                     shuffle,
                     num_workers=num_workers,
                 )
-            # Load full data into memory
-            else:
-                dfs = []
-                total_bytes = 0
-                for file_path in file_paths:
-                    try:
-                        total_bytes += os.path.getsize(file_path)
-                    except OSError:
-                        pass
-                    try:
-                        df = read_table(file_path, data_format=file_type)
-                        dfs.append(df)
-                    except MemoryError as exc:
-                        raise MemoryError(
-                            f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
-                        ) from exc
+
+            dfs = []
+            total_bytes = 0
+            for file_path in file_paths:
+                try:
+                    total_bytes += os.path.getsize(file_path)
+                except OSError:
+                    pass
                 try:
-                    combined_df = pd.concat(dfs, ignore_index=True)
+                    df = read_table(file_path, data_format=file_type)
+                    dfs.append(df)
                 except MemoryError as exc:
                     raise MemoryError(
-                        f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
                     ) from exc
-                return self.create_from_memory(
-                    combined_df, batch_size, shuffle, num_workers=num_workers
-                )
+            try:
+                combined_df = pd.concat(dfs, ignore_index=True)
+            except MemoryError as exc:
+                raise MemoryError(
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
+                ) from exc
+            return self.create_from_memory(
+                combined_df, batch_size, shuffle, num_workers=num_workers
+            )

     def load_files_streaming(
         self,
@@ -364,6 +360,11 @@ class RecDataLoader(FeatureSet):
         shuffle: bool,
         num_workers: int = 0,
     ) -> DataLoader:
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[RecDataLoader Error] Format '{file_type}' does not support streaming reads. "
+                "Use streaming=False or convert data to csv/parquet."
+            )
         if shuffle:
             logging.info(
                 "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
@@ -420,22 +421,21 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
                     f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
                 )
             if isinstance(seq, (list, tuple, np.ndarray)):
-                arr = np.asarray(seq, dtype=np.int64)
+                sequences.append(np.asarray(seq, dtype=np.int64))
             else:
-                arr = np.asarray([seq], dtype=np.int64)
-            sequences.append(arr)
+                sequences.append(np.asarray([seq], dtype=np.int64))
         max_len = getattr(feature, "max_len", 0)
         if max_len <= 0:
             max_len = max((len(seq) for seq in sequences), default=1)
         pad_value = getattr(feature, "padding_idx", 0)
-        padded = []
-        for seq in sequences:
-            if len(seq) > max_len:
-                padded.append(seq[:max_len])
-            else:
-                padded.append(
-                    np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
-                )
+        padded = [
+            (
+                seq[:max_len]
+                if len(seq) > max_len
+                else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+            )
+            for seq in sequences
+        ]
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
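Note: the comprehension keeps the previous padding behavior: sequences longer than max_len are truncated, shorter ones are right-padded with the feature's padding index. A small standalone example with illustrative values max_len=4 and pad_value=0:

import numpy as np

max_len, pad_value = 4, 0  # illustrative values only
sequences = [np.array([7, 8, 9, 10, 11]), np.array([3, 5])]

padded = [
    seq[:max_len]
    if len(seq) > max_len
    else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
    for seq in sequences
]
print(np.stack(padded))
# [[ 7  8  9 10]
#  [ 3  5  0  0]]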
@@ -456,9 +456,7 @@ def build_tensors_from_data(
             raise ValueError(
                 f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
             )
-        if isinstance(
-            feature, SequenceFeature
-        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+        if isinstance(feature, SequenceFeature):
            arr = normalize_sequence_column(column, feature)
            tensor = to_tensor(arr, dtype=torch.long)
        elif isinstance(feature, DenseFeature):
nextrec/data/preprocessor.py CHANGED
@@ -2,7 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.

 Date: create on 13/11/2025
-Checkpoint: edit on 19/12/2025
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """

@@ -34,6 +34,8 @@ from nextrec.basic.session import resolve_save_path
 from nextrec.data.data_processing import hash_md5_mod
 from nextrec.utils.console import progress
 from nextrec.utils.data import (
+    FILE_FORMAT_CONFIG,
+    check_streaming_support,
     default_output_dir,
     iter_file_chunks,
     load_dataframes,
@@ -239,17 +241,9 @@ class DataProcessor(FeatureSet):
                     dtype=np.int64,
                     count=sparse_series.size,
                 )
-            le = self.label_encoders.get(name)
-            if le is None:
-                raise ValueError(
-                    f"[Data Processor Error] LabelEncoder for {name} not fitted"
-                )
-            cat = pd.Categorical(sparse_series, categories=le.classes_)
-            codes = cat.codes  # -1 indicates unknown category
-            unk_index = 0
-            if "<UNK>" in le.classes_:
-                unk_index = int(list(le.classes_).index("<UNK>"))
-            return np.where(codes < 0, unk_index, codes).astype(np.int64, copy=False)
+            raise ValueError(
+                f"[Data Processor Error] Token index for {name} not fitted"
+            )

         if encode_method == "hash":
             hash_size = config["hash_size"]
@@ -298,13 +292,11 @@ class DataProcessor(FeatureSet):
        split_fn = str.split
        is_nan = np.isnan
        if encode_method == "label":
-            class_to_idx = config.get("_token_to_idx") or config.get("_class_to_idx")
+            class_to_idx = config.get("_token_to_idx")
            if class_to_idx is None:
-                le = self.label_encoders.get(name)
-                if le is None:
-                    raise ValueError(f"LabelEncoder for {name} not fitted")
-                class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
-                config["_class_to_idx"] = class_to_idx
+                raise ValueError(
+                    f"[Data Processor Error] Token index for {name} not fitted"
+                )
            unk_index = int(config.get("_unk_index", class_to_idx.get("<UNK>", 0)))
        else:
            class_to_idx = None  # type: ignore
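Note: both the sparse and sequence transforms now require a prefitted _token_to_idx mapping in the feature config instead of rebuilding one from a LabelEncoder. A small illustrative example of how such a mapping resolves unseen tokens to the <UNK> index (the mapping contents below are made up):

import numpy as np

# Made-up stand-in for config["_token_to_idx"] produced during fit.
class_to_idx = {"<UNK>": 0, "red": 1, "green": 2, "blue": 3}
unk_index = class_to_idx.get("<UNK>", 0)

tokens = ["green", "purple", "blue"]  # "purple" was never seen during fit
codes = np.asarray([class_to_idx.get(t, unk_index) for t in tokens], dtype=np.int64)
print(codes)  # [2 0 3]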
@@ -429,6 +421,12 @@ class DataProcessor(FeatureSet):
            )
        )
        file_paths, file_type = resolve_file_paths(path)
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[DataProcessor Error] Format '{file_type}' does not support streaming. "
+                "fit_from_path only supports streaming formats (csv, parquet) to avoid high memory usage. "
+                "Use fit(dataframe) with in-memory data or convert the data format."
+            )

        numeric_acc: Dict[str, Dict[str, float]] = {}
        for name in self.numeric_features.keys():
@@ -607,17 +605,16 @@ class DataProcessor(FeatureSet):
        data: Union[pd.DataFrame, Dict[str, Any]],
        return_dict: bool,
        persist: bool,
-        save_format: Optional[Literal["csv", "parquet"]],
+        save_format: Optional[str],
        output_path: Optional[str],
        warn_missing: bool = True,
    ):
        logger = logging.getLogger()
-        is_dataframe = isinstance(data, pd.DataFrame)
        data_dict = data if isinstance(data, dict) else None

-        result_dict: Dict[str, np.ndarray] = {}
-        if is_dataframe:
-            df: pd.DataFrame = data  # type: ignore[assignment]
+        result_dict = {}
+        if isinstance(data, pd.DataFrame):
+            df = data  # type: ignore[assignment]
            for col in df.columns:
                result_dict[col] = df[col].to_numpy(copy=False)
        else:
@@ -631,7 +628,7 @@ class DataProcessor(FeatureSet):
            else:
                result_dict[key] = np.asarray(value)

-        data_columns = data.columns if is_dataframe else data_dict
+        data_columns = data.columns if isinstance(data, pd.DataFrame) else data_dict
        feature_groups = [
            ("Numeric", self.numeric_features, self.process_numeric_feature_transform),
            ("Sparse", self.sparse_features, self.process_sparse_feature_transform),
@@ -651,7 +648,7 @@ class DataProcessor(FeatureSet):
                continue
            series_data = (
                data[name]
-                if is_dataframe
+                if isinstance(data, pd.DataFrame)
                else pd.Series(result_dict[name], name=name)
            )
            result_dict[name] = transform_fn(series_data, config)
@@ -666,8 +663,6 @@ class DataProcessor(FeatureSet):
                columns_dict[key] = value
            return pd.DataFrame(columns_dict)

-        if save_format not in [None, "csv", "parquet"]:
-            raise ValueError("save_format must be either 'csv', 'parquet', or None")
        effective_format = save_format
        if persist:
            effective_format = save_format or "parquet"
@@ -675,6 +670,8 @@ class DataProcessor(FeatureSet):
        if (not return_dict) or persist:
            result_df = dict_to_dataframe(result_dict)
        if persist:
+            if effective_format not in FILE_FORMAT_CONFIG:
+                raise ValueError(f"Unsupported save format: {effective_format}")
            if output_path is None:
                raise ValueError(
                    "[Data Processor Error] output_path must be provided when persisting transformed data."
@@ -683,12 +680,25 @@ class DataProcessor(FeatureSet):
            if output_dir.suffix:
                output_dir = output_dir.parent
            output_dir.mkdir(parents=True, exist_ok=True)
-            save_path = output_dir / f"transformed_data.{effective_format}"
+
+            suffix = FILE_FORMAT_CONFIG[effective_format]["extension"][0]
+            save_path = output_dir / f"transformed_data{suffix}"
            assert result_df is not None, "DataFrame conversion failed"
-            if effective_format == "parquet":
+
+            # Save based on format
+            if effective_format == "csv":
+                result_df.to_csv(save_path, index=False)
+            elif effective_format == "parquet":
                result_df.to_parquet(save_path, index=False)
+            elif effective_format == "feather":
+                result_df.to_feather(save_path)
+            elif effective_format == "excel":
+                result_df.to_excel(save_path, index=False)
+            elif effective_format == "hdf5":
+                result_df.to_hdf(save_path, key="data", mode="w")
            else:
-                result_df.to_csv(save_path, index=False)
+                raise ValueError(f"Unsupported save format: {effective_format}")
+
            logger.info(
                colorize(
                    f"Transformed data saved to: {save_path.resolve()}", color="green"
@@ -703,7 +713,7 @@ class DataProcessor(FeatureSet):
        self,
        input_path: str,
        output_path: Optional[str],
-        save_format: Optional[Literal["csv", "parquet"]],
+        save_format: Optional[str],
        chunk_size: int = 200000,
    ):
        """Transform data from files under a path and save them to a new location.
@@ -713,8 +723,21 @@ class DataProcessor(FeatureSet):
        logger = logging.getLogger()
        file_paths, file_type = resolve_file_paths(input_path)
        target_format = save_format or file_type
-        if target_format not in ["csv", "parquet"]:
-            raise ValueError("save_format must be either 'csv' or 'parquet'")
+        if target_format not in FILE_FORMAT_CONFIG:
+            raise ValueError(f"Unsupported format: {target_format}")
+        if chunk_size > 0 and not check_streaming_support(file_type):
+            raise ValueError(
+                f"Input format '{file_type}' does not support streaming reads. "
+                "Set chunk_size<=0 to use full-load transform."
+            )
+
+        # Warn about streaming support
+        if not check_streaming_support(target_format):
+            logger.warning(
+                f"[Data Processor Warning] Format '{target_format}' does not support streaming writes. "
+                "Large files may require more memory. Use csv or parquet for better streaming support."
+            )
+
        base_output_dir = (
            Path(output_path) if output_path else default_output_dir(input_path)
        )
@@ -725,10 +748,10 @@ class DataProcessor(FeatureSet):
        saved_paths = []
        for file_path in progress(file_paths, description="Transforming files"):
            source_path = Path(file_path)
-            target_file = output_root / f"{source_path.stem}.{target_format}"
+            suffix = FILE_FORMAT_CONFIG[target_format]["extension"][0]
+            target_file = output_root / f"{source_path.stem}{suffix}"

            # Stream transform for large files
-
            if chunk_size <= 0:
                # fallback to full load behavior
                df = read_table(file_path, file_type)
@@ -743,16 +766,28 @@ class DataProcessor(FeatureSet):
                assert isinstance(
                    transformed_df, pd.DataFrame
                ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+
+                # Save based on format
                if target_format == "csv":
                    transformed_df.to_csv(target_file, index=False)
-                else:
+                elif target_format == "parquet":
                    transformed_df.to_parquet(target_file, index=False)
+                elif target_format == "feather":
+                    transformed_df.to_feather(target_file)
+                elif target_format == "excel":
+                    transformed_df.to_excel(target_file, index=False)
+                elif target_format == "hdf5":
+                    transformed_df.to_hdf(target_file, key="data", mode="w")
+                else:
+                    raise ValueError(f"Unsupported format: {target_format}")
+
                saved_paths.append(str(target_file.resolve()))
                continue

            first_chunk = True
+            # Streaming write for supported formats
            if target_format == "parquet":
-                writer: pq.ParquetWriter | None = None
+                parquet_writer = None
                try:
                    for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                        transformed_df = self.transform_in_memory(
@@ -769,16 +804,15 @@ class DataProcessor(FeatureSet):
                        table = pa.Table.from_pandas(
                            transformed_df, preserve_index=False
                        )
-                        if writer is None:
-                            writer = pq.ParquetWriter(target_file, table.schema)
-                        writer.write_table(table)
+                        if parquet_writer is None:
+                            parquet_writer = pq.ParquetWriter(target_file, table.schema)
+                        parquet_writer.write_table(table)
                        first_chunk = False
                finally:
-                    if writer is not None:
-                        writer.close()
-            else:
+                    if parquet_writer is not None:
+                        parquet_writer.close()
+            elif target_format == "csv":
                # CSV: append chunks; header only once
-                # (truncate first to avoid mixing with existing files)
                target_file.parent.mkdir(parents=True, exist_ok=True)
                with open(target_file, "w", encoding="utf-8", newline="") as f:
                    f.write("")
@@ -798,6 +832,34 @@ class DataProcessor(FeatureSet):
                        target_file, index=False, mode="a", header=first_chunk
                    )
                    first_chunk = False
+            else:
+                # Non-streaming formats: collect all chunks and save once
+                logger.warning(
+                    f"Format '{target_format}' doesn't support streaming writes. "
+                    f"Collecting all chunks in memory before saving."
+                )
+                all_chunks = []
+                for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+                    transformed_df = self.transform_in_memory(
+                        chunk,
+                        return_dict=False,
+                        persist=False,
+                        save_format=None,
+                        output_path=None,
+                        warn_missing=first_chunk,
+                    )
+                    assert isinstance(transformed_df, pd.DataFrame)
+                    all_chunks.append(transformed_df)
+                    first_chunk = False
+
+                if all_chunks:
+                    combined_df = pd.concat(all_chunks, ignore_index=True)
+                    if target_format == "feather":
+                        combined_df.to_feather(target_file)
+                    elif target_format == "excel":
+                        combined_df.to_excel(target_file, index=False)
+                    elif target_format == "hdf5":
+                        combined_df.to_hdf(target_file, key="data", mode="w")

            saved_paths.append(str(target_file.resolve()))
            logger.info(
@@ -849,7 +911,7 @@ class DataProcessor(FeatureSet):
        self,
        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
        return_dict: bool = True,
-        save_format: Optional[Literal["csv", "parquet"]] = None,
+        save_format: Optional[str] = None,
        output_path: Optional[str] = None,
        chunk_size: int = 200000,
    ):
@@ -877,7 +939,7 @@ class DataProcessor(FeatureSet):
        self,
        data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
        return_dict: bool = True,
-        save_format: Optional[Literal["csv", "parquet"]] = None,
+        save_format: Optional[str] = None,
        output_path: Optional[str] = None,
        chunk_size: int = 200000,
    ):