nextrec 0.4.16__py3-none-any.whl → 0.4.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/heads.py +99 -0
- nextrec/basic/loggers.py +5 -5
- nextrec/basic/model.py +217 -88
- nextrec/cli.py +1 -1
- nextrec/data/dataloader.py +93 -95
- nextrec/data/preprocessor.py +108 -46
- nextrec/loss/grad_norm.py +13 -13
- nextrec/models/multi_task/esmm.py +10 -11
- nextrec/models/multi_task/mmoe.py +20 -19
- nextrec/models/multi_task/ple.py +35 -34
- nextrec/models/multi_task/poso.py +23 -21
- nextrec/models/multi_task/share_bottom.py +18 -17
- nextrec/models/ranking/afm.py +4 -3
- nextrec/models/ranking/autoint.py +4 -3
- nextrec/models/ranking/dcn.py +4 -3
- nextrec/models/ranking/dcn_v2.py +4 -3
- nextrec/models/ranking/deepfm.py +4 -3
- nextrec/models/ranking/dien.py +2 -2
- nextrec/models/ranking/din.py +2 -2
- nextrec/models/ranking/eulernet.py +4 -3
- nextrec/models/ranking/ffm.py +4 -3
- nextrec/models/ranking/fibinet.py +2 -2
- nextrec/models/ranking/fm.py +4 -3
- nextrec/models/ranking/lr.py +4 -3
- nextrec/models/ranking/masknet.py +4 -5
- nextrec/models/ranking/pnn.py +5 -4
- nextrec/models/ranking/widedeep.py +8 -8
- nextrec/models/ranking/xdeepfm.py +5 -4
- nextrec/utils/console.py +20 -6
- nextrec/utils/data.py +154 -32
- nextrec/utils/model.py +86 -1
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/METADATA +5 -6
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/RECORD +37 -36
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/WHEEL +0 -0
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.16.dist-info → nextrec-0.4.18.dist-info}/licenses/LICENSE +0 -0
nextrec/cli.py
CHANGED
nextrec/data/dataloader.py
CHANGED
@@ -2,7 +2,7 @@
 Dataloader definitions
 
 Date: create on 27/10/2025
-Checkpoint: edit on
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 
@@ -13,7 +13,6 @@ from typing import cast
 
 import numpy as np
 import pandas as pd
-import pyarrow.parquet as pq
 import torch
 from torch.utils.data import DataLoader, Dataset, IterableDataset
 
@@ -26,7 +25,12 @@ from nextrec.basic.features import (
 from nextrec.data.batch_utils import collate_fn
 from nextrec.data.data_processing import get_column_data
 from nextrec.data.preprocessor import DataProcessor
-from nextrec.utils.data import
+from nextrec.utils.data import (
+    check_streaming_support,
+    iter_file_chunks,
+    read_table,
+    resolve_file_paths,
+)
 from nextrec.utils.torch_utils import to_tensor
 
 
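All chunked file access now funnels through helpers in nextrec.utils.data (itself changed +154 -32 in this release). Their bodies are not shown in this diff, so the following is only a plausible sketch inferred from the call sites in dataloader.py and preprocessor.py; the exact FILE_FORMAT_CONFIG entries are assumptions, not the actual source.

from typing import Iterator

import pandas as pd

FILE_FORMAT_CONFIG = {
    # assumed shape, based on the FILE_FORMAT_CONFIG[fmt]["extension"][0]
    # lookups and the streaming checks seen later in this diff
    "csv": {"extension": [".csv"], "streaming": True},
    "parquet": {"extension": [".parquet"], "streaming": True},
    "feather": {"extension": [".feather"], "streaming": False},
    "excel": {"extension": [".xlsx", ".xls"], "streaming": False},
    "hdf5": {"extension": [".h5", ".hdf5"], "streaming": False},
}

def check_streaming_support(file_type: str) -> bool:
    cfg = FILE_FORMAT_CONFIG.get(file_type)
    return bool(cfg and cfg["streaming"])

def iter_file_chunks(file_path: str, file_type: str, chunk_size: int) -> Iterator[pd.DataFrame]:
    # unified replacement for the per-format read_csv_chunks/read_parquet_chunks
    # methods removed from FileDataset below
    if file_type == "csv":
        yield from pd.read_csv(file_path, chunksize=chunk_size)
    elif file_type == "parquet":
        import pyarrow.parquet as pq
        for batch in pq.ParquetFile(file_path).iter_batches(batch_size=chunk_size):
            yield batch.to_pandas()
    else:
        raise ValueError(f"Format '{file_type}' does not support streaming reads.")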
@@ -72,22 +76,34 @@ class TensorDictDataset(Dataset):
 class FileDataset(FeatureSet, IterableDataset):
     def __init__(
         self,
-        file_paths: list[str],
-        dense_features: list[DenseFeature],
-        sparse_features: list[SparseFeature],
-        sequence_features: list[SequenceFeature],
-        target_columns: list[str],
-        id_columns: (
-            list[str] | None
-        ) = None,  # id columns to carry through (not used for model inputs)
+        file_paths: list[str],
+        dense_features: list[DenseFeature],
+        sparse_features: list[SparseFeature],
+        sequence_features: list[SequenceFeature],
+        target_columns: list[str],
+        id_columns: list[str] | None = None,
         chunk_size: int = 10000,
         file_type: str = "csv",
         processor: DataProcessor | None = None,
-    ):
+    ):
+        """Streaming dataset for reading files in chunks.
+
+        Args:
+            file_paths: List of file paths to read
+            dense_features: Dense feature definitions
+            sparse_features: Sparse feature definitions
+            sequence_features: Sequence feature definitions
+            target_columns: Target column names
+            id_columns: ID columns to carry through
+            chunk_size: Number of rows per chunk
+            file_type: Format type (csv, parquet, etc.)
+            processor: Optional DataProcessor for transformation
+        """
         self.file_paths = file_paths
         self.chunk_size = chunk_size
         self.file_type = file_type
         self.processor = processor
+
         self.set_all_features(
             dense_features,
             sparse_features,
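A minimal usage sketch for the streaming dataset. The feature lists are assumed to be prebuilt elsewhere, and batch_size=None reflects an assumption (not visible in this diff) that the dataset already yields chunk-level tensor dicts:

from torch.utils.data import DataLoader

dataset = FileDataset(
    file_paths=["train_0.parquet", "train_1.parquet"],
    dense_features=dense_features,      # prebuilt DenseFeature definitions
    sparse_features=sparse_features,    # prebuilt SparseFeature definitions
    sequence_features=[],
    target_columns=["label"],
    chunk_size=10000,
    file_type="parquet",
)
for batch in DataLoader(dataset, batch_size=None, num_workers=0):
    ...  # each batch is one chunk's tensor dict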
@@ -102,26 +118,11 @@ class FileDataset(FeatureSet, IterableDataset):
         self.current_file_index = 0
         for file_path in self.file_paths:
             self.current_file_index += 1
-
-
-
-
-
-                yield from self.read_parquet_chunks(file_path)
-
-    def read_csv_chunks(self, file_path: str):
-        chunk_iterator = pd.read_csv(file_path, chunksize=self.chunk_size)
-        for chunk in chunk_iterator:
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-
-    def read_parquet_chunks(self, file_path: str):
-        parquet_file = pq.ParquetFile(file_path)
-        for batch in parquet_file.iter_batches(batch_size=self.chunk_size):
-            chunk = batch.to_pandas()
-            tensors = self.dataframeto_tensors(chunk)
-            yield tensors
-            del chunk
+            for chunk in iter_file_chunks(file_path, self.file_type, self.chunk_size):
+                tensors = self.dataframeto_tensors(chunk)
+                if tensors is not None:
+                    yield tensors
+                del chunk
 
     def dataframeto_tensors(self, df: pd.DataFrame) -> dict | None:
         if self.processor is not None:
@@ -209,8 +210,6 @@ class RecDataLoader(FeatureSet):
         Returns:
             DataLoader instance.
         """
-
-        # Enforce num_workers=0 for streaming mode to prevent data duplication
         if streaming and num_workers > 0:
             logging.warning(
                 f"[RecDataLoader Warning] num_workers={num_workers} is not compatible with streaming=True. "
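The warning exists because every DataLoader worker process re-runs __iter__ on its own copy of an IterableDataset, so num_workers > 0 would yield each chunk once per worker. A standard fix, not adopted in this diff, is to shard work by worker id:

import torch

class ShardedFileDataset(torch.utils.data.IterableDataset):
    """Sketch of worker-aware sharding for an iterable dataset."""

    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __iter__(self):
        info = torch.utils.data.get_worker_info()
        paths = self.file_paths
        if info is not None:
            # round-robin shard so each worker reads a disjoint subset
            paths = paths[info.id :: info.num_workers]
        for path in paths:
            yield path  # placeholder for the real chunk-reading logic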
@@ -221,20 +220,13 @@ class RecDataLoader(FeatureSet):
 
         if isinstance(data, DataLoader):
             return data
-        if isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(
-                path=data,
-                batch_size=batch_size,
-                shuffle=shuffle,
-                streaming=streaming,
-                chunk_size=chunk_size,
-                num_workers=num_workers,
-            )
-        elif (
+
+        is_path_list = (
             isinstance(data, list)
             and data
             and all(isinstance(p, (str, os.PathLike)) for p in data)
-        )
+        )
+        if isinstance(data, (str, os.PathLike)) or is_path_list:
             return self.create_from_path(
                 path=data,
                 batch_size=batch_size,
@@ -243,7 +235,8 @@ class RecDataLoader(FeatureSet):
                 chunk_size=chunk_size,
                 num_workers=num_workers,
             )
-        elif isinstance(data, (dict, pd.DataFrame)):
+
+        if isinstance(data, (dict, pd.DataFrame)):
             return self.create_from_memory(
                 data=data,
                 batch_size=batch_size,
@@ -251,10 +244,8 @@ class RecDataLoader(FeatureSet):
             num_workers=num_workers,
             sampler=sampler,
         )
-        else:
-            raise ValueError(
-                f"[RecDataLoader Error] Unsupported data type: {type(data)}"
-            )
+
+        raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
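After the refactor the dispatcher falls through a flat sequence of checks instead of an if/elif chain. Illustrative calls; the dispatching method's name is not visible in these hunks and is assumed here to be create, and rec_loader is a hypothetical RecDataLoader instance:

loader = rec_loader.create(existing_dataloader)   # DataLoader -> returned unchanged
loader = rec_loader.create("train.parquet")       # str / os.PathLike -> create_from_path
loader = rec_loader.create(["a.csv", "b.csv"])    # non-empty list of paths -> create_from_path
loader = rec_loader.create(train_df)              # dict / pd.DataFrame -> create_from_memory
loader = rec_loader.create(12345)                 # anything else -> ValueError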
@@ -264,7 +255,6 @@ class RecDataLoader(FeatureSet):
         num_workers: int = 0,
         sampler=None,
     ) -> DataLoader:
-
         raw_data = data
 
         if self.processor is not None:
@@ -309,17 +299,24 @@ class RecDataLoader(FeatureSet):
         file_paths = [str(Path(p)) for p in path]
         if not file_paths:
             raise ValueError("[RecDataLoader Error] Empty file path list provided.")
-
-
-
-
-
-
-
+
+        from nextrec.utils.data import get_file_format_from_extension
+
+        file_formats = set()
+        for p in file_paths:
+            fmt = get_file_format_from_extension(Path(p).suffix)
+            if fmt is None:
+                raise ValueError(
+                    f"[RecDataLoader Error] Unsupported file extension: {Path(p).suffix}"
+                )
+            file_formats.add(fmt)
+
+        if len(file_formats) != 1:
             raise ValueError(
-                f"[RecDataLoader Error]
+                f"[RecDataLoader Error] Mixed file types in provided list: {', '.join(file_formats)}. "
+                "Please use a single format per DataLoader."
             )
-        file_type =
+        file_type = file_formats.pop()
         if streaming:
             return self.load_files_streaming(
                 file_paths,
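A plausible sketch of get_file_format_from_extension, inferred purely from the call above (it takes a suffix such as ".parquet" and returns a format key or None); it reuses the FILE_FORMAT_CONFIG shape sketched earlier and is not the actual implementation:

def get_file_format_from_extension(suffix: str) -> str | None:
    suffix = suffix.lower()
    for fmt, cfg in FILE_FORMAT_CONFIG.items():
        if suffix in cfg["extension"]:
            return fmt
    return None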
@@ -329,31 +326,30 @@ class RecDataLoader(FeatureSet):
             shuffle,
             num_workers=num_workers,
         )
-
-
-
-
-
-
-
-
-            pass
-        try:
-            df = read_table(file_path, data_format=file_type)
-            dfs.append(df)
-        except MemoryError as exc:
-            raise MemoryError(
-                f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
-            ) from exc
+
+        dfs = []
+        total_bytes = 0
+        for file_path in file_paths:
+            try:
+                total_bytes += os.path.getsize(file_path)
+            except OSError:
+                pass
             try:
-
+                df = read_table(file_path, data_format=file_type)
+                dfs.append(df)
             except MemoryError as exc:
                 raise MemoryError(
-                    f"[RecDataLoader Error] Out of memory while
+                    f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
                 ) from exc
-
-
-
+        try:
+            combined_df = pd.concat(dfs, ignore_index=True)
+        except MemoryError as exc:
+            raise MemoryError(
+                f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
+            ) from exc
+        return self.create_from_memory(
+            combined_df, batch_size, shuffle, num_workers=num_workers
+        )
 
     def load_files_streaming(
         self,
@@ -364,6 +360,11 @@ class RecDataLoader(FeatureSet):
         shuffle: bool,
         num_workers: int = 0,
     ) -> DataLoader:
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[RecDataLoader Error] Format '{file_type}' does not support streaming reads. "
+                "Use streaming=False or convert data to csv/parquet."
+            )
         if shuffle:
             logging.info(
                 "[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset)."
@@ -420,22 +421,21 @@ def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
                     f"[RecDataLoader Error] Sequence feature '{feature.name}' expects numeric sequences; found string values."
                 )
             if isinstance(seq, (list, tuple, np.ndarray)):
-
+                sequences.append(np.asarray(seq, dtype=np.int64))
             else:
-
-                sequences.append(arr)
+                sequences.append(np.asarray([seq], dtype=np.int64))
         max_len = getattr(feature, "max_len", 0)
         if max_len <= 0:
             max_len = max((len(seq) for seq in sequences), default=1)
         pad_value = getattr(feature, "padding_idx", 0)
-        padded = [
-
-
-
-
-
-
-
+        padded = [
+            (
+                seq[:max_len]
+                if len(seq) > max_len
+                else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
+            )
+            for seq in sequences
+        ]
         column = np.stack(padded)
     elif column.ndim == 1:
         column = column.reshape(-1, 1)
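Concrete behavior of the rebuilt padding/truncation comprehension, with max_len=4 and pad_value=0 chosen for illustration:

import numpy as np

sequences = [np.array([5, 6]), np.array([1, 2, 3, 4, 9])]
max_len, pad_value = 4, 0
padded = [
    seq[:max_len]
    if len(seq) > max_len
    else np.pad(seq, (0, max_len - len(seq)), constant_values=pad_value)
    for seq in sequences
]
# padded -> [array([5, 6, 0, 0]), array([1, 2, 3, 4])]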
@@ -456,9 +456,7 @@ def build_tensors_from_data(
             raise ValueError(
                 f"[RecDataLoader Error] Feature column '{feature.name}' not found in data"
             )
-        if isinstance(
-            feature, SequenceFeature
-        ):  # sequence feature will do padding/truncation again to avoid the case when input data is not preprocessed
+        if isinstance(feature, SequenceFeature):
             arr = normalize_sequence_column(column, feature)
             tensor = to_tensor(arr, dtype=torch.long)
         elif isinstance(feature, DenseFeature):
nextrec/data/preprocessor.py
CHANGED
@@ -2,7 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
 Date: create on 13/11/2025
-Checkpoint: edit on
+Checkpoint: edit on 24/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
@@ -34,6 +34,8 @@ from nextrec.basic.session import resolve_save_path
 from nextrec.data.data_processing import hash_md5_mod
 from nextrec.utils.console import progress
 from nextrec.utils.data import (
+    FILE_FORMAT_CONFIG,
+    check_streaming_support,
     default_output_dir,
     iter_file_chunks,
     load_dataframes,
@@ -239,17 +241,9 @@ class DataProcessor(FeatureSet):
                 dtype=np.int64,
                 count=sparse_series.size,
             )
-
-
-            raise ValueError(
-                f"[Data Processor Error] LabelEncoder for {name} not fitted"
-            )
-        cat = pd.Categorical(sparse_series, categories=le.classes_)
-        codes = cat.codes  # -1 indicates unknown category
-        unk_index = 0
-        if "<UNK>" in le.classes_:
-            unk_index = int(list(le.classes_).index("<UNK>"))
-        return np.where(codes < 0, unk_index, codes).astype(np.int64, copy=False)
+            raise ValueError(
+                f"[Data Processor Error] Token index for {name} not fitted"
+            )
 
         if encode_method == "hash":
             hash_size = config["hash_size"]
@@ -298,13 +292,11 @@ class DataProcessor(FeatureSet):
         split_fn = str.split
         is_nan = np.isnan
         if encode_method == "label":
-            class_to_idx = config.get("_token_to_idx")
+            class_to_idx = config.get("_token_to_idx")
             if class_to_idx is None:
-
-
-
-                class_to_idx = {cls: idx for idx, cls in enumerate(le.classes_)}
-                config["_class_to_idx"] = class_to_idx
+                raise ValueError(
+                    f"[Data Processor Error] Token index for {name} not fitted"
+                )
             unk_index = int(config.get("_unk_index", class_to_idx.get("<UNK>", 0)))
         else:
             class_to_idx = None  # type: ignore
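Both raise sites enforce the same contract: transform no longer rebuilds a label index on the fly (the removed pd.Categorical/LabelEncoder fallback), so config["_token_to_idx"] must be populated by a prior fit. A sketch of the resulting usage contract; method names other than those visible in this diff are assumptions:

processor = DataProcessor(...)        # feature definitions omitted
processor.fit(train_df)               # assumed fit entry point; builds "_token_to_idx"
out = processor.transform(train_df)   # fine after fitting
# calling transform before fit now raises:
# ValueError: [Data Processor Error] Token index for <feature> not fitted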
@@ -429,6 +421,12 @@ class DataProcessor(FeatureSet):
             )
         )
         file_paths, file_type = resolve_file_paths(path)
+        if not check_streaming_support(file_type):
+            raise ValueError(
+                f"[DataProcessor Error] Format '{file_type}' does not support streaming. "
+                "fit_from_path only supports streaming formats (csv, parquet) to avoid high memory usage. "
+                "Use fit(dataframe) with in-memory data or convert the data format."
+            )
 
         numeric_acc: Dict[str, Dict[str, float]] = {}
         for name in self.numeric_features.keys():
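Illustrative effect of the new guard (fit_from_path is named in the error text; the paths are made up):

processor.fit_from_path("data/train_parts/")   # csv/parquet -> streamed fit, bounded memory
processor.fit_from_path("data/train.xlsx")     # excel -> ValueError: does not support streaming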
@@ -607,17 +605,16 @@ class DataProcessor(FeatureSet):
         data: Union[pd.DataFrame, Dict[str, Any]],
         return_dict: bool,
         persist: bool,
-        save_format: Optional[
+        save_format: Optional[str],
         output_path: Optional[str],
         warn_missing: bool = True,
     ):
         logger = logging.getLogger()
-        is_dataframe = isinstance(data, pd.DataFrame)
         data_dict = data if isinstance(data, dict) else None
 
-        result_dict
-        if
-        df
+        result_dict = {}
+        if isinstance(data, pd.DataFrame):
+            df = data  # type: ignore[assignment]
             for col in df.columns:
                 result_dict[col] = df[col].to_numpy(copy=False)
         else:
@@ -631,7 +628,7 @@ class DataProcessor(FeatureSet):
             else:
                 result_dict[key] = np.asarray(value)
 
-        data_columns = data.columns if
+        data_columns = data.columns if isinstance(data, pd.DataFrame) else data_dict
         feature_groups = [
             ("Numeric", self.numeric_features, self.process_numeric_feature_transform),
             ("Sparse", self.sparse_features, self.process_sparse_feature_transform),
@@ -651,7 +648,7 @@ class DataProcessor(FeatureSet):
                 continue
             series_data = (
                 data[name]
-                if
+                if isinstance(data, pd.DataFrame)
                 else pd.Series(result_dict[name], name=name)
             )
             result_dict[name] = transform_fn(series_data, config)
@@ -666,8 +663,6 @@ class DataProcessor(FeatureSet):
                 columns_dict[key] = value
             return pd.DataFrame(columns_dict)
 
-        if save_format not in [None, "csv", "parquet"]:
-            raise ValueError("save_format must be either 'csv', 'parquet', or None")
         effective_format = save_format
         if persist:
             effective_format = save_format or "parquet"
@@ -675,6 +670,8 @@ class DataProcessor(FeatureSet):
         if (not return_dict) or persist:
             result_df = dict_to_dataframe(result_dict)
             if persist:
+                if effective_format not in FILE_FORMAT_CONFIG:
+                    raise ValueError(f"Unsupported save format: {effective_format}")
                 if output_path is None:
                     raise ValueError(
                         "[Data Processor Error] output_path must be provided when persisting transformed data."
@@ -683,12 +680,25 @@ class DataProcessor(FeatureSet):
                 if output_dir.suffix:
                     output_dir = output_dir.parent
                 output_dir.mkdir(parents=True, exist_ok=True)
-
+
+                suffix = FILE_FORMAT_CONFIG[effective_format]["extension"][0]
+                save_path = output_dir / f"transformed_data{suffix}"
                 assert result_df is not None, "DataFrame conversion failed"
-
+
+                # Save based on format
+                if effective_format == "csv":
+                    result_df.to_csv(save_path, index=False)
+                elif effective_format == "parquet":
                     result_df.to_parquet(save_path, index=False)
+                elif effective_format == "feather":
+                    result_df.to_feather(save_path)
+                elif effective_format == "excel":
+                    result_df.to_excel(save_path, index=False)
+                elif effective_format == "hdf5":
+                    result_df.to_hdf(save_path, key="data", mode="w")
                 else:
-
+                    raise ValueError(f"Unsupported save format: {effective_format}")
+
                 logger.info(
                     colorize(
                         f"Transformed data saved to: {save_path.resolve()}", color="green"
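The same dispatch could be table-driven; a sketch for comparison (the PANDAS_WRITERS table is not in the codebase, while the to_* calls are real pandas API):

PANDAS_WRITERS = {
    "csv": lambda df, path: df.to_csv(path, index=False),
    "parquet": lambda df, path: df.to_parquet(path, index=False),
    "feather": lambda df, path: df.to_feather(path),
    "excel": lambda df, path: df.to_excel(path, index=False),
    "hdf5": lambda df, path: df.to_hdf(path, key="data", mode="w"),
}
PANDAS_WRITERS[effective_format](result_df, save_path)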
@@ -703,7 +713,7 @@ class DataProcessor(FeatureSet):
         self,
         input_path: str,
         output_path: Optional[str],
-        save_format: Optional[
+        save_format: Optional[str],
         chunk_size: int = 200000,
     ):
         """Transform data from files under a path and save them to a new location.
@@ -713,8 +723,21 @@ class DataProcessor(FeatureSet):
         logger = logging.getLogger()
         file_paths, file_type = resolve_file_paths(input_path)
         target_format = save_format or file_type
-        if target_format not in
-            raise ValueError("
+        if target_format not in FILE_FORMAT_CONFIG:
+            raise ValueError(f"Unsupported format: {target_format}")
+        if chunk_size > 0 and not check_streaming_support(file_type):
+            raise ValueError(
+                f"Input format '{file_type}' does not support streaming reads. "
+                "Set chunk_size<=0 to use full-load transform."
+            )
+
+        # Warn about streaming support
+        if not check_streaming_support(target_format):
+            logger.warning(
+                f"[Data Processor Warning] Format '{target_format}' does not support streaming writes. "
+                "Large files may require more memory. Use csv or parquet for better streaming support."
+            )
+
         base_output_dir = (
             Path(output_path) if output_path else default_output_dir(input_path)
         )
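A hypothetical call exercising these guards; the method name is not visible in this hunk (only its docstring is), and the paths are invented:

processor.transform_from_path(
    input_path="data/raw/",
    output_path="data/processed/",
    save_format="parquet",   # or None to keep the input format
    chunk_size=200000,       # <=0 forces the full-load fallback below
)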
@@ -725,10 +748,10 @@ class DataProcessor(FeatureSet):
         saved_paths = []
         for file_path in progress(file_paths, description="Transforming files"):
             source_path = Path(file_path)
-
+            suffix = FILE_FORMAT_CONFIG[target_format]["extension"][0]
+            target_file = output_root / f"{source_path.stem}{suffix}"
 
             # Stream transform for large files
-
             if chunk_size <= 0:
                 # fallback to full load behavior
                 df = read_table(file_path, file_type)
@@ -743,16 +766,28 @@ class DataProcessor(FeatureSet):
                 assert isinstance(
                     transformed_df, pd.DataFrame
                 ), "[Data Processor Error] Expected DataFrame when return_dict=False"
+
+                # Save based on format
                 if target_format == "csv":
                     transformed_df.to_csv(target_file, index=False)
-                else:
+                elif target_format == "parquet":
                     transformed_df.to_parquet(target_file, index=False)
+                elif target_format == "feather":
+                    transformed_df.to_feather(target_file)
+                elif target_format == "excel":
+                    transformed_df.to_excel(target_file, index=False)
+                elif target_format == "hdf5":
+                    transformed_df.to_hdf(target_file, key="data", mode="w")
+                else:
+                    raise ValueError(f"Unsupported format: {target_format}")
+
                 saved_paths.append(str(target_file.resolve()))
                 continue
 
             first_chunk = True
+            # Streaming write for supported formats
             if target_format == "parquet":
-
+                parquet_writer = None
                 try:
                     for chunk in iter_file_chunks(file_path, file_type, chunk_size):
                         transformed_df = self.transform_in_memory(
@@ -769,16 +804,15 @@ class DataProcessor(FeatureSet):
                         table = pa.Table.from_pandas(
                             transformed_df, preserve_index=False
                         )
-                        if
-
-
+                        if parquet_writer is None:
+                            parquet_writer = pq.ParquetWriter(target_file, table.schema)
+                        parquet_writer.write_table(table)
                         first_chunk = False
                 finally:
-                    if
-
-
+                    if parquet_writer is not None:
+                        parquet_writer.close()
+            elif target_format == "csv":
                 # CSV: append chunks; header only once
-                # (truncate first to avoid mixing with existing files)
                 target_file.parent.mkdir(parents=True, exist_ok=True)
                 with open(target_file, "w", encoding="utf-8", newline="") as f:
                     f.write("")
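A minimal standalone version of the chunked parquet write pattern used above (pa/pq being pyarrow and pyarrow.parquet, as in the diff; "chunks" stands in for any iterable of DataFrames):

import pyarrow as pa
import pyarrow.parquet as pq

writer = None
try:
    for chunk_df in chunks:
        table = pa.Table.from_pandas(chunk_df, preserve_index=False)
        if writer is None:
            # the schema is fixed by the first chunk
            writer = pq.ParquetWriter("out.parquet", table.schema)
        writer.write_table(table)
finally:
    if writer is not None:
        writer.close()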
@@ -798,6 +832,34 @@ class DataProcessor(FeatureSet):
                         target_file, index=False, mode="a", header=first_chunk
                     )
                     first_chunk = False
+            else:
+                # Non-streaming formats: collect all chunks and save once
+                logger.warning(
+                    f"Format '{target_format}' doesn't support streaming writes. "
+                    f"Collecting all chunks in memory before saving."
+                )
+                all_chunks = []
+                for chunk in iter_file_chunks(file_path, file_type, chunk_size):
+                    transformed_df = self.transform_in_memory(
+                        chunk,
+                        return_dict=False,
+                        persist=False,
+                        save_format=None,
+                        output_path=None,
+                        warn_missing=first_chunk,
+                    )
+                    assert isinstance(transformed_df, pd.DataFrame)
+                    all_chunks.append(transformed_df)
+                    first_chunk = False
+
+                if all_chunks:
+                    combined_df = pd.concat(all_chunks, ignore_index=True)
+                    if target_format == "feather":
+                        combined_df.to_feather(target_file)
+                    elif target_format == "excel":
+                        combined_df.to_excel(target_file, index=False)
+                    elif target_format == "hdf5":
+                        combined_df.to_hdf(target_file, key="data", mode="w")
 
             saved_paths.append(str(target_file.resolve()))
             logger.info(
@@ -849,7 +911,7 @@ class DataProcessor(FeatureSet):
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        save_format: Optional[
+        save_format: Optional[str] = None,
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):
@@ -877,7 +939,7 @@ class DataProcessor(FeatureSet):
         self,
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         return_dict: bool = True,
-        save_format: Optional[
+        save_format: Optional[str] = None,
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):