PyPI - nextrec - Versions diffs - 0.4.32__py3-none-any.whl → 0.4.33__py3-none-any.whl - Mend

nextrec 0.4.32__py3-none-any.whl → 0.4.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

nextrec/__version__.py +1 -1
nextrec/basic/model.py +12 -8
nextrec/basic/summary.py +2 -1
nextrec/cli.py +40 -31
nextrec/data/preprocessor.py +72 -25
{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/METADATA +4 -4
{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/RECORD +10 -10
{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/WHEEL +0 -0
{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/entry_points.txt +0 -0
{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/licenses/LICENSE +0 -0

nextrec/__version__.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.4.32"
1	+ __version__ = "0.4.33"

nextrec/basic/model.py CHANGED Viewed

@@ -933,6 +933,13 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         existing_callbacks = self.callbacks.callbacks
+        has_validation = valid_data is not None or valid_split is not None
+        checkpoint_monitor = monitor_metric
+        checkpoint_mode = self.best_metrics_mode
+        if not has_validation:
+            checkpoint_monitor = "loss"
+            checkpoint_mode = "min"
         if self.early_stop_patience > 0 and not any(
             isinstance(cb, EarlyStopper) for cb in existing_callbacks
         ):
@@ -946,6 +953,8 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                 )
             )
+        has_validation = valid_data is not None or valid_split is not None
         if self.is_main_process and not any(
             isinstance(cb, CheckpointSaver) for cb in existing_callbacks
         ):
@@ -953,9 +962,9 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                 CheckpointSaver(
                     best_path=self.best_path,
                     checkpoint_path=self.checkpoint_path,
-                    monitor=monitor_metric,
-                    mode=self.best_metrics_mode,
-                    save_best_only=True,
+                    monitor=checkpoint_monitor,
+                    mode=checkpoint_mode,
+                    save_best_only=has_validation,
                     verbose=1,
                 )
             )
@@ -1246,11 +1255,6 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                         epoch_logs[f"val_{k}"] = v
             else:
                 epoch_logs = {**train_log_payload}
-                if self.is_main_process:
-                    self.save_model(
-                        self.checkpoint_path, add_timestamp=False, verbose=False
-                    )
-                    self.best_checkpoint_path = self.checkpoint_path
             # Call on_epoch_end for all callbacks (handles early stopping, checkpointing, lr scheduling)
             self.callbacks.on_epoch_end(epoch, epoch_logs)

nextrec/basic/summary.py CHANGED Viewed

@@ -73,7 +73,8 @@ class SummarySet:
     def build_data_summary(
         self, data: Any, data_loader: DataLoader | None, sample_key: str
     ):
-        dataset = data_loader.dataset if data_loader else None
+        dataset = data_loader.dataset if data_loader is not None else None
         train_size = get_data_length(dataset)
         if train_size is None:

nextrec/cli.py CHANGED Viewed

@@ -152,16 +152,19 @@ def train_model(train_config_path: str) -> None:
     )
     if data_cfg.get("valid_ratio") is not None:
         logger.info(format_kv("Valid ratio", data_cfg.get("valid_ratio")))
-    if data_cfg.get("val_path") or data_cfg.get("valid_path"):
+    if data_cfg.get("valid_path"):
         logger.info(
             format_kv(
                 "Validation path",
                 resolve_path(
-                    data_cfg.get("val_path") or data_cfg.get("valid_path"), config_dir
+                    data_cfg.get("valid_path"), config_dir
                 ),
             )
         )
+    # Determine validation dataset path early for streaming split / fitting
+    val_data_path = data_cfg.get("valid_path")
     if streaming:
         file_paths, file_type = resolve_file_paths(str(data_path))
         log_kv_lines(
@@ -180,6 +183,34 @@ def train_model(train_config_path: str) -> None:
             raise ValueError(f"Data file is empty: {first_file}") from exc
         df_columns = list(first_chunk.columns)
+        # Decide training/validation file lists before fitting processor, to avoid
+        # leaking validation statistics into preprocessing (scalers/encoders).
+        streaming_train_files = file_paths
+        streaming_valid_ratio = data_cfg.get("valid_ratio")
+        if val_data_path:
+            streaming_valid_files = None
+        elif streaming_valid_ratio is not None:
+            ratio = float(streaming_valid_ratio)
+            if not (0 < ratio < 1):
+                raise ValueError(
+                    f"[NextRec CLI Error] Valid_ratio must be between 0 and 1, current value is {streaming_valid_ratio}"
+                )
+            total_files = len(file_paths)
+            if total_files < 2:
+                raise ValueError(
+                    "[NextRec CLI Error] Must provide valid_path or increase the number of data files. At least 2 files are required for streaming validation split."
+                )
+            val_count = max(1, int(round(total_files * ratio)))
+            if val_count >= total_files:
+                val_count = total_files - 1
+            streaming_valid_files = file_paths[-val_count:]
+            streaming_train_files = file_paths[:-val_count]
+            logger.info(
+                f"Split files for streaming training and validation using valid_ratio={ratio:.3f}: training {len(streaming_train_files)} files, validation {len(streaming_valid_files)} files"
+            )
+        else:
+            streaming_valid_files = None
     else:
         df = read_table(data_path, data_cfg.get("format"))
         logger.info(format_kv("Rows", len(df)))
@@ -215,7 +246,13 @@ def train_model(train_config_path: str) -> None:
     )
     if streaming:
-        processor.fit(str(data_path), chunk_size=dataloader_chunk_size)
+        if file_type is None:
+            raise ValueError("[NextRec CLI Error] Streaming mode requires a valid file_type")
+        processor.fit_from_files(
+            file_paths=streaming_train_files or file_paths,
+            file_type=file_type,
+            chunk_size=dataloader_chunk_size,
+        )
         processed = None
         df = None  # type: ignore[assignment]
     else:
@@ -232,34 +269,6 @@ def train_model(train_config_path: str) -> None:
         sequence_names,
     )
-    # Check if validation dataset path is specified
-    val_data_path = data_cfg.get("val_path") or data_cfg.get("valid_path")
-    if streaming:
-        if not file_paths:
-            file_paths, file_type = resolve_file_paths(str(data_path))
-        streaming_train_files = file_paths
-        streaming_valid_ratio = data_cfg.get("valid_ratio")
-        if val_data_path:
-            streaming_valid_files = None
-        elif streaming_valid_ratio is not None:
-            ratio = float(streaming_valid_ratio)
-            if not (0 < ratio < 1):
-                raise ValueError(
-                    f"[NextRec CLI Error] Valid_ratio must be between 0 and 1, current value is {streaming_valid_ratio}"
-                )
-            total_files = len(file_paths)
-            if total_files < 2:
-                raise ValueError(
-                    "[NextRec CLI Error] Must provide val_path or increase the number of data files. At least 2 files are required for streaming validation split."
-                )
-            val_count = max(1, int(round(total_files * ratio)))
-            if val_count >= total_files:
-                val_count = total_files - 1
-            streaming_valid_files = file_paths[-val_count:]
-            streaming_train_files = file_paths[:-val_count]
-            logger.info(
-                f"Split files for streaming training and validation using valid_ratio={ratio:.3f}: training {len(streaming_train_files)} files, validation {len(streaming_valid_files)} files"
-            )
     train_data: Dict[str, Any]
     valid_data: Dict[str, Any] | None

nextrec/data/preprocessor.py CHANGED Viewed

@@ -566,35 +566,16 @@ class DataProcessor(FeatureSet):
             return [str(v) for v in value]
         return [str(value)]
-    def fit_from_path(self, path: str, chunk_size: int) -> "DataProcessor":
-        """
-        Fit processor statistics by streaming files to reduce memory usage.
-        Args:
-            path (str): File or directory path.
-            chunk_size (int): Number of rows per chunk.
-        Returns:
-            DataProcessor: Fitted DataProcessor instance.
-        """
+    def fit_from_file_paths(
+        self, file_paths: list[str], file_type: str, chunk_size: int
+    ) -> "DataProcessor":
         logger = logging.getLogger()
-        logger.info(
-            colorize(
-                "Fitting DataProcessor (streaming path mode)...",
-                color="cyan",
-                bold=True,
-            )
-        )
-        for config in self.sparse_features.values():
-            config.pop("_min_freq_logged", None)
-        for config in self.sequence_features.values():
-            config.pop("_min_freq_logged", None)
-        file_paths, file_type = resolve_file_paths(path)
+        if not file_paths:
+            raise ValueError("[DataProcessor Error] Empty file list for streaming fit")
         if not check_streaming_support(file_type):
             raise ValueError(
                 f"[DataProcessor Error] Format '{file_type}' does not support streaming. "
-                "fit_from_path only supports streaming formats (csv, parquet) to avoid high memory usage. "
-                "Use fit(dataframe) with in-memory data or convert the data format."
+                "Streaming fit only supports csv, parquet to avoid high memory usage."
             )
         numeric_acc = {}
@@ -636,6 +617,7 @@ class DataProcessor(FeatureSet):
         target_values: Dict[str, set[Any]] = {
             name: set() for name in self.target_features.keys()
         }
         missing_features = set()
         for file_path in file_paths:
             for chunk in iter_file_chunks(file_path, file_type, chunk_size):
@@ -702,10 +684,12 @@ class DataProcessor(FeatureSet):
                 for name in self.target_features.keys() & columns:
                     vals = chunk[name].dropna().tolist()
                     target_values[name].update(vals)
         if missing_features:
             logger.warning(
                 f"The following configured features were not found in provided files: {sorted(missing_features)}"
             )
         # finalize numeric scalers
         for name, config in self.numeric_features.items():
             acc = numeric_acc[name]
@@ -895,6 +879,69 @@ class DataProcessor(FeatureSet):
         )
         return self
+    def fit_from_files(
+        self, file_paths: list[str], file_type: str, chunk_size: int
+    ) -> "DataProcessor":
+        """Fit processor statistics by streaming an explicit list of files.
+        This is useful when you want to fit statistics on training files only (exclude
+        validation files) in streaming mode.
+        """
+        logger = logging.getLogger()
+        logger.info(
+            colorize(
+                "Fitting DataProcessor (streaming files mode)...",
+                color="cyan",
+                bold=True,
+            )
+        )
+        for config in self.sparse_features.values():
+            config.pop("_min_freq_logged", None)
+        for config in self.sequence_features.values():
+            config.pop("_min_freq_logged", None)
+        uses_robust = any(
+            cfg.get("scaler") == "robust" for cfg in self.numeric_features.values()
+        )
+        if uses_robust:
+            logger.warning(
+                "Robust scaler requires full data; loading provided files into memory. "
+                "Consider smaller chunk_size or different scaler if memory is limited."
+            )
+            frames = [read_table(p, file_type) for p in file_paths]
+            df = pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
+            return self.fit(df)
+        return self.fit_from_file_paths(file_paths=file_paths, file_type=file_type, chunk_size=chunk_size)
+    def fit_from_path(self, path: str, chunk_size: int) -> "DataProcessor":
+        """
+        Fit processor statistics by streaming files to reduce memory usage.
+        Args:
+            path (str): File or directory path.
+            chunk_size (int): Number of rows per chunk.
+        Returns:
+            DataProcessor: Fitted DataProcessor instance.
+        """
+        logger = logging.getLogger()
+        logger.info(
+            colorize(
+                "Fitting DataProcessor (streaming path mode)...",
+                color="cyan",
+                bold=True,
+            )
+        )
+        for config in self.sparse_features.values():
+            config.pop("_min_freq_logged", None)
+        for config in self.sequence_features.values():
+            config.pop("_min_freq_logged", None)
+        file_paths, file_type = resolve_file_paths(path)
+        return self.fit_from_file_paths(
+            file_paths=file_paths,
+            file_type=file_type,
+            chunk_size=chunk_size,
+        )
     @overload
     def transform_in_memory(
         self,

{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nextrec
-Version: 0.4.32
+Version: 0.4.33
 Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
 Project-URL: Homepage, https://github.com/zerolovesea/NextRec
 Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -69,7 +69,7 @@ Description-Content-Type: text/markdown
 ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
 ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
 ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
-![Version](https://img.shields.io/badge/Version-0.4.32-orange.svg)
+![Version](https://img.shields.io/badge/Version-0.4.33-orange.svg)
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/zerolovesea/NextRec)
 中文文档 | [English Version](README_en.md)
@@ -254,11 +254,11 @@ nextrec --mode=predict --predict_config=path/to/predict_config.yaml
 预测结果固定保存到 `{checkpoint_path}/predictions/{name}.{save_data_format}`。
-> 截止当前版本0.4.32，NextRec CLI支持单机训练，分布式训练相关功能尚在开发中。
+> 截止当前版本0.4.33，NextRec CLI支持单机训练，分布式训练相关功能尚在开发中。
 ## 兼容平台
-当前最新版本为0.4.32，所有模型和测试代码均已在以下平台通过验证，如果开发者在使用中遇到兼容问题，请在issue区提出错误报告及系统版本：
+当前最新版本为0.4.33，所有模型和测试代码均已在以下平台通过验证，如果开发者在使用中遇到兼容问题，请在issue区提出错误报告及系统版本：
 | 平台 | 配置 |
 |------|------|

{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 nextrec/__init__.py,sha256=_M3oUqyuvQ5k8Th_3wId6hQ_caclh7M5ad51XN09m98,235
-nextrec/__version__.py,sha256=W0DtmvTLu6FQL6tby9DrJltesCOu7Q36WFhsT2wLrgM,23
-nextrec/cli.py,sha256=hFDL_HlukJxdp4FU486g977Rix9OkGdEPGBj2HxqCGo,25393
+nextrec/__version__.py,sha256=O_0xE0g6EcJfkv7qWx5tmF2cs2K3UCW8uX8xzUqd7Rs,23
+nextrec/cli.py,sha256=k7gOrPfb3zmyUDxZipUNCFn-PaKCwUKbyJHhgpt-lyc,25673
 nextrec/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nextrec/basic/activation.py,sha256=uekcJsOy8SiT0_NaDO2VNSStyYFzVikDFVLDk-VrjwQ,2949
 nextrec/basic/asserts.py,sha256=U1EKovV_OT7_Mm99zFvdfF2hccFREp3gdDaeRjfiBwQ,2249
@@ -10,15 +10,15 @@ nextrec/basic/heads.py,sha256=BshykLxD41KxKuZaBxf4Fmy1Mc52b3ioJliN1BVaGlk,3374
 nextrec/basic/layers.py,sha256=tr8XFOcTvUHEZ6T3zJwmtKMA-u_xfzHloIkItGs821U,40084
 nextrec/basic/loggers.py,sha256=LAfnhdSNEzHybrXaKxCWoAML1c2A-FJF6atpfrrm_Kw,13840
 nextrec/basic/metrics.py,sha256=CPzENDcpO6QTDZLBtQlfAGKUYYQc0FT-eaMKJ4MURFo,23396
-nextrec/basic/model.py,sha256=uAC3wFKJcRUAgsvfc9hXhhfp1iILqvTSbA7Ryohn-bc,111590
+nextrec/basic/model.py,sha256=Psm1lfAScyDmkK-TmA7pjvI_Hg1IkZ02XgnqJVmvwyw,111699
 nextrec/basic/session.py,sha256=mrIsjRJhmvcAfoO1pXX-KB3SK5CCgz89wH8XDoAiGEI,4475
-nextrec/basic/summary.py,sha256=b6jLo70gqZj_bQ4eb5yb8SXmr2ilZlKNN293EyVnkyc,17759
+nextrec/basic/summary.py,sha256=MkzFwWJH3K76O0Gxqm3rVfbmWHqokvK2OBDe7WFQymo,17780
 nextrec/data/__init__.py,sha256=YZQjpty1pDCM7q_YNmiA2sa5kbujUw26ObLHWjMPjKY,1194
 nextrec/data/batch_utils.py,sha256=TbnXYqYlmK51dJthaL6dO7LTn4wyp8221I-kdgvpvDE,3542
 nextrec/data/data_processing.py,sha256=lhuwYxWp4Ts2bbuLGDt2LmuPrOy7pNcKczd2uVcQ4ss,6476
 nextrec/data/data_utils.py,sha256=0Ls1cnG9lBz0ovtyedw5vwp7WegGK_iF-F8e_3DEddo,880
 nextrec/data/dataloader.py,sha256=2sXwoiWxupKE-V1QYeZlXjK1yJyxhDtlOhknAnJF8Wk,19727
-nextrec/data/preprocessor.py,sha256=n2ZDR4o_-5nouBgCluWlVrXRkA9AoRaO7EvXPZAQvJg,66734
+nextrec/data/preprocessor.py,sha256=vZR7GnVALHMjQ3d-Bvd0mtkKj0nrkzndMib3vHY570Q,68496
 nextrec/loss/__init__.py,sha256=rualGsY-IBvmM52q9eOBk0MyKcMkpkazcscOeDXi_SM,774
 nextrec/loss/grad_norm.py,sha256=YoE_XSIN1HOUcNq1dpfkIlWtMaB5Pu-SEWDaNgtRw1M,8316
 nextrec/loss/listwise.py,sha256=mluxXQt9XiuWGvXA1nk4I0miqaKB6_GPVQqxLhAiJKs,5999
@@ -88,8 +88,8 @@ nextrec/utils/loss.py,sha256=GBWQGpDaYkMJySpdG078XbeUNXUC34PVqFy0AqNS9N0,4578
 nextrec/utils/model.py,sha256=PI9y8oWz1lhktgapZsiXb8rTr2NrFFlc80tr4yOFHik,5334
 nextrec/utils/torch_utils.py,sha256=UQpWS7F3nITYqvx2KRBaQJc9oTowRkIvowhuQLt6NFM,11953
 nextrec/utils/types.py,sha256=G88DHXFv-mbg-XY-7Xwwh1qvh6WM9jpAsBjw5VuBcio,1559
-nextrec-0.4.32.dist-info/METADATA,sha256=QkHGZMQg5HZLeO0PpByGa-FiNAeclrEeLinbF_K0Jik,23188
-nextrec-0.4.32.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-nextrec-0.4.32.dist-info/entry_points.txt,sha256=NN-dNSdfMRTv86bNXM7d3ZEPW2BQC6bRi7QP7i9cIps,45
-nextrec-0.4.32.dist-info/licenses/LICENSE,sha256=COP1BsqnEUwdx6GCkMjxOo5v3pUe4-Go_CdmQmSfYXM,1064
-nextrec-0.4.32.dist-info/RECORD,,
+nextrec-0.4.33.dist-info/METADATA,sha256=f9PQhSjuU2I32jNDBnVA5YA7K0yiTgnrV0S3QVPSHQU,23188
+nextrec-0.4.33.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+nextrec-0.4.33.dist-info/entry_points.txt,sha256=NN-dNSdfMRTv86bNXM7d3ZEPW2BQC6bRi7QP7i9cIps,45
+nextrec-0.4.33.dist-info/licenses/LICENSE,sha256=COP1BsqnEUwdx6GCkMjxOo5v3pUe4-Go_CdmQmSfYXM,1064
+nextrec-0.4.33.dist-info/RECORD,,

{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/WHEEL RENAMED Viewed

File without changes

{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nextrec-0.4.32.dist-info → nextrec-0.4.33.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

nextrec 0.4.32__py3-none-any.whl → 0.4.33__py3-none-any.whl

nextrec 0.4.32__py3-none-any.whl → 0.4.33__py3-none-any.whl