nextrec: 0.3.3-py3-none-any.whl → 0.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/loggers.py +71 -8
- nextrec/basic/model.py +41 -9
- nextrec/data/dataloader.py +2 -2
- nextrec/data/preprocessor.py +33 -69
- {nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/METADATA +3 -3
- {nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/RECORD +9 -9
- {nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/WHEEL +0 -0
- {nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/licenses/LICENSE +0 -0
nextrec/__version__.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.3.3"
+__version__ = "0.3.4"
nextrec/basic/loggers.py
CHANGED

@@ -2,17 +2,19 @@
 NextRec Basic Loggers

 Date: create on 27/10/2025
-Checkpoint: edit on
+Checkpoint: edit on 03/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """

-
 import os
 import re
 import sys
+import json
 import copy
 import logging
-
+import numbers
+from typing import Mapping, Any
+from nextrec.basic.session import create_session, Session

 ANSI_CODES = {
     'black': '\033[30m',

@@ -77,17 +79,12 @@ def colorize(text: str, color: str | None = None, bold: bool = False) -> str:
     """Apply ANSI color and bold formatting to the given text."""
     if not color and not bold:
         return text
-
     result = ""
-
     if bold:
         result += ANSI_BOLD
-
     if color and color in ANSI_CODES:
         result += ANSI_CODES[color]
-
     result += text + ANSI_RESET
-
     return result

 def setup_logger(session_id: str | os.PathLike | None = None):

@@ -126,3 +123,69 @@ def setup_logger(session_id: str | os.PathLike | None = None):
     logger.addHandler(console_handler)

     return logger
+
+class TrainingLogger:
+    def __init__(
+        self,
+        session: Session,
+        enable_tensorboard: bool,
+        log_name: str = "training_metrics.jsonl",
+    ) -> None:
+        self.session = session
+        self.enable_tensorboard = enable_tensorboard
+        self.log_path = session.metrics_dir / log_name
+        self.log_path.parent.mkdir(parents=True, exist_ok=True)
+
+        self.tb_writer = None
+        self.tb_dir = None
+
+        if self.enable_tensorboard:
+            self._init_tensorboard()
+
+    def _init_tensorboard(self) -> None:
+        try:
+            from torch.utils.tensorboard import SummaryWriter  # type: ignore
+        except ImportError:
+            logging.warning("[TrainingLogger] tensorboard not installed, disable tensorboard logging.")
+            self.enable_tensorboard = False
+            return
+        tb_dir = self.session.logs_dir / "tensorboard"
+        tb_dir.mkdir(parents=True, exist_ok=True)
+        self.tb_dir = tb_dir
+        self.tb_writer = SummaryWriter(log_dir=str(tb_dir))
+
+    @property
+    def tensorboard_logdir(self):
+        return self.tb_dir
+
+    def format_metrics(self, metrics: Mapping[str, Any], split: str) -> dict[str, float]:
+        formatted: dict[str, float] = {}
+        for key, value in metrics.items():
+            if isinstance(value, numbers.Number):
+                formatted[f"{split}/{key}"] = float(value)
+            elif hasattr(value, "item"):
+                try:
+                    formatted[f"{split}/{key}"] = float(value.item())
+                except Exception:
+                    continue
+        return formatted
+
+    def log_metrics(self, metrics: Mapping[str, Any], step: int, split: str = "train") -> None:
+        payload = self.format_metrics(metrics, split)
+        payload["step"] = int(step)
+        with self.log_path.open("a", encoding="utf-8") as f:
+            f.write(json.dumps(payload, ensure_ascii=False) + "\n")
+
+        if not self.tb_writer:
+            return
+        step = int(payload.get("step", 0))
+        for key, value in payload.items():
+            if key == "step":
+                continue
+            self.tb_writer.add_scalar(key, value, global_step=step)
+
+    def close(self) -> None:
+        if self.tb_writer:
+            self.tb_writer.flush()
+            self.tb_writer.close()
+            self.tb_writer = None
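The new TrainingLogger appends one JSON line per call to training_metrics.jsonl under the session's metrics directory and mirrors scalar (or tensor-like, via .item()) values to TensorBoard when torch.utils.tensorboard is importable. A minimal standalone sketch of how it could be driven; the create_session call and its argument are assumptions, since BaseModel.fit normally constructs the logger itself:

    from nextrec.basic.session import create_session   # factory assumed; see nextrec.basic.session
    from nextrec.basic.loggers import TrainingLogger

    # Hedged sketch: BaseModel.fit wires this up automatically; shown standalone for clarity.
    session = create_session("demo_run")                # argument is illustrative
    tlog = TrainingLogger(session=session, enable_tensorboard=True)

    for epoch in range(1, 4):
        # Keys are prefixed per split (e.g. "train/loss"), appended as one JSON line per call,
        # and mirrored to <logs_dir>/tensorboard when TensorBoard is installed.
        tlog.log_metrics({"loss": 0.5 / epoch, "auc": 0.70 + 0.01 * epoch}, step=epoch, split="train")

    tlog.close()  # flush and release the SummaryWriter if TensorBoard logging was active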
nextrec/basic/model.py
CHANGED

@@ -10,6 +10,8 @@ import os
 import tqdm
 import pickle
 import logging
+import getpass
+import socket
 import numpy as np
 import pandas as pd
 import torch

@@ -24,7 +26,7 @@ from nextrec.basic.callback import EarlyStopper
 from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
 from nextrec.data.dataloader import TensorDictDataset, RecDataLoader

-from nextrec.basic.loggers import setup_logger, colorize
+from nextrec.basic.loggers import setup_logger, colorize, TrainingLogger
 from nextrec.basic.session import resolve_save_path, create_session
 from nextrec.basic.metrics import configure_metrics, evaluate_metrics, check_user_id

@@ -88,6 +90,7 @@ class BaseModel(FeatureSet, nn.Module):
         self.early_stop_patience = early_stop_patience
         self.max_gradient_norm = 1.0
         self.logger_initialized = False
+        self.training_logger: TrainingLogger | None = None

     def register_regularization_weights(self, embedding_attr: str = "embedding", exclude_modules: list[str] | None = None, include_modules: list[str] | None = None) -> None:
         exclude_modules = exclude_modules or []

@@ -275,11 +278,13 @@ class BaseModel(FeatureSet, nn.Module):
             metrics: list[str] | dict[str, list[str]] | None = None, # ['auc', 'logloss'] or {'target1': ['auc', 'logloss'], 'target2': ['mse']}
             epochs:int=1, shuffle:bool=True, batch_size:int=32,
             user_id_column: str | None = None,
-            validation_split: float | None = None
+            validation_split: float | None = None,
+            tensorboard: bool = True,):
         self.to(self.device)
         if not self.logger_initialized:
             setup_logger(session_id=self.session_id)
             self.logger_initialized = True
+        self.training_logger = TrainingLogger(session=self.session, enable_tensorboard=tensorboard)

         self.metrics, self.task_specific_metrics, self.best_metrics_mode = configure_metrics(task=self.task, metrics=metrics, target_names=self.target_columns) # ['auc', 'logloss'], {'target1': ['auc', 'logloss'], 'target2': ['mse']}, 'max'
         self.early_stopper = EarlyStopper(patience=self.early_stop_patience, mode=self.best_metrics_mode)

@@ -303,6 +308,20 @@ class BaseModel(FeatureSet, nn.Module):
             is_streaming = True

         self.summary()
+        logging.info("")
+        if self.training_logger and self.training_logger.enable_tensorboard:
+            tb_dir = self.training_logger.tensorboard_logdir
+            if tb_dir:
+                user = getpass.getuser()
+                host = socket.gethostname()
+                tb_cmd = f"tensorboard --logdir {tb_dir} --port 6006"
+                ssh_hint = f"ssh -L 6006:localhost:6006 {user}@{host}"
+                logging.info(colorize(f"TensorBoard logs saved to: {tb_dir}", color="cyan"))
+                logging.info(colorize("To view logs, run:", color="cyan"))
+                logging.info(colorize(f" {tb_cmd}", color="cyan"))
+                logging.info(colorize("Then SSH port forward:", color="cyan"))
+                logging.info(colorize(f" {ssh_hint}", color="cyan"))
+
         logging.info("")
         logging.info(colorize("=" * 80, bold=True))
         if is_streaming:

@@ -312,7 +331,7 @@ class BaseModel(FeatureSet, nn.Module):
         logging.info(colorize("=" * 80, bold=True))
         logging.info("")
         logging.info(colorize(f"Model device: {self.device}", bold=True))
-
+
         for epoch in range(epochs):
             self.epoch_index = epoch
             if is_streaming:

@@ -326,7 +345,8 @@ class BaseModel(FeatureSet, nn.Module):
             else:
                 train_loss = train_result
                 train_metrics = None
-
+
+            train_log_payload: dict[str, float] = {}
             # handle logging for single-task and multi-task
             if self.nums_task == 1:
                 log_str = f"Epoch {epoch + 1}/{epochs} - Train: loss={train_loss:.4f}"

@@ -334,6 +354,9 @@ class BaseModel(FeatureSet, nn.Module):
                     metrics_str = ", ".join([f"{k}={v:.4f}" for k, v in train_metrics.items()])
                     log_str += f", {metrics_str}"
                 logging.info(colorize(log_str))
+                train_log_payload["loss"] = float(train_loss)
+                if train_metrics:
+                    train_log_payload.update(train_metrics)
             else:
                 total_loss_val = np.sum(train_loss) if isinstance(train_loss, np.ndarray) else train_loss # type: ignore
                 log_str = f"Epoch {epoch + 1}/{epochs} - Train: loss={total_loss_val:.4f}"

@@ -356,12 +379,17 @@ class BaseModel(FeatureSet, nn.Module):
                     task_metric_strs.append(f"{target_name}[{metrics_str}]")
                 log_str += ", " + ", ".join(task_metric_strs)
                 logging.info(colorize(log_str))
+                train_log_payload["loss"] = float(total_loss_val)
+                if train_metrics:
+                    train_log_payload.update(train_metrics)
+            if self.training_logger:
+                self.training_logger.log_metrics(train_log_payload, step=epoch + 1, split="train")
             if valid_loader is not None:
                 # pass user_ids only if needed for GAUC metric
                 val_metrics = self.evaluate(valid_loader, user_ids=valid_user_ids if self.needs_user_ids else None) # {'auc': 0.75, 'logloss': 0.45} or {'auc_target1': 0.75, 'logloss_target1': 0.45, 'mse_target2': 3.2}
                 if self.nums_task == 1:
                     metrics_str = ", ".join([f"{k}={v:.4f}" for k, v in val_metrics.items()])
-                    logging.info(colorize(f"Epoch {epoch + 1}/{epochs} - Valid: {metrics_str}", color="cyan"))
+                    logging.info(colorize(f" Epoch {epoch + 1}/{epochs} - Valid: {metrics_str}", color="cyan"))
                 else:
                     # multi task metrics
                     task_metrics = {}

@@ -378,7 +406,9 @@ class BaseModel(FeatureSet, nn.Module):
                         if target_name in task_metrics:
                             metrics_str = ", ".join([f"{k}={v:.4f}" for k, v in task_metrics[target_name].items()])
                             task_metric_strs.append(f"{target_name}[{metrics_str}]")
-                    logging.info(colorize(f"Epoch {epoch + 1}/{epochs} - Valid: " + ", ".join(task_metric_strs), color="cyan"))
+                    logging.info(colorize(f" Epoch {epoch + 1}/{epochs} - Valid: " + ", ".join(task_metric_strs), color="cyan"))
+                if val_metrics and self.training_logger:
+                    self.training_logger.log_metrics(val_metrics, step=epoch + 1, split="valid")
                 # Handle empty validation metrics
                 if not val_metrics:
                     self.save_model(self.checkpoint_path, add_timestamp=False, verbose=False)

@@ -401,6 +431,7 @@ class BaseModel(FeatureSet, nn.Module):
                     self.best_metric = primary_metric
                     improved = True
                     self.save_model(self.checkpoint_path, add_timestamp=False, verbose=False)
+                logging.info(" ")
                 if improved:
                     logging.info(colorize(f"Validation {primary_metric_key} improved to {self.best_metric:.4f}"))
                     self.save_model(self.best_path, add_timestamp=False, verbose=False)

@@ -431,6 +462,8 @@ class BaseModel(FeatureSet, nn.Module):
         if valid_loader is not None:
             logging.info(colorize(f"Load best model from: {self.best_checkpoint_path}"))
             self.load_model(self.best_checkpoint_path, map_location=self.device, verbose=False)
+        if self.training_logger:
+            self.training_logger.close()
         return self

     def train_epoch(self, train_loader: DataLoader, is_streaming: bool = False) -> Union[float, np.ndarray, tuple[Union[float, np.ndarray], dict]]:

@@ -527,6 +560,7 @@ class BaseModel(FeatureSet, nn.Module):
                 batch_user_id = get_user_ids(data=batch_dict, id_columns=self.id_columns)
                 if batch_user_id is not None:
                     collected_user_ids.append(batch_user_id)
+        logging.info(" ")
         logging.info(colorize(f" Evaluation batches processed: {batch_count}", color="cyan"))
         if len(y_true_list) > 0:
             y_true_all = np.concatenate(y_true_list, axis=0)

@@ -956,9 +990,7 @@ class BaseModel(FeatureSet, nn.Module):
         logger.info(f" Session ID: {self.session_id}")
         logger.info(f" Features Config Path: {self.features_config_path}")
         logger.info(f" Latest Checkpoint: {self.checkpoint_path}")
-
-        logger.info("")
-        logger.info("")
+


 class BaseMatchModel(BaseModel):
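On the training side, fit() now takes a tensorboard flag (default True) and builds a TrainingLogger for the run, so per-epoch train/valid metrics land in the JSONL file and, when TensorBoard is installed, under the session's logs/tensorboard directory, with a startup hint for remote viewing. A hedged sketch of the call site; the model and data names, and the data-passing arguments, are placeholders because they sit outside this diff:

    # `model`, `train_df`, `valid_df` are placeholders for a concrete NextRec model and its data;
    # only metrics/epochs/batch_size/validation_split/tensorboard appear in this diff's signature.
    model.fit(
        train_df,
        metrics=["auc", "logloss"],
        epochs=10,
        batch_size=512,
        validation_split=0.1,
        tensorboard=True,   # new in 0.3.4; set False to skip TensorBoard logging entirely
    )

    # When TensorBoard logging is active, fit() prints the equivalent of:
    #   tensorboard --logdir <session_logs_dir>/tensorboard --port 6006
    #   ssh -L 6006:localhost:6006 <user>@<host>   # port-forward when training on a remote machine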
nextrec/data/dataloader.py
CHANGED

@@ -185,9 +185,9 @@ class RecDataLoader(FeatureSet):
                        chunk_size: int,
                        shuffle: bool) -> DataLoader:
         if shuffle:
-            logging.
+            logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
         if batch_size != 1:
-            logging.
+            logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
         dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
         return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)

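The reworded messages make the streaming contract explicit: each item the file dataset yields is already a chunk of rows, so the outer DataLoader runs with batch_size=1 and the effective batch size is whatever chunk_size produces; row-level shuffling is therefore not available. A generic PyTorch illustration of that pattern, not NextRec's FileDataset, just the underlying idea:

    import torch
    from torch.utils.data import IterableDataset, DataLoader

    class ChunkStream(IterableDataset):
        """Yields whole pre-built chunks; shuffling individual rows is not possible in this mode."""
        def __init__(self, num_rows: int, chunk_size: int):
            self.num_rows = num_rows
            self.chunk_size = chunk_size

        def __iter__(self):
            for start in range(0, self.num_rows, self.chunk_size):
                stop = min(start + self.chunk_size, self.num_rows)
                yield torch.arange(start, stop)   # stand-in for a preprocessed feature chunk

    # batch_size=1 plus an unwrapping collate_fn: each "batch" is exactly one chunk.
    loader = DataLoader(ChunkStream(num_rows=10, chunk_size=4), batch_size=1,
                        collate_fn=lambda items: items[0])
    print([chunk.tolist() for chunk in loader])   # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]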
nextrec/data/preprocessor.py
CHANGED

@@ -38,26 +38,6 @@ from nextrec.__version__ import __version__


 class DataProcessor(FeatureSet):
-    """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
-
-    Examples:
-        >>> processor = DataProcessor()
-        >>> processor.add_numeric_feature('age', scaler='standard')
-        >>> processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
-        >>> processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
-        >>> processor.add_target('label', target_type='binary')
-        >>>
-        >>> # Fit and transform data
-        >>> processor.fit(train_df)
-        >>> processed_data = processor.transform(test_df) # Returns dict of numpy arrays
-        >>>
-        >>> # Save and load processor
-        >>> processor.save('processor.pkl')
-        >>> loaded_processor = DataProcessor.load('processor.pkl')
-        >>>
-        >>> # Get vocabulary sizes for embedding layers
-        >>> vocab_sizes = processor.get_vocab_sizes()
-    """
     def __init__(self):
         self.numeric_features: Dict[str, Dict[str, Any]] = {}
         self.sparse_features: Dict[str, Dict[str, Any]] = {}

@@ -132,10 +112,10 @@ class DataProcessor(FeatureSet):
         }
         self.set_target_id(list(self.target_features.keys()), [])

-    def
+    def hash_string(self, s: str, hash_size: int) -> int:
         return int(hashlib.md5(str(s).encode()).hexdigest(), 16) % hash_size

-    def
+    def process_numeric_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         scaler_type = config['scaler']
         fill_na = config['fill_na']

@@ -164,7 +144,7 @@ class DataProcessor(FeatureSet):
         scaler.fit(values)
         self.scalers[name] = scaler

-    def
+    def process_numeric_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         scaler_type = config['scaler']

@@ -184,7 +164,7 @@ class DataProcessor(FeatureSet):
         result = scaler.transform(values.reshape(-1, 1)).ravel()
         return result

-    def
+    def process_sparse_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na'] # <UNK>

@@ -197,7 +177,7 @@ class DataProcessor(FeatureSet):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def
+    def process_sparse_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         name = str(data.name)
         encode_method = config['encode_method']
         fill_na = config['fill_na']

@@ -215,11 +195,11 @@ class DataProcessor(FeatureSet):
             return encoded.to_numpy()
         if encode_method == 'hash':
             hash_size = config['hash_size']
-            hash_fn = self.
+            hash_fn = self.hash_string
             return np.fromiter((hash_fn(v, hash_size) for v in sparse_series.to_numpy()), dtype=np.int64, count=sparse_series.size,)
         return np.array([], dtype=np.int64)

-    def
+    def process_sequence_feature_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         encode_method = config['encode_method']
         separator = config['separator']

@@ -252,7 +232,7 @@ class DataProcessor(FeatureSet):
         elif encode_method == 'hash':
             config['vocab_size'] = config['hash_size']

-    def
+    def process_sequence_feature_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         """Optimized sequence transform with preallocation and cached vocab map."""
         name = str(data.name)
         encode_method = config['encode_method']

@@ -276,7 +256,7 @@ class DataProcessor(FeatureSet):
             config['_class_to_idx'] = class_to_idx
         else:
             class_to_idx = None # type: ignore
-        hash_fn = self.
+        hash_fn = self.hash_string
         hash_size = config.get('hash_size')
         for i, seq in enumerate(arr):
             # normalize sequence to a list of strings

@@ -301,11 +281,7 @@ class DataProcessor(FeatureSet):
             elif encode_method == 'hash':
                 if hash_size is None:
                     raise ValueError("hash_size must be set for hash encoding")
-                encoded = [
-                    hash_fn(str(token), hash_size)
-                    for token in tokens
-                    if str(token).strip()
-                ]
+                encoded = [hash_fn(str(token), hash_size) for token in tokens if str(token).strip()]
             else:
                 encoded = []
             if not encoded:

@@ -315,7 +291,7 @@ class DataProcessor(FeatureSet):
             output[i, : len(encoded)] = encoded
         return output

-    def
+    def process_target_fit(self, data: pd.Series, config: Dict[str, Any]):
         name = str(data.name)
         target_type = config['target_type']
         label_map = config.get('label_map')

@@ -334,7 +310,7 @@ class DataProcessor(FeatureSet):
             config['label_map'] = label_map
             self.target_encoders[name] = label_map

-    def
+    def process_target_transform(self, data: pd.Series, config: Dict[str, Any]) -> np.ndarray:
         logger = logging.getLogger()
         name = str(data.name)
         target_type = config.get('target_type')

@@ -355,13 +331,13 @@ class DataProcessor(FeatureSet):
                 result.append(0)
         return np.array(result, dtype=np.int64 if target_type == 'multiclass' else np.float32)

-    def
+    def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
         """Load all data from a file or directory path into a single DataFrame."""
         file_paths, file_type = resolve_file_paths(path)
         frames = load_dataframes(file_paths, file_type)
         return pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]

-    def
+    def extract_sequence_tokens(self, value: Any, separator: str) -> list[str]:
         """Extract sequence tokens from a single value."""
         if value is None:
             return []

@@ -374,7 +350,7 @@ class DataProcessor(FeatureSet):
             return [str(v) for v in value]
         return [str(value)]

-    def
+    def fit_from_path(self, path: str, chunk_size: int) -> 'DataProcessor':
         """Fit processor statistics by streaming files to reduce memory usage."""
         logger = logging.getLogger()
         logger.info(colorize("Fitting DataProcessor (streaming path mode)...", color="cyan", bold=True))

@@ -433,7 +409,7 @@ class DataProcessor(FeatureSet):
                 series = chunk[name]
                 tokens = []
                 for val in series:
-                    tokens.extend(self.
+                    tokens.extend(self.extract_sequence_tokens(val, separator))
                 seq_vocab[name].update(tokens)

         # target features

@@ -548,7 +524,7 @@ class DataProcessor(FeatureSet):
         logger.info(colorize("DataProcessor fitted successfully (streaming path mode)", color="green", bold=True))
         return self

-    def
+    def transform_in_memory(
         self,
         data: Union[pd.DataFrame, Dict[str, Any]],
         return_dict: bool,

@@ -581,7 +557,7 @@ class DataProcessor(FeatureSet):
                 continue
             # Convert to Series for processing
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_numeric_feature_transform(series_data, config)
             result_dict[name] = processed

         # process sparse features

@@ -590,7 +566,7 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_sparse_feature_transform(series_data, config)
             result_dict[name] = processed

         # process sequence features

@@ -599,7 +575,7 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_sequence_feature_transform(series_data, config)
             result_dict[name] = processed

         # process target features

@@ -608,10 +584,10 @@ class DataProcessor(FeatureSet):
                 logger.warning(f"Target {name} not found in data")
                 continue
             series_data = pd.Series(data_dict[name], name=name)
-            processed = self.
+            processed = self.process_target_transform(series_data, config)
             result_dict[name] = processed

-        def
+        def dict_to_dataframe(result: Dict[str, np.ndarray]) -> pd.DataFrame:
             # Convert all arrays to Series/lists at once to avoid fragmentation
             columns_dict = {}
             for key, value in result.items():

@@ -629,7 +605,7 @@ class DataProcessor(FeatureSet):
         effective_format = save_format or "parquet"
         result_df = None
         if (not return_dict) or persist:
-            result_df =
+            result_df = dict_to_dataframe(result_dict)
         if persist:
             if output_path is None:
                 raise ValueError("output_path must be provided when persisting transformed data.")

@@ -649,7 +625,7 @@ class DataProcessor(FeatureSet):
         assert result_df is not None, "DataFrame is None after transform"
         return result_df

-    def
+    def transform_path(
         self,
         input_path: str,
         output_path: Optional[str],

@@ -669,13 +645,7 @@ class DataProcessor(FeatureSet):
         saved_paths = []
         for file_path in tqdm.tqdm(file_paths, desc="Transforming files", unit="file"):
             df = read_table(file_path, file_type)
-            transformed_df = self.
-                df,
-                return_dict=False,
-                persist=False,
-                save_format=None,
-                output_path=None,
-            )
+            transformed_df = self.transform_in_memory(df, return_dict=False, persist=False, save_format=None, output_path=None)
             assert isinstance(transformed_df, pd.DataFrame), "Expected DataFrame when return_dict=False"
             source_path = Path(file_path)
             target_file = output_root / f"{source_path.stem}.{target_format}"

@@ -695,9 +665,9 @@ class DataProcessor(FeatureSet):
         uses_robust = any(cfg.get("scaler") == "robust" for cfg in self.numeric_features.values())
         if uses_robust:
             logger.warning("Robust scaler requires full data; loading all files into memory. Consider smaller chunk_size or different scaler if memory is limited.")
-            data = self.
+            data = self.load_dataframe_from_path(path_str)
         else:
-            return self.
+            return self.fit_from_path(path_str, chunk_size)
         if isinstance(data, dict):
             data = pd.DataFrame(data)
         logger.info(colorize("Fitting DataProcessor...", color="cyan", bold=True))

@@ -705,22 +675,22 @@ class DataProcessor(FeatureSet):
             if name not in data.columns:
                 logger.warning(f"Numeric feature {name} not found in data")
                 continue
-            self.
+            self.process_numeric_feature_fit(data[name], config)
         for name, config in self.sparse_features.items():
             if name not in data.columns:
                 logger.warning(f"Sparse feature {name} not found in data")
                 continue
-            self.
+            self.process_sparse_feature_fit(data[name], config)
         for name, config in self.sequence_features.items():
             if name not in data.columns:
                 logger.warning(f"Sequence feature {name} not found in data")
                 continue
-            self.
+            self.process_sequence_feature_fit(data[name], config)
         for name, config in self.target_features.items():
             if name not in data.columns:
                 logger.warning(f"Target {name} not found in data")
                 continue
-            self.
+            self.process_target_fit(data[name], config)
         self.is_fitted = True
         return self

@@ -736,14 +706,8 @@ class DataProcessor(FeatureSet):
         if isinstance(data, (str, os.PathLike)):
             if return_dict:
                 raise ValueError("Path transform writes files only; set return_dict=False when passing a path.")
-            return self.
-        return self.
-            data=data,
-            return_dict=return_dict,
-            persist=output_path is not None,
-            save_format=save_format,
-            output_path=output_path,
-        )
+            return self.transform_path(str(data), output_path, save_format)
+        return self.transform_in_memory(data=data, return_dict=return_dict, persist=output_path is not None, save_format=save_format, output_path=output_path)

     def fit_transform(
         self,
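The long usage docstring was dropped from the class body in this release; the same workflow, taken from that removed docstring, still applies and only reaches the renamed helpers indirectly through fit/transform. The import path below is an assumption, and train_df/test_df are placeholder DataFrames:

    from nextrec.data.preprocessor import DataProcessor   # assumed import path

    processor = DataProcessor()
    processor.add_numeric_feature('age', scaler='standard')
    processor.add_sparse_feature('user_id', encode_method='hash', hash_size=10000)
    processor.add_sequence_feature('item_history', encode_method='label', max_len=50, pad_value=0)
    processor.add_target('label', target_type='binary')

    processor.fit(train_df)                    # a file/directory path streams via fit_from_path unless a robust scaler forces a full load
    processed = processor.transform(test_df)   # dict of numpy arrays; return_dict=False yields a DataFrame instead

    processor.save('processor.pkl')
    loaded = DataProcessor.load('processor.pkl')
    vocab_sizes = loaded.get_vocab_sizes()     # vocabulary sizes for embedding layers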
{nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nextrec
-Version: 0.3.3
+Version: 0.3.4
 Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
 Project-URL: Homepage, https://github.com/zerolovesea/NextRec
 Project-URL: Repository, https://github.com/zerolovesea/NextRec

@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
 
 
 
-
+

 English | [中文文档](README_zh.md)


@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
 - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
 - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)

-> Current version [0.3.
+> Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.

 ## 5-Minute Quick Start

{nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/RECORD
CHANGED

@@ -1,18 +1,18 @@
 nextrec/__init__.py,sha256=CvocnY2uBp0cjNkhrT6ogw0q2bN9s1GNp754FLO-7lo,1117
-nextrec/__version__.py,sha256=
+nextrec/__version__.py,sha256=oYLGMpySamd16KLiaBTfRyrAS7_oyp-TOEHmzmeumwg,22
 nextrec/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nextrec/basic/activation.py,sha256=1qs9pq4hT3BUxIiYdYs57axMCm4-JyOBFQ6x7xkHTwM,2849
 nextrec/basic/callback.py,sha256=wwh0I2kKYyywCB-sG9eQXShlpXFJIo75qApJmnI5p6c,1036
 nextrec/basic/features.py,sha256=-RRRbEPU-SFI-GtppflW6O0bKShUsV-Hg_lTGpo3AIE,4262
 nextrec/basic/layers.py,sha256=zzEseKYVnMVs1Tg5EGrFimugId15jI6HumgzjFyRqgw,23127
-nextrec/basic/loggers.py,sha256=
+nextrec/basic/loggers.py,sha256=hh9tRMmaCTaJ_sfRHIlbcqd6BcpK63vpZ_21TFCiKLI,6148
 nextrec/basic/metrics.py,sha256=8-hMZJXU5L4F8GnToxMZey5dlBrtFyRtTuI_zoQCtIo,21579
-nextrec/basic/model.py,sha256=
+nextrec/basic/model.py,sha256=afnvicyxXMgWdvhrIUaoNnZ7S-QYRYr7fTY5bdM1u_s,68829
 nextrec/basic/session.py,sha256=oaATn-nzbJ9A6SGbMut9xLV_NSh9_1KmVDeNauS06Ps,4767
 nextrec/data/__init__.py,sha256=6WgXZafzzXcv5kuxKNi67O8BJZVl_P_HM2IZCDIIhPA,1052
 nextrec/data/data_utils.py,sha256=aOyja3Yu7O2c8eIeL3P8MyUlUR5EerOUT9UeF4ATq8o,10574
-nextrec/data/dataloader.py,sha256=
-nextrec/data/preprocessor.py,sha256=
+nextrec/data/dataloader.py,sha256=2MLe69y0E1cTZyzMNgyLUCxa6lllGd1ntvwpXzxdX10,14199
+nextrec/data/preprocessor.py,sha256=lhigpjvkEqsjTRfbBBOjgGOxoPyOifwq2LoswgyIVqc,40488
 nextrec/loss/__init__.py,sha256=mO5t417BneZ8Ysa51GyjDaffjWyjzFgPXIQrrggasaQ,827
 nextrec/loss/listwise.py,sha256=gxDbO1td5IeS28jKzdE35o1KAYBRdCYoMzyZzfNLhc0,5689
 nextrec/loss/loss_utils.py,sha256=uZ4m9ChLr-UgIc5Yxm1LjwXDDepApQ-Fas8njweZ9qg,2641

@@ -51,7 +51,7 @@ nextrec/utils/common.py,sha256=NYXnBVtUCtm8epT2ZxJHn_m1SIBBI_PEjZ5VpL465ls,2009
 nextrec/utils/embedding.py,sha256=yxYSdFx0cJITh3Gf-K4SdhwRtKGcI0jOsyBgZ0NLa_c,465
 nextrec/utils/initializer.py,sha256=ffYOs5QuIns_d_-5e40iNtg6s1ftgREJN-ueq_NbDQE,1647
 nextrec/utils/optimizer.py,sha256=EUjAGFPeyou_Cv-_2HRvjzut8y_qpAQudc8L2T0k8zw,2706
-nextrec-0.3.
-nextrec-0.3.
-nextrec-0.3.
-nextrec-0.3.
+nextrec-0.3.4.dist-info/METADATA,sha256=X5fo5gymQdPXLgM1N03E58uFSQyuQOmdbUp8vXvKl0g,16319
+nextrec-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+nextrec-0.3.4.dist-info/licenses/LICENSE,sha256=2fQfVKeafywkni7MYHyClC6RGGC3laLTXCNBx-ubtp0,1064
+nextrec-0.3.4.dist-info/RECORD,,

{nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/WHEEL
File without changes

{nextrec-0.3.3.dist-info → nextrec-0.3.4.dist-info}/licenses/LICENSE
File without changes