nextrec-0.4.5-py3-none-any.whl → nextrec-0.4.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nextrec/__version__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5"
+__version__ = "0.4.7"
nextrec/basic/callback.py CHANGED
@@ -1,35 +1,413 @@
 """
-EarlyStopper definitions
+Callback System for Training Process
 
 Date: create on 27/10/2025
+Checkpoint: edit on 17/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
 import copy
+import logging
+from typing import Optional
+from pathlib import Path
+import torch
+import pickle
+from nextrec import __version__
 
 
-class EarlyStopper(object):
-    def __init__(self, patience: int = 20, mode: str = "max"):
+class Callback:
+    """
+    Base callback.
+
+    Notes (DDP):
+    - In distributed training, the training loop runs on every rank.
+    - For callbacks with side effects (saving, logging, etc.), set
+      ``run_on_main_process_only=True`` to avoid multi-rank duplication.
+    """
+
+    run_on_main_process_only: bool = False
+
+    def on_train_begin(self, logs: Optional[dict] = None):
+        pass
+
+    def on_train_end(self, logs: Optional[dict] = None):
+        pass
+
+    def on_epoch_begin(self, epoch: int, logs: Optional[dict] = None):
+        pass
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        pass
+
+    def on_batch_begin(self, batch: int, logs: Optional[dict] = None):
+        pass
+
+    def on_batch_end(self, batch: int, logs: Optional[dict] = None):
+        pass
+
+    def on_validation_begin(self, logs: Optional[dict] = None):
+        pass
+
+    def on_validation_end(self, logs: Optional[dict] = None):
+        pass
+
+    def set_model(self, model):
+        self.model = model
+
+    def set_params(self, params: dict):
+        self.params = params
+
+    def should_run(self) -> bool:
+        if not getattr(self, "run_on_main_process_only", False):
+            return True
+        model = getattr(self, "model", None)
+        if model is None:
+            return True
+        return bool(getattr(model, "is_main_process", True))
+
+
+class CallbackList:
+    """Container for managing multiple callbacks."""
+
+    def __init__(self, callbacks: Optional[list[Callback]] = None):
+        self.callbacks = callbacks or []
+
+    def append(self, callback: Callback):
+        self.callbacks.append(callback)
+
+    def set_model(self, model):
+        for callback in self.callbacks:
+            callback.set_model(model)
+
+    def set_params(self, params: dict):
+        for callback in self.callbacks:
+            callback.set_params(params)
+
+    def on_train_begin(self, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_train_begin(logs)
+
+    def on_train_end(self, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_train_end(logs)
+
+    def on_epoch_begin(self, epoch: int, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_epoch_begin(epoch, logs)
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_epoch_end(epoch, logs)
+
+    def on_batch_begin(self, batch: int, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_batch_begin(batch, logs)
+
+    def on_batch_end(self, batch: int, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_batch_end(batch, logs)
+
+    def on_validation_begin(self, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_validation_begin(logs)
+
+    def on_validation_end(self, logs: Optional[dict] = None):
+        for callback in self.callbacks:
+            if not callback.should_run():
+                continue
+            callback.on_validation_end(logs)
+
+
+class EarlyStopper(Callback):
+
+    def __init__(
+        self,
+        monitor: str = "val_auc",
+        patience: int = 20,
+        mode: str = "max",
+        min_delta: float = 0.0,
+        restore_best_weights: bool = True,
+        verbose: int = 1,
+    ):
+        super().__init__()
+        self.monitor = monitor
         self.patience = patience
-        self.trial_counter = 0
-        self.best_metrics = 0
+        self.mode = mode
+        self.min_delta = abs(min_delta)
+        self.restore_best_weights = restore_best_weights
+        self.verbose = verbose
+
+        self.wait = 0
+        self.stopped_epoch = 0
         self.best_weights = None
+        self.best_epoch = 0
+
+        if mode == "min":
+            self.best_value = float("inf")
+            self.monitor_op = lambda current, best: current < (best - self.min_delta)
+        elif mode == "max":
+            self.best_value = float("-inf")
+            self.monitor_op = lambda current, best: current > (best + self.min_delta)
+        else:
+            raise ValueError(f"mode must be 'min' or 'max', got {mode}")
+
+    def on_train_begin(self, logs: Optional[dict] = None):
+        self.wait = 0
+        self.stopped_epoch = 0
+        self.best_weights = None
+        self.best_epoch = 0
+        if self.mode == "min":
+            self.best_value = float("inf")
+        else:
+            self.best_value = float("-inf")
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        logs = logs or {}
+        current = logs.get(self.monitor)
+
+        if current is None:
+            if self.verbose > 0:
+                logging.warning(
+                    f"Early stopping conditioned on metric `{self.monitor}` "
+                    f"which is not available. Available metrics are: {','.join(list(logs.keys()))}"
+                )
+            return
+
+        if self.monitor_op(current, self.best_value):
+            self.best_value = current
+            self.best_epoch = epoch
+            self.wait = 0
+            if self.restore_best_weights:
+                self.best_weights = copy.deepcopy(self.model.state_dict())
+        else:
+            self.wait += 1
+            if self.wait >= self.patience:
+                self.stopped_epoch = epoch
+                if hasattr(self.model, "stop_training"):
+                    self.model.stop_training = True
+                if self.verbose > 0:
+                    logging.info(
+                        f"Early stopping triggered at epoch {epoch + 1}. "
+                        f"Best {self.monitor}: {self.best_value:.6f} at epoch {self.best_epoch + 1}"
+                    )
+
+    def on_train_end(self, logs: Optional[dict] = None):
+        if self.restore_best_weights and self.best_weights is not None:
+            if self.verbose > 0:
+                logging.info(
+                    f"Restoring model weights from epoch {self.best_epoch + 1} "
+                    f"with best {self.monitor}: {self.best_value:.6f}"
+                )
+            self.model.load_state_dict(self.best_weights)
+
+
+class CheckpointSaver(Callback):
+    """Callback to save model checkpoints during training.
+
+    Args:
+        save_path: Path to save checkpoints.
+        monitor: Metric name to monitor for saving best model.
+        mode: One of {'min', 'max'}.
+        save_best_only: If True, only save when the model is considered the "best".
+        save_freq: Frequency of checkpoint saving ('epoch' or integer for every N epochs).
+        verbose: Verbosity mode.
+    """
+
+    def __init__(
+        self,
+        save_path: str | Path,
+        monitor: str = "val_auc",
+        mode: str = "max",
+        save_best_only: bool = False,
+        save_freq: str | int = "epoch",
+        verbose: int = 1,
+        run_on_main_process_only: bool = True,
+    ):
+        super().__init__()
+        self.run_on_main_process_only = run_on_main_process_only
+        self.save_path = Path(save_path)
+        self.monitor = monitor
         self.mode = mode
+        self.save_best_only = save_best_only
+        self.save_freq = save_freq
+        self.verbose = verbose
 
-    def stop_training(self, val_metrics, weights):
-        if self.mode == "max":
-            if val_metrics > self.best_metrics:
-                self.best_metrics = val_metrics
-                self.trial_counter = 0
-                self.best_weights = copy.deepcopy(weights)
-        elif self.mode == "min":
-            if val_metrics < self.best_metrics:
-                self.best_metrics = val_metrics
-                self.trial_counter = 0
-                self.best_weights = copy.deepcopy(weights)
-                return False
-        elif self.trial_counter + 1 < self.patience:
-            self.trial_counter += 1
-            return False
+        if mode == "min":
+            self.best_value = float("inf")
+            self.monitor_op = lambda current, best: current < best
+        elif mode == "max":
+            self.best_value = float("-inf")
+            self.monitor_op = lambda current, best: current > best
         else:
-            return True
+            raise ValueError(f"mode must be 'min' or 'max', got {mode}")
+
+    def on_train_begin(self, logs: Optional[dict] = None):
+        if self.mode == "min":
+            self.best_value = float("inf")
+        else:
+            self.best_value = float("-inf")
+
+        # Create directory if it doesn't exist
+        self.save_path.parent.mkdir(parents=True, exist_ok=True)
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        logs = logs or {}
+
+        # Check if we should save this epoch
+        should_save = False
+        if self.save_freq == "epoch":
+            should_save = True
+        elif isinstance(self.save_freq, int) and (epoch + 1) % self.save_freq == 0:
+            should_save = True
+
+        if not should_save and self.save_best_only:
+            should_save = False
+
+        # Check if this is the best model
+        current = logs.get(self.monitor)
+        is_best = False
+
+        if current is not None and self.monitor_op(current, self.best_value):
+            self.best_value = current
+            is_best = True
+            should_save = True
+
+        if should_save:
+            if not self.save_best_only or is_best:
+                checkpoint_path = (
+                    self.save_path.parent
+                    / f"{self.save_path.stem}_epoch_{epoch + 1}{self.save_path.suffix}"
+                )
+                self.save_checkpoint(checkpoint_path, epoch, logs)
+
+            if is_best:
+                # Use save_path directly without adding _best suffix since it may already contain it
+                self.save_checkpoint(self.save_path, epoch, logs)
+                if self.verbose > 0:
+                    logging.info(
+                        f"Saved best model to {self.save_path} with {self.monitor}: {current:.6f}"
+                    )
+
+    def save_checkpoint(self, path: Path, epoch: int, logs: dict):
+
+        # Get the actual model (unwrap DDP if needed)
+        model_to_save = (
+            self.model.ddp_model.module
+            if getattr(self.model, "ddp_model", None) is not None
+            else self.model
+        )
+
+        # Save only state_dict to match BaseModel.save_model() format
+        torch.save(model_to_save.state_dict(), path)
+
+        # Also save features_config.pkl if it doesn't exist
+        config_path = path.parent / "features_config.pkl"
+        if not config_path.exists():
+            features_config = {
+                "all_features": self.model.all_features,
+                "target": self.model.target_columns,
+                "id_columns": self.model.id_columns,
+                "version": __version__,
+            }
+            with open(config_path, "wb") as f:
+                pickle.dump(features_config, f)
+
+        if self.verbose > 1:
+            logging.info(f"Saved checkpoint to {path}")
+
+
+class LearningRateScheduler(Callback):
+    """Callback for learning rate scheduling.
+
+    Args:
+        scheduler: Learning rate scheduler instance or name.
+        verbose: Verbosity mode.
+    """
+
+    def __init__(self, scheduler=None, verbose: int = 0):
+        super().__init__()
+        self.scheduler = scheduler
+        self.verbose = verbose
+
+    def on_train_begin(self, logs: Optional[dict] = None):
+        if self.scheduler is None and hasattr(self.model, "scheduler_fn"):
+            self.scheduler = self.model.scheduler_fn
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        if self.scheduler is not None:
+            # Get current lr before step
+            if hasattr(self.model, "optimizer_fn"):
+                old_lr = self.model.optimizer_fn.param_groups[0]["lr"]
+
+            # Step the scheduler
+            if hasattr(self.scheduler, "step"):
+                # Some schedulers need metrics
+                if "val_loss" in (logs or {}) and hasattr(self.scheduler, "mode"):
+                    self.scheduler.step(logs["val_loss"])
+                else:
+                    self.scheduler.step()
+
+            # Log new lr
+            if self.verbose > 0 and hasattr(self.model, "optimizer_fn"):
+                if getattr(self.model, "is_main_process", True):
+                    new_lr = self.model.optimizer_fn.param_groups[0]["lr"]
+                    if new_lr != old_lr:
+                        logging.info(
+                            f"Learning rate changed from {old_lr:.6e} to {new_lr:.6e}"
+                        )
+
+
+class MetricsLogger(Callback):
+    """Callback for logging training metrics.
+
+    Args:
+        log_freq: Frequency of logging ('epoch', 'batch', or integer for every N epochs/batches).
+        verbose: Verbosity mode.
+    """
+
+    def __init__(self, log_freq: str | int = "epoch", verbose: int = 1):
+        super().__init__()
+        self.run_on_main_process_only = True
+        self.log_freq = log_freq
+        self.verbose = verbose
+        self.batch_count = 0
+
+    def on_epoch_end(self, epoch: int, logs: Optional[dict] = None):
+        if self.verbose > 0 and (
+            self.log_freq == "epoch"
+            or (isinstance(self.log_freq, int) and (epoch + 1) % self.log_freq == 0)
+        ):
+            logs = logs or {}
+            metrics_str = " - ".join(
+                [
+                    f"{k}: {v:.6f}" if isinstance(v, float) else f"{k}: {v}"
+                    for k, v in logs.items()
+                ]
+            )
+            logging.info(f"Epoch {epoch + 1}: {metrics_str}")
+
+    def on_batch_end(self, batch: int, logs: Optional[dict] = None):
+        self.batch_count += 1
+        if self.verbose > 1 and self.log_freq == "batch":
+            logs = logs or {}
+            metrics_str = " - ".join(
+                [
+                    f"{k}: {v:.6f}" if isinstance(v, float) else f"{k}: {v}"
+                    for k, v in logs.items()
+                ]
+            )
+            logging.info(f"Batch {batch}: {metrics_str}")
nextrec/basic/features.py CHANGED
@@ -33,6 +33,8 @@ class SequenceFeature(BaseFeature):
         l1_reg: float = 0.0,
         l2_reg: float = 1e-5,
         trainable: bool = True,
+        pretrained_weight: torch.Tensor | None = None,
+        freeze_pretrained: bool = False,
     ):
         self.name = name
         self.vocab_size = vocab_size
@@ -47,6 +49,8 @@ class SequenceFeature(BaseFeature):
         self.l1_reg = l1_reg
         self.l2_reg = l2_reg
         self.trainable = trainable
+        self.pretrained_weight = pretrained_weight
+        self.freeze_pretrained = freeze_pretrained
 
 
 class SparseFeature(BaseFeature):
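The two new SequenceFeature fields carry a pretrained embedding table and a freeze flag; how nextrec's embedding layer consumes them is not part of this hunk. As an illustration only, the snippet below shows the standard PyTorch pattern such a (pretrained_weight, freeze_pretrained) pair typically maps to; the vocabulary size and dimensions are made up.

import torch
import torch.nn as nn

# Hypothetical pretrained item vectors (row i = embedding of token id i),
# e.g. exported from a word2vec or matrix-factorization run.
vocab_size, embed_dim = 10000, 32
pretrained_weight = torch.randn(vocab_size, embed_dim)

# freeze_pretrained=True would correspond to freeze=True here,
# keeping the table fixed during training.
embedding = nn.Embedding.from_pretrained(pretrained_weight, freeze=True)

token_ids = torch.randint(0, vocab_size, (4, 20))  # [batch, seq_len]
sequence_embeddings = embedding(token_ids)         # [4, 20, 32]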
nextrec/basic/layers.py CHANGED
@@ -496,12 +496,18 @@ class HadamardInteractionLayer(nn.Module):
 
 
 class MultiHeadSelfAttention(nn.Module):
+    """
+    Multi-Head Self-Attention layer with Flash Attention support.
+    Uses PyTorch 2.0+ scaled_dot_product_attention when available for better performance.
+    """
+
     def __init__(
         self,
         embedding_dim: int,
         num_heads: int = 2,
         dropout: float = 0.0,
         use_residual: bool = True,
+        use_layer_norm: bool = False,
     ):
         super().__init__()
         if embedding_dim % num_heads != 0:
@@ -512,45 +518,100 @@ class MultiHeadSelfAttention(nn.Module):
         self.num_heads = num_heads
         self.head_dim = embedding_dim // num_heads
         self.use_residual = use_residual
+        self.dropout_rate = dropout
+
         self.W_Q = nn.Linear(embedding_dim, embedding_dim, bias=False)
         self.W_K = nn.Linear(embedding_dim, embedding_dim, bias=False)
         self.W_V = nn.Linear(embedding_dim, embedding_dim, bias=False)
+        self.W_O = nn.Linear(embedding_dim, embedding_dim, bias=False)
+
         if self.use_residual:
             self.W_Res = nn.Linear(embedding_dim, embedding_dim, bias=False)
+        if use_layer_norm:
+            self.layer_norm = nn.LayerNorm(embedding_dim)
+        else:
+            self.layer_norm = None
+
         self.dropout = nn.Dropout(dropout)
+        # Check if Flash Attention is available
+        self.use_flash_attention = hasattr(F, "scaled_dot_product_attention")
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        batch_size, num_fields, _ = x.shape
-        Q = self.W_Q(x)  # [batch_size, num_fields, embedding_dim]
+    def forward(
+        self, x: torch.Tensor, attention_mask: torch.Tensor | None = None
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: [batch_size, seq_len, embedding_dim]
+            attention_mask: [batch_size, seq_len] or [batch_size, seq_len, seq_len], boolean mask where True indicates valid positions
+        Returns:
+            output: [batch_size, seq_len, embedding_dim]
+        """
+        batch_size, seq_len, _ = x.shape
+        Q = self.W_Q(x)  # [batch_size, seq_len, embedding_dim]
         K = self.W_K(x)
         V = self.W_V(x)
-        # Split into multiple heads: [batch_size, num_heads, num_fields, head_dim]
-        Q = Q.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
-            1, 2
-        )
-        K = K.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
-            1, 2
-        )
-        V = V.view(batch_size, num_fields, self.num_heads, self.head_dim).transpose(
-            1, 2
-        )
-        # Attention scores
-        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim**0.5)
-        attention_weights = F.softmax(scores, dim=-1)
-        attention_weights = self.dropout(attention_weights)
-        attention_output = torch.matmul(
-            attention_weights, V
-        )  # [batch_size, num_heads, num_fields, head_dim]
+
+        # Split into multiple heads: [batch_size, num_heads, seq_len, head_dim]
+        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if self.use_flash_attention:
+            # Use PyTorch 2.0+ Flash Attention
+            if attention_mask is not None:
+                # Convert mask to [batch_size, 1, seq_len, seq_len] format
+                if attention_mask.dim() == 2:
+                    # [B, L] -> [B, 1, 1, L]
+                    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+                elif attention_mask.dim() == 3:
+                    # [B, L, L] -> [B, 1, L, L]
+                    attention_mask = attention_mask.unsqueeze(1)
+            attention_output = F.scaled_dot_product_attention(
+                Q,
+                K,
+                V,
+                attn_mask=attention_mask,
+                dropout_p=self.dropout_rate if self.training else 0.0,
+            )
+            # Handle potential NaN values
+            attention_output = torch.nan_to_num(attention_output, nan=0.0)
+        else:
+            # Fallback to standard attention
+            scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim**0.5)
+
+            if attention_mask is not None:
+                # Process mask for standard attention
+                if attention_mask.dim() == 2:
+                    # [B, L] -> [B, 1, 1, L]
+                    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+                elif attention_mask.dim() == 3:
+                    # [B, L, L] -> [B, 1, L, L]
+                    attention_mask = attention_mask.unsqueeze(1)
+                scores = scores.masked_fill(~attention_mask, float("-1e9"))
+
+            attention_weights = F.softmax(scores, dim=-1)
+            attention_weights = self.dropout(attention_weights)
+            attention_output = torch.matmul(
+                attention_weights, V
+            )  # [batch_size, num_heads, seq_len, head_dim]
+
         # Concatenate heads
         attention_output = attention_output.transpose(1, 2).contiguous()
         attention_output = attention_output.view(
-            batch_size, num_fields, self.embedding_dim
+            batch_size, seq_len, self.embedding_dim
         )
+
+        # Output projection
+        output = self.W_O(attention_output)
+
         # Residual connection
         if self.use_residual:
-            output = attention_output + self.W_Res(x)
-        else:
-            output = attention_output
+            output = output + self.W_Res(x)
+
+        # Layer normalization
+        if self.layer_norm is not None:
+            output = self.layer_norm(output)
+
         output = F.relu(output)
         return output
 
@@ -653,3 +714,21 @@ class AttentionPoolingLayer(nn.Module):
         # Weighted sum over keys: (B, L, 1) * (B, L, D) -> (B, D)
         output = torch.sum(attention_weights * keys, dim=1)
         return output
+
+
+class RMSNorm(torch.nn.Module):
+    """
+    Root Mean Square Layer Normalization.
+    Reference: https://arxiv.org/abs/1910.07467
+    """
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # RMS(x) = sqrt(mean(x^2) + eps)
+        variance = torch.mean(x**2, dim=-1, keepdim=True)
+        x_normalized = x * torch.rsqrt(variance + self.eps)
+        return self.weight * x_normalized
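A quick smoke test of the reworked attention layer and the new RMSNorm, assuming both are imported from nextrec.basic.layers as defined above; the batch size, sequence length, and padding pattern are arbitrary.

import torch
from nextrec.basic.layers import MultiHeadSelfAttention, RMSNorm

batch_size, seq_len, dim = 4, 16, 64
x = torch.randn(batch_size, seq_len, dim)

# Boolean padding mask: True marks valid positions, last 4 steps are padding.
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
mask[:, -4:] = False

attention = MultiHeadSelfAttention(
    embedding_dim=dim, num_heads=4, dropout=0.1, use_layer_norm=True
)
output = attention(x, attention_mask=mask)  # [4, 16, 64]

norm = RMSNorm(hidden_size=dim)
print(output.shape, norm(output).shape)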