nextrec 0.4.22-py3-none-any.whl → 0.4.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/layers.py +96 -46
  3. nextrec/basic/metrics.py +128 -114
  4. nextrec/basic/model.py +94 -91
  5. nextrec/basic/summary.py +36 -2
  6. nextrec/data/dataloader.py +2 -0
  7. nextrec/data/preprocessor.py +137 -5
  8. nextrec/loss/listwise.py +19 -6
  9. nextrec/loss/pairwise.py +6 -4
  10. nextrec/loss/pointwise.py +8 -6
  11. nextrec/models/multi_task/aitm.py +0 -0
  12. nextrec/models/multi_task/apg.py +0 -0
  13. nextrec/models/multi_task/cross_stitch.py +0 -0
  14. nextrec/models/multi_task/esmm.py +5 -28
  15. nextrec/models/multi_task/mmoe.py +6 -28
  16. nextrec/models/multi_task/pepnet.py +335 -0
  17. nextrec/models/multi_task/ple.py +21 -40
  18. nextrec/models/multi_task/poso.py +17 -39
  19. nextrec/models/multi_task/share_bottom.py +5 -28
  20. nextrec/models/multi_task/snr_trans.py +0 -0
  21. nextrec/models/ranking/afm.py +3 -27
  22. nextrec/models/ranking/autoint.py +5 -38
  23. nextrec/models/ranking/dcn.py +1 -26
  24. nextrec/models/ranking/dcn_v2.py +6 -34
  25. nextrec/models/ranking/deepfm.py +2 -29
  26. nextrec/models/ranking/dien.py +2 -28
  27. nextrec/models/ranking/din.py +2 -27
  28. nextrec/models/ranking/eulernet.py +3 -30
  29. nextrec/models/ranking/ffm.py +0 -26
  30. nextrec/models/ranking/fibinet.py +8 -32
  31. nextrec/models/ranking/fm.py +0 -29
  32. nextrec/models/ranking/lr.py +0 -30
  33. nextrec/models/ranking/masknet.py +4 -30
  34. nextrec/models/ranking/pnn.py +4 -28
  35. nextrec/models/ranking/widedeep.py +0 -32
  36. nextrec/models/ranking/xdeepfm.py +0 -30
  37. nextrec/models/retrieval/dssm.py +4 -28
  38. nextrec/models/retrieval/dssm_v2.py +4 -28
  39. nextrec/models/retrieval/mind.py +2 -22
  40. nextrec/models/retrieval/sdm.py +4 -24
  41. nextrec/models/retrieval/youtube_dnn.py +4 -25
  42. nextrec/models/sequential/hstu.py +0 -18
  43. nextrec/utils/model.py +91 -4
  44. nextrec/utils/types.py +35 -0
  45. {nextrec-0.4.22.dist-info → nextrec-0.4.24.dist-info}/METADATA +8 -6
  46. nextrec-0.4.24.dist-info/RECORD +86 -0
  47. nextrec-0.4.22.dist-info/RECORD +0 -81
  48. {nextrec-0.4.22.dist-info → nextrec-0.4.24.dist-info}/WHEEL +0 -0
  49. {nextrec-0.4.22.dist-info → nextrec-0.4.24.dist-info}/entry_points.txt +0 -0
  50. {nextrec-0.4.22.dist-info → nextrec-0.4.24.dist-info}/licenses/LICENSE +0 -0
nextrec/basic/model.py CHANGED
@@ -2,13 +2,14 @@
 Base Model & Base Match Model Class
 
 Date: create on 27/10/2025
-Checkpoint: edit on 28/12/2025
+Checkpoint: edit on 30/12/2025
 Author: Yang Zhou,zyaztec@gmail.com
 """
 
 import getpass
 import logging
 import os
+import sys
 import pickle
 import socket
 from pathlib import Path
@@ -16,6 +17,16 @@ from typing import Any, Literal
 
 import numpy as np
 import pandas as pd
+
+try:
+    import swanlab  # type: ignore
+except ModuleNotFoundError:
+    swanlab = None
+try:
+    import wandb  # type: ignore
+except ModuleNotFoundError:
+    wandb = None
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
@@ -74,13 +85,19 @@ from nextrec.utils.torch_utils import (
     to_tensor,
 )
 from nextrec.utils.config import safe_value
-from nextrec.utils.model import compute_ranking_loss
+from nextrec.utils.model import (
+    compute_ranking_loss,
+    get_loss_list,
+    resolve_loss_weights,
+    get_training_modes,
+)
 from nextrec.utils.types import (
     LossName,
     OptimizerName,
     SchedulerName,
     TrainingModeName,
     TaskTypeName,
+    MetricsName,
 )
 
 
@@ -90,7 +107,7 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         raise NotImplementedError
 
     @property
-    def default_task(self) -> str | list[str]:
+    def default_task(self) -> TaskTypeName | list[TaskTypeName]:
        raise NotImplementedError
 
     def __init__(
@@ -139,6 +156,9 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             world_size: Number of processes (defaults to env WORLD_SIZE).
             local_rank: Local rank for selecting CUDA device (defaults to env LOCAL_RANK).
             ddp_find_unused_parameters: Default False, set it True only when exist unused parameters in ddp model, in most cases should be False.
+
+        Note:
+            Optimizer, scheduler, and loss are configured via compile().
         """
         super(BaseModel, self).__init__()
 
@@ -171,24 +191,12 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             dense_features, sparse_features, sequence_features, target, id_columns
         )
 
-        self.task = self.default_task if task is None else task
+        self.task = task or self.default_task
         self.nums_task = len(self.task) if isinstance(self.task, list) else 1
-        if isinstance(training_mode, list):
-            training_modes = list(training_mode)
-            if len(training_modes) != self.nums_task:
-                raise ValueError(
-                    "[BaseModel-init Error] training_mode list length must match number of tasks."
-                )
-        else:
-            training_modes = [training_mode] * self.nums_task
-        if any(
-            mode not in {"pointwise", "pairwise", "listwise"} for mode in training_modes
-        ):
-            raise ValueError(
-                "[BaseModel-init Error] training_mode must be one of {'pointwise', 'pairwise', 'listwise'}."
-            )
-        self.training_modes = training_modes
-        self.training_mode = training_modes if self.nums_task > 1 else training_modes[0]
+        self.training_modes = get_training_modes(training_mode, self.nums_task)
+        self.training_mode = (
+            self.training_modes if self.nums_task > 1 else self.training_modes[0]
+        )
 
         self.embedding_l1_reg = embedding_l1_reg
         self.dense_l1_reg = dense_l1_reg
@@ -196,8 +204,9 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         self.dense_l2_reg = dense_l2_reg
         self.regularization_weights = []
         self.embedding_params = []
-        self.loss_weight = None
+
         self.ignore_label = None
+        self.compiled = False
 
         self.max_gradient_norm = 1.0
         self.logger_initialized = False
@@ -431,28 +440,9 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             "pairwise": "bpr",
             "listwise": "listnet",
         }
-        effective_loss = loss
-        if effective_loss is None:
-            loss_list = [default_losses[mode] for mode in self.training_modes]
-        elif isinstance(effective_loss, list):
-            if not effective_loss:
-                loss_list = [default_losses[mode] for mode in self.training_modes]
-            else:
-                if len(effective_loss) != self.nums_task:
-                    raise ValueError(
-                        f"[BaseModel-compile Error] Number of loss functions ({len(effective_loss)}) must match number of tasks ({self.nums_task})."
-                    )
-                loss_list = list(effective_loss)
-        else:
-            loss_list = [effective_loss] * self.nums_task
-
-        for idx, mode in enumerate(self.training_modes):
-            if isinstance(loss_list[idx], str) and loss_list[idx] in {
-                "bce",
-                "binary_crossentropy",
-            }:
-                if mode in {"pairwise", "listwise"}:
-                    loss_list[idx] = default_losses[mode]
+        loss_list = get_loss_list(
+            loss, self.training_modes, self.nums_task, default_losses
+        )
         self.loss_params = loss_params or {}
         optimizer_params = optimizer_params or {}
         self.optimizer_name = (
@@ -516,30 +506,9 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                 nums_task=self.nums_task, device=self.device, **grad_norm_params
             )
            self.loss_weights = None
-        elif loss_weights is None:
-            self.loss_weights = None
-        elif self.nums_task == 1:
-            if isinstance(loss_weights, (list, tuple)):
-                if len(loss_weights) != 1:
-                    raise ValueError(
-                        "[BaseModel-compile Error] loss_weights list must have exactly one element for single-task setup."
-                    )
-                loss_weights = loss_weights[0]
-            self.loss_weights = [float(loss_weights)]  # type: ignore
         else:
-            if isinstance(loss_weights, (int, float)):
-                weights = [float(loss_weights)] * self.nums_task
-            elif isinstance(loss_weights, (list, tuple)):
-                weights = [float(w) for w in loss_weights]
-                if len(weights) != self.nums_task:
-                    raise ValueError(
-                        f"[BaseModel-compile Error] Number of loss_weights ({len(weights)}) must match number of tasks ({self.nums_task})."
-                    )
-            else:
-                raise TypeError(
-                    f"[BaseModel-compile Error] loss_weights must be int, float, list or tuple, got {type(loss_weights)}"
-                )
-            self.loss_weights = weights
+            self.loss_weights = resolve_loss_weights(loss_weights, self.nums_task)
+        self.compiled = True
 
     def compute_loss(self, y_pred, y_true):
         if y_true is None:
@@ -602,9 +571,6 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         for i, (start, end) in enumerate(slices):  # type: ignore
             y_pred_i = y_pred[:, start:end]
             y_true_i = y_true[:, start:end]
-            total_count = y_true_i.shape[0]
-            # valid_count = None
-
             # mask ignored labels
             if self.ignore_label is not None:
                 valid_mask = y_true_i != self.ignore_label
@@ -613,11 +579,8 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                 if not torch.any(valid_mask):
                     task_losses.append(y_pred_i.sum() * 0.0)
                     continue
-                # valid_count = valid_mask.sum().to(dtype=y_true_i.dtype)
                 y_pred_i = y_pred_i[valid_mask]
                 y_true_i = y_true_i[valid_mask]
-            # else:
-            #     valid_count = y_true_i.new_tensor(float(total_count))
 
             mode = self.training_modes[i]
 
@@ -683,6 +646,8 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             sampler=sampler,
             collate_fn=collate_fn,
             num_workers=num_workers,
+            pin_memory=self.device.type == "cuda",
+            persistent_workers=num_workers > 0,
         )
         return (loader, dataset) if return_dataset else loader
 
@@ -691,7 +656,7 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         train_data=None,
         valid_data=None,
         metrics: (
-            list[str] | dict[str, list[str]] | None
+            list[MetricsName] | dict[str, list[MetricsName]] | None
        ) = None,  # ['auc', 'logloss'] or {'target1': ['auc', 'logloss'], 'target2': ['mse']}
         epochs: int = 1,
         shuffle: bool = True,
@@ -705,6 +670,8 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         use_tensorboard: bool = True,
         use_wandb: bool = False,
         use_swanlab: bool = False,
+        wandb_api: str | None = None,
+        swanlab_api: str | None = None,
         wandb_kwargs: dict | None = None,
         swanlab_kwargs: dict | None = None,
         auto_ddp_sampler: bool = True,
@@ -734,6 +701,8 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             use_tensorboard: Enable tensorboard logging.
             use_wandb: Enable Weights & Biases logging.
             use_swanlab: Enable SwanLab logging.
+            wandb_api: W&B API key for non-tty login.
+            swanlab_api: SwanLab API key for non-tty login.
             wandb_kwargs: Optional kwargs for wandb.init(...).
             swanlab_kwargs: Optional kwargs for swanlab.init(...).
             auto_ddp_sampler: Attach DistributedSampler automatically when distributed, set False to when data is already sharded per rank.
@@ -751,6 +720,16 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         )
         self.to(self.device)
 
+        if not self.compiled:
+            self.compile(
+                optimizer="adam",
+                optimizer_params={},
+                scheduler=None,
+                scheduler_params={},
+                loss=None,
+                loss_params={},
+            )
+
         if (
             self.distributed
             and dist.is_available()
@@ -825,6 +804,24 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
         }
         training_config: dict = safe_value(training_config)  # type: ignore
 
+        if self.is_main_process:
+            is_tty = sys.stdin.isatty() and sys.stdout.isatty()
+            if not is_tty:
+                if use_wandb and wandb_api:
+                    if wandb is None:
+                        logging.warning(
+                            "[BaseModel-fit] wandb not installed, skip wandb login."
+                        )
+                    else:
+                        wandb.login(key=wandb_api)
+                if use_swanlab and swanlab_api:
+                    if swanlab is None:
+                        logging.warning(
+                            "[BaseModel-fit] swanlab not installed, skip swanlab login."
+                        )
+                    else:
+                        swanlab.login(api_key=swanlab_api)
+
         self.training_logger = (
             TrainingLogger(
                 session=self.session,
@@ -1124,16 +1121,17 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                     train_log_payload, step=epoch + 1, split="train"
                 )
             if valid_loader is not None:
-                self.callbacks.on_validation_begin()
-                val_metrics = self.evaluate(
-                    valid_loader,
-                    user_ids=valid_user_ids if self.needs_user_ids else None,
-                    num_workers=num_workers,
-                )
-                should_log_valid = (epoch + 1) % log_interval == 0 or (
+                should_eval_valid = (epoch + 1) % log_interval == 0 or (
                     epoch + 1
                 ) == epochs
-                if should_log_valid:
+                val_metrics = None
+                if should_eval_valid:
+                    self.callbacks.on_validation_begin()
+                    val_metrics = self.evaluate(
+                        valid_loader,
+                        user_ids=valid_user_ids if self.needs_user_ids else None,
+                        num_workers=num_workers,
+                    )
                     display_metrics_table(
                         epoch=epoch + 1,
                         epochs=epochs,
@@ -1147,23 +1145,24 @@
                         is_main_process=self.is_main_process,
                         colorize=lambda s: colorize(" " + s, color="cyan"),
                     )
-                self.callbacks.on_validation_end()
-                if should_log_valid and val_metrics and self.training_logger:
-                    self.training_logger.log_metrics(
-                        val_metrics, step=epoch + 1, split="valid"
-                    )
+                    self.callbacks.on_validation_end()
+                    if val_metrics and self.training_logger:
+                        self.training_logger.log_metrics(
+                            val_metrics, step=epoch + 1, split="valid"
+                        )
                 if not val_metrics:
-                    if self.is_main_process:
+                    if should_eval_valid and self.is_main_process:
                         logging.info(
                            colorize(
                                 "Warning: No validation metrics computed. Skipping validation for this epoch.",
                                 color="yellow",
                             )
                         )
-                    continue
-                epoch_logs = {**train_log_payload}
-                for k, v in val_metrics.items():
-                    epoch_logs[f"val_{k}"] = v
+                    epoch_logs = {**train_log_payload}
+                else:
+                    epoch_logs = {**train_log_payload}
+                    for k, v in val_metrics.items():
+                        epoch_logs[f"val_{k}"] = v
             else:
                 epoch_logs = {**train_log_payload}
             if self.is_main_process:
@@ -1345,6 +1344,7 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
                 target_names=self.target_columns,
                 task_specific_metrics=self.task_specific_metrics,
                 user_ids=combined_user_ids,
+                ignore_label=self.ignore_label,
             )
             return avg_loss, metrics_dict
         return avg_loss
@@ -1392,6 +1392,8 @@
             sampler=valid_sampler,
             collate_fn=collate_fn,
             num_workers=num_workers,
+            pin_memory=self.device.type == "cuda",
+            persistent_workers=num_workers > 0,
         )
         valid_user_ids = None
         if needs_user_ids:
@@ -1537,6 +1539,7 @@ class BaseModel(SummarySet, FeatureSet, nn.Module):
             target_names=self.target_columns,
             task_specific_metrics=self.task_specific_metrics,
             user_ids=final_user_ids,
+            ignore_label=self.ignore_label,
         )
         return metrics_dict
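
The changes above mean fit() no longer requires an explicit compile() call: when self.compiled is still False it compiles itself with optimizer="adam" and the per-mode default losses, and the new wandb_api / swanlab_api arguments let non-interactive (non-tty) runs log in to W&B or SwanLab programmatically. A minimal sketch of that flow, assuming `model` is an already-constructed BaseModel subclass and train_df / valid_df are prepared datasets; the learning-rate value is illustrative, not taken from this diff:

# Option 1: explicit compile(), as before.
model.compile(
    optimizer="adam",
    optimizer_params={"lr": 1e-3},  # assumed optimizer kwargs
    loss=None,  # None falls back to per-mode defaults, e.g. "bpr" for pairwise, "listnet" for listwise
)

# Option 2 (new in 0.4.24): call fit() directly; it auto-compiles with the same defaults.
model.fit(
    train_data=train_df,
    valid_data=valid_df,
    metrics=["auc", "logloss"],
    epochs=5,
    use_wandb=True,
    wandb_api="<wandb-api-key>",  # only used for wandb.login() when stdin/stdout are not a tty
)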
nextrec/basic/summary.py CHANGED
@@ -1,5 +1,9 @@
 """
 Summary utilities for BaseModel.
+
+Date: create on 03/12/2025
+Checkpoint: edit on 29/12/2025
+Author: Yang Zhou,zyaztec@gmail.com
 """
 
 from __future__ import annotations
@@ -12,9 +16,39 @@ from torch.utils.data import DataLoader
 
 from nextrec.basic.loggers import colorize, format_kv
 from nextrec.data.data_processing import extract_label_arrays, get_data_length
+from nextrec.utils.types import TaskTypeName
 
 
 class SummarySet:
+    model_name: str
+    dense_features: list[Any]
+    sparse_features: list[Any]
+    sequence_features: list[Any]
+    task: TaskTypeName | list[TaskTypeName]
+    target_columns: list[str]
+    nums_task: int
+    metrics: Any
+    device: Any
+    optimizer_name: str
+    optimizer_params: dict[str, Any]
+    scheduler_name: str | None
+    scheduler_params: dict[str, Any]
+    loss_config: Any
+    loss_weights: Any
+    grad_norm: Any
+    embedding_l1_reg: float
+    embedding_l2_reg: float
+    dense_l1_reg: float
+    dense_l2_reg: float
+    early_stop_patience: int
+    max_gradient_norm: float | None
+    metrics_sample_limit: int | None
+    session_id: str | None
+    features_config_path: str
+    checkpoint_path: str
+    train_data_summary: dict[str, Any] | None
+    valid_data_summary: dict[str, Any] | None
+
     def build_data_summary(
         self, data: Any, data_loader: DataLoader | None, sample_key: str
     ):
@@ -305,7 +339,7 @@ class SummarySet:
                 lines = details.get("lines", [])
                 logger.info(f"{target_name}:")
                 for label, value in lines:
-                    logger.info(format_kv(label, value))
+                    logger.info(f" {format_kv(label, value)}")
 
         if self.valid_data_summary:
             if self.train_data_summary:
@@ -320,4 +354,4 @@
                 lines = details.get("lines", [])
                 logger.info(f"{target_name}:")
                 for label, value in lines:
-                    logger.info(format_kv(label, value))
+                    logger.info(f" {format_kv(label, value)}")
nextrec/data/dataloader.py CHANGED
@@ -282,6 +282,8 @@ class RecDataLoader(FeatureSet):
             sampler=sampler,
             collate_fn=collate_fn,
             num_workers=num_workers,
+            pin_memory=torch.cuda.is_available(),
+            persistent_workers=num_workers > 0,
         )
 
     def create_from_path(
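
The two keyword arguments added here are standard torch.utils.data.DataLoader options: pin_memory allocates page-locked host memory so host-to-GPU copies are faster, and persistent_workers keeps worker processes alive between epochs (PyTorch rejects the flag when num_workers is 0, hence the num_workers > 0 guard). A self-contained sketch with a placeholder dataset, unrelated to nextrec:

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(1024, 8), torch.randint(0, 2, (1024,)))  # placeholder data
num_workers = 2
loader = DataLoader(
    dataset,
    batch_size=256,
    num_workers=num_workers,
    pin_memory=torch.cuda.is_available(),  # same condition the diff uses
    persistent_workers=num_workers > 0,    # must stay False when num_workers == 0
)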
nextrec/data/preprocessor.py CHANGED
@@ -2,7 +2,7 @@
 DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
 Date: create on 13/11/2025
-Checkpoint: edit on 24/12/2025
+Checkpoint: edit on 29/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
@@ -79,6 +79,14 @@ class DataProcessor(FeatureSet):
         ] = "standard",
         fill_na: Optional[float] = None,
     ):
+        """Add a numeric feature configuration.
+
+        Args:
+            name (str): Feature name.
+            scaler (Optional[Literal["standard", "minmax", "robust", "maxabs", "log", "none"]], optional): Scaler type. Defaults to "standard".
+            fill_na (Optional[float], optional): Fill value for missing entries. Defaults to None.
+        """
+
         self.numeric_features[name] = {"scaler": scaler, "fill_na": fill_na}
 
     def add_sparse_feature(
@@ -88,6 +96,14 @@ class DataProcessor(FeatureSet):
         hash_size: Optional[int] = None,
         fill_na: str = "<UNK>",
     ):
+        """Add a sparse feature configuration.
+
+        Args:
+            name (str): Feature name.
+            encode_method (Literal["hash", "label"], optional): Encoding method, including "hash encoding" and "label encoding". Defaults to "label".
+            hash_size (Optional[int], optional): Hash size for hash encoding. Required if encode_method is "hash".
+            fill_na (str, optional): Fill value for missing entries. Defaults to "<UNK>".
+        """
         if encode_method == "hash" and hash_size is None:
             raise ValueError(
                 "[Data Processor Error] hash_size must be specified when encode_method='hash'"
@@ -101,7 +117,7 @@
     def add_sequence_feature(
         self,
         name: str,
-        encode_method: Literal["hash", "label"] = "label",
+        encode_method: Literal["hash", "label"] = "hash",
         hash_size: Optional[int] = None,
         max_len: Optional[int] = 50,
         pad_value: int = 0,
@@ -110,6 +126,17 @@
         ] = "pre",  # pre: keep last max_len items, post: keep first max_len items
         separator: str = ",",
     ):
+        """Add a sequence feature configuration.
+
+        Args:
+            name (str): Feature name.
+            encode_method (Literal["hash", "label"], optional): Encoding method, including "hash encoding" and "label encoding". Defaults to "hash".
+            hash_size (Optional[int], optional): Hash size for hash encoding. Required if encode_method is "hash".
+            max_len (Optional[int], optional): Maximum sequence length. Defaults to 50.
+            pad_value (int, optional): Padding value for sequences shorter than max_len. Defaults to 0.
+            truncate (Literal["pre", "post"], optional): Truncation strategy for sequences longer than max_len, including "pre" (keep last max_len items) and "post" (keep first max_len items). Defaults to "pre".
+            separator (str, optional): Separator for string sequences. Defaults to ",".
+        """
         if encode_method == "hash" and hash_size is None:
             raise ValueError(
                 "[Data Processor Error] hash_size must be specified when encode_method='hash'"
@@ -131,6 +158,14 @@
             Dict[str, int]
         ] = None,  # example: {'click': 1, 'no_click': 0}
     ):
+        """Add a target configuration.
+
+        Args:
+            name (str): Target name.
+            target_type (Literal["binary", "regression"], optional): Target type. Defaults to "binary".
+            label_map (Optional[Dict[str, int]], optional): Label mapping for binary targets. Defaults to None.
+        """
+
         self.target_features[name] = {
             "target_type": target_type,
             "label_map": label_map,
@@ -392,7 +427,15 @@
         )
 
     def load_dataframe_from_path(self, path: str) -> pd.DataFrame:
-        """Load all data from a file or directory path into a single DataFrame."""
+        """
+        Load all data from a file or directory path into a single DataFrame.
+
+        Args:
+            path (str): File or directory path.
+
+        Returns:
+            pd.DataFrame: Loaded DataFrame.
+        """
         file_paths, file_type = resolve_file_paths(path)
         frames = load_dataframes(file_paths, file_type)
         return pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
@@ -411,7 +454,16 @@
         return [str(value)]
 
     def fit_from_path(self, path: str, chunk_size: int) -> "DataProcessor":
-        """Fit processor statistics by streaming files to reduce memory usage."""
+        """
+        Fit processor statistics by streaming files to reduce memory usage.
+
+        Args:
+            path (str): File or directory path.
+            chunk_size (int): Number of rows per chunk.
+
+        Returns:
+            DataProcessor: Fitted DataProcessor instance.
+        """
         logger = logging.getLogger()
         logger.info(
             colorize(
@@ -428,7 +480,7 @@
                 "Use fit(dataframe) with in-memory data or convert the data format."
             )
 
-        numeric_acc: Dict[str, Dict[str, float]] = {}
+        numeric_acc = {}
         for name in self.numeric_features.keys():
             numeric_acc[name] = {
                 "sum": 0.0,
@@ -609,6 +661,21 @@
         output_path: Optional[str],
         warn_missing: bool = True,
     ):
+        """
+        Transform in-memory data and optionally persist the transformed data.
+
+        Args:
+            data (Union[pd.DataFrame, Dict[str, Any]]): Input data.
+            return_dict (bool): Whether to return a dictionary of numpy arrays.
+            persist (bool): Whether to persist the transformed data to disk.
+            save_format (Optional[str]): Format to save the data if persisting.
+            output_path (Optional[str]): Output path to save the data if persisting.
+            warn_missing (bool): Whether to warn about missing features in the data.
+
+        Returns:
+            Union[pd.DataFrame, Dict[str, np.ndarray]]: Transformed data.
+        """
+
         logger = logging.getLogger()
         data_dict = data if isinstance(data, dict) else None
 
@@ -719,6 +786,12 @@
         """Transform data from files under a path and save them to a new location.
 
         Uses chunked reading/writing to keep peak memory bounded for large files.
+
+        Args:
+            input_path (str): Input file or directory path.
+            output_path (Optional[str]): Output directory path. If None, defaults to input_path/transformed_data.
+            save_format (Optional[str]): Format to save transformed files. If None, uses input file format.
+            chunk_size (int): Number of rows per chunk.
         """
         logger = logging.getLogger()
         file_paths, file_type = resolve_file_paths(input_path)
@@ -876,6 +949,17 @@
         data: Union[pd.DataFrame, Dict[str, Any], str, os.PathLike],
         chunk_size: int = 200000,
     ):
+        """
+        Fit the DataProcessor to the provided data.
+
+        Args:
+            data (Union[pd.DataFrame, Dict[str, Any], str, os.PathLike]): Input data for fitting.
+            chunk_size (int): Number of rows per chunk when streaming from path.
+
+        Returns:
+            DataProcessor: Fitted DataProcessor instance.
+        """
+
         logger = logging.getLogger()
         if isinstance(data, (str, os.PathLike)):
             path_str = str(data)
@@ -915,6 +999,19 @@
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):
+        """
+        Transform the provided data using the fitted DataProcessor.
+
+        Args:
+            data (Union[pd.DataFrame, Dict[str, Any], str, os.PathLike]): Input data to transform.
+            return_dict (bool): Whether to return a dictionary of numpy arrays.
+            save_format (Optional[str]): Format to save the data if output_path is provided.
+            output_path (Optional[str]): Output path to save the transformed data.
+            chunk_size (int): Number of rows per chunk when streaming from path.
+        Returns:
+            Union[pd.DataFrame, Dict[str, np.ndarray], List[str]]: Transformed data or list of saved file paths.
+        """
+
         if not self.is_fitted:
             raise ValueError(
                 "[Data Processor Error] DataProcessor must be fitted before transform"
@@ -943,6 +1040,19 @@
         output_path: Optional[str] = None,
         chunk_size: int = 200000,
     ):
+        """
+        Fit the DataProcessor to the provided data and then transform it.
+
+        Args:
+            data (Union[pd.DataFrame, Dict[str, Any], str, os.PathLike]): Input data for fitting and transforming.
+            return_dict (bool): Whether to return a dictionary of numpy arrays.
+            save_format (Optional[str]): Format to save the data if output_path is provided.
+            output_path (Optional[str]): Output path to save the transformed data.
+            chunk_size (int): Number of rows per chunk when streaming from path.
+        Returns:
+            Union[pd.DataFrame, Dict[str, np.ndarray], List[str]]: Transformed data or list of saved file paths.
+        """
+
         self.fit(data, chunk_size=chunk_size)
         return self.transform(
             data,
@@ -952,6 +1062,12 @@
         )
 
     def save(self, save_path: str | Path):
+        """
+        Save the fitted DataProcessor to a file.
+
+        Args:
+            save_path (str | Path): Path to save the DataProcessor.
+        """
         logger = logging.getLogger()
         assert isinstance(save_path, (str, Path)), "save_path must be a string or Path"
         save_path = Path(save_path)
@@ -983,6 +1099,16 @@
 
     @classmethod
     def load(cls, load_path: str | Path) -> "DataProcessor":
+        """
+        Load a fitted DataProcessor from a file.
+
+        Args:
+            load_path (str | Path): Path to load the DataProcessor from.
+
+        Returns:
+            DataProcessor: Loaded DataProcessor instance.
+        """
+
         logger = logging.getLogger()
         load_path = Path(load_path)
         with open(load_path, "rb") as f:
@@ -1003,6 +1129,12 @@
         return processor
 
     def get_vocab_sizes(self) -> Dict[str, int]:
+        """
+        Get vocabulary sizes for all sparse and sequence features.
+
+        Returns:
+            Dict[str, int]: Mapping of feature names to vocabulary sizes.
+        """
         vocab_sizes = {}
         for name, config in self.sparse_features.items():
             vocab_sizes[name] = config.get("vocab_size", 0)
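
Taken together, the new docstrings describe the DataProcessor workflow: declare features and targets, fit, transform, then save or reload the fitted processor. A hedged usage sketch built only from the method names and parameters shown in this diff; the import path, the no-argument constructor, and the sample column names are assumptions:

import pandas as pd
from nextrec.data.preprocessor import DataProcessor  # assumed import path

df = pd.DataFrame(
    {
        "price": [3.5, 12.0, 7.25],
        "user_id": ["u1", "u2", "u1"],
        "hist_items": ["1,5,9", "2", "7,7,3"],
        "click": [1, 0, 1],
    }
)

processor = DataProcessor()
processor.add_numeric_feature("price", scaler="standard", fill_na=0.0)
processor.add_sparse_feature("user_id", encode_method="label")
# Sequence features now default to encode_method="hash", so hash_size must be given.
processor.add_sequence_feature("hist_items", hash_size=10000, max_len=50, separator=",")
processor.add_target("click", target_type="binary")

transformed = processor.fit_transform(df, return_dict=True)  # dict of numpy arrays
processor.save("processor.pkl")
restored = DataProcessor.load("processor.pkl")
print(restored.get_vocab_sizes())  # feature name -> vocabulary size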