nextrec 0.3.6-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +244 -113
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1373 -443
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +42 -24
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +303 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +106 -40
  23. nextrec/models/match/dssm.py +82 -69
  24. nextrec/models/match/dssm_v2.py +72 -58
  25. nextrec/models/match/mind.py +175 -108
  26. nextrec/models/match/sdm.py +104 -88
  27. nextrec/models/match/youtube_dnn.py +73 -60
  28. nextrec/models/multi_task/esmm.py +53 -39
  29. nextrec/models/multi_task/mmoe.py +70 -47
  30. nextrec/models/multi_task/ple.py +107 -50
  31. nextrec/models/multi_task/poso.py +121 -41
  32. nextrec/models/multi_task/share_bottom.py +54 -38
  33. nextrec/models/ranking/afm.py +172 -45
  34. nextrec/models/ranking/autoint.py +84 -61
  35. nextrec/models/ranking/dcn.py +59 -42
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +36 -26
  38. nextrec/models/ranking/dien.py +158 -102
  39. nextrec/models/ranking/din.py +88 -60
  40. nextrec/models/ranking/fibinet.py +55 -35
  41. nextrec/models/ranking/fm.py +32 -26
  42. nextrec/models/ranking/masknet.py +95 -34
  43. nextrec/models/ranking/pnn.py +34 -31
  44. nextrec/models/ranking/widedeep.py +37 -29
  45. nextrec/models/ranking/xdeepfm.py +63 -41
  46. nextrec/utils/__init__.py +61 -32
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +52 -12
  49. nextrec/utils/distributed.py +141 -0
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +531 -0
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.3.6.dist-info/RECORD +0 -64
  61. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/utils/distributed.py ADDED
@@ -0,0 +1,141 @@
+ """
+ Distributed utilities for NextRec.
+
+ Date: create on 04/12/2025
+ Checkpoint: edit on 05/12/2025
+ Author: Yang Zhou,zyaztec@gmail.com
+ """
+
+ import logging
+ import numpy as np
+ import torch
+ import torch.distributed as dist
+
+ from torch.utils.data import DataLoader, IterableDataset
+ from torch.utils.data.distributed import DistributedSampler
+ from nextrec.basic.loggers import colorize
+
+
+ def init_process_group(
+     distributed: bool, rank: int, world_size: int, device_id: int | None = None
+ ) -> None:
+     """
+     initialize distributed process group for multi-GPU training.
+
+     Args:
+         distributed: whether to enable distributed training
+         rank: global rank of the current process
+         world_size: total number of processes
+     """
+     if (not distributed) or (not dist.is_available()) or dist.is_initialized():
+         return
+     backend = "nccl" if device_id is not None else "gloo"
+     if backend == "nccl":
+         torch.cuda.set_device(device_id)
+     dist.init_process_group(
+         backend=backend, init_method="env://", rank=rank, world_size=world_size
+     )
+
+
+ def gather_numpy(self, array: np.ndarray | None) -> np.ndarray | None:
+     """
+     Gather numpy arrays (or None) across ranks. Uses all_gather_object to avoid
+     shape mismatches and ensures every rank participates even when local data is empty.
+     """
+     if not (self.distributed and dist.is_available() and dist.is_initialized()):
+         return array
+
+     world_size = dist.get_world_size()
+     gathered: list[np.ndarray | None] = [None for _ in range(world_size)]
+     dist.all_gather_object(gathered, array)
+     pieces: list[np.ndarray] = []
+     for item in gathered:
+         if item is None:
+             continue
+         item_np = np.asarray(item)
+         if item_np.size > 0:
+             pieces.append(item_np)
+     if not pieces:
+         return None
+     return np.concatenate(pieces, axis=0)
+
+
+ def add_distributed_sampler(
+     loader: DataLoader,
+     distributed: bool,
+     world_size: int,
+     rank: int,
+     shuffle: bool,
+     drop_last: bool,
+     default_batch_size: int,
+     is_main_process: bool = False,
+ ) -> tuple[DataLoader, DistributedSampler | None]:
+     """
+     add distributedsampler to a dataloader, this for distributed training
+     when each device has its own dataloader
+     """
+     # early return if not distributed
+     if not (distributed and dist.is_available() and dist.is_initialized()):
+         return loader, None
+     # return if already has DistributedSampler
+     if isinstance(loader.sampler, DistributedSampler):
+         return loader, loader.sampler
+     dataset = getattr(loader, "dataset", None)
+     if dataset is None:
+         return loader, None
+     if isinstance(dataset, IterableDataset):
+         if is_main_process:
+             logging.info(
+                 colorize(
+                     "[Distributed Info] Iterable/streaming DataLoader provided; DistributedSampler is skipped. Ensure dataset handles sharding per rank.",
+                     color="yellow",
+                 )
+             )
+         return loader, None
+     sampler = DistributedSampler(
+         dataset,
+         num_replicas=world_size,
+         rank=rank,
+         shuffle=shuffle,
+         drop_last=drop_last,
+     )
+     loader_kwargs = {
+         "batch_size": (
+             loader.batch_size if loader.batch_size is not None else default_batch_size
+         ),
+         "shuffle": False,
+         "sampler": sampler,
+         "num_workers": loader.num_workers,
+         "collate_fn": loader.collate_fn,
+         "drop_last": drop_last,
+     }
+     if getattr(loader, "pin_memory", False):
+         loader_kwargs["pin_memory"] = True
+         pin_memory_device = getattr(loader, "pin_memory_device", None)
+         if pin_memory_device:
+             loader_kwargs["pin_memory_device"] = pin_memory_device
+     timeout = getattr(loader, "timeout", None)
+     if timeout:
+         loader_kwargs["timeout"] = timeout
+     worker_init_fn = getattr(loader, "worker_init_fn", None)
+     if worker_init_fn is not None:
+         loader_kwargs["worker_init_fn"] = worker_init_fn
+     generator = getattr(loader, "generator", None)
+     if generator is not None:
+         loader_kwargs["generator"] = generator
+     if loader.num_workers > 0:
+         loader_kwargs["persistent_workers"] = getattr(
+             loader, "persistent_workers", False
+         )
+         prefetch_factor = getattr(loader, "prefetch_factor", None)
+         if prefetch_factor is not None:
+             loader_kwargs["prefetch_factor"] = prefetch_factor
+     distributed_loader = DataLoader(dataset, **loader_kwargs)
+     if is_main_process:
+         logging.info(
+             colorize(
+                 "[Distributed Info] Attached DistributedSampler to provided DataLoader",
+                 color="cyan",
+             )
+         )
+     return distributed_loader, sampler
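
A minimal usage sketch for the new module, assuming a torchrun-style launch; the environment-variable plumbing and the toy TensorDataset below are illustrative assumptions, not part of the package:

import os
import torch
from torch.utils.data import DataLoader, TensorDataset

from nextrec.utils.distributed import add_distributed_sampler, init_process_group

rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))
use_cuda = torch.cuda.is_available()

# NCCL is selected when a device id is passed, otherwise gloo.
init_process_group(
    distributed=world_size > 1,
    rank=rank,
    world_size=world_size,
    device_id=local_rank if use_cuda else None,
)

dataset = TensorDataset(torch.randn(1024, 8), torch.randint(0, 2, (1024,)))
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Re-wraps the loader with a DistributedSampler; shuffling moves to the sampler.
loader, sampler = add_distributed_sampler(
    loader,
    distributed=world_size > 1,
    world_size=world_size,
    rank=rank,
    shuffle=True,
    drop_last=False,
    default_batch_size=64,
    is_main_process=rank == 0,
)

for epoch in range(2):
    if sampler is not None:
        sampler.set_epoch(epoch)  # reshuffle consistently across ranks each epoch
    for features, labels in loader:
        pass  # training step goes here

In a single-process run the helpers are no-ops: init_process_group returns early and add_distributed_sampler hands back the original loader with sampler set to None.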
nextrec/utils/embedding.py CHANGED
@@ -2,6 +2,7 @@
  Embedding utilities for NextRec
 
  Date: create on 13/11/2025
+ Checkpoint: edit on 06/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
nextrec/utils/feature.py CHANGED
@@ -5,6 +5,7 @@ Date: create on 03/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+
  def normalize_to_list(value: str | list[str] | None) -> list[str]:
      if value is None:
          return []
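
For reference, the helper's contract in a tiny sketch; only the None case is visible in this hunk, so the string and list branches below are inferred from the signature and labeled as assumptions:

from nextrec.utils.feature import normalize_to_list

assert normalize_to_list(None) == []
assert normalize_to_list("user_id") == ["user_id"]  # assumed: single string wrapped in a list
assert normalize_to_list(["user_id", "item_id"]) == ["user_id", "item_id"]  # assumed: lists pass through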
nextrec/utils/file.py CHANGED
@@ -2,11 +2,14 @@
  File I/O utilities for NextRec
 
  Date: create on 03/12/2025
+ Checkpoint: edit on 06/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+ import yaml
  import pandas as pd
  import pyarrow.parquet as pq
+
  from pathlib import Path
  from typing import Generator
 
@@ -14,7 +17,7 @@ from typing import Generator
  def resolve_file_paths(path: str) -> tuple[list[str], str]:
      """
      Resolve file or directory path into a sorted list of files and file type.
-
+
      Args: path: Path to a file or directory
      Returns: tuple: (list of file paths, file type)
      """
@@ -22,16 +25,23 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
 
      if path_obj.is_file():
          file_type = path_obj.suffix.lower().lstrip(".")
-         assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+         assert file_type in [
+             "csv",
+             "parquet",
+         ], f"Unsupported file extension: {file_type}"
          return [str(path_obj)], file_type
 
      if path_obj.is_dir():
          collected_files = [p for p in path_obj.iterdir() if p.is_file()]
          csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
-         parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+         parquet_files = [
+             str(p) for p in collected_files if p.suffix.lower() == ".parquet"
+         ]
 
          if csv_files and parquet_files:
-             raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+             raise ValueError(
+                 "Directory contains both CSV and Parquet files. Please keep a single format."
+             )
          file_paths = csv_files if csv_files else parquet_files
          if not file_paths:
              raise ValueError(f"No CSV or Parquet files found in directory: {path}")
@@ -42,18 +52,24 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
      raise ValueError(f"Invalid path: {path}")
 
 
- def read_table(file_path: str, file_type: str) -> pd.DataFrame:
-     if file_type == "csv":
-         return pd.read_csv(file_path)
-     return pd.read_parquet(file_path)
+ def read_table(path: str | Path, data_format: str | None = None) -> pd.DataFrame:
+     data_path = Path(path)
+     fmt = data_format.lower() if data_format else data_path.suffix.lower().lstrip(".")
+     if data_path.is_dir() and not fmt:
+         fmt = "parquet"
+     if fmt in {"parquet", ""}:
+         return pd.read_parquet(data_path)
+     if fmt in {"csv", "txt"}:
+         return pd.read_csv(data_path)
+     raise ValueError(f"Unsupported data format: {data_path}")
+
 
  def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
      return [read_table(fp, file_type) for fp in file_paths]
 
+
  def iter_file_chunks(
-     file_path: str,
-     file_type: str,
-     chunk_size: int
+     file_path: str, file_type: str, chunk_size: int
  ) -> Generator[pd.DataFrame, None, None]:
      if file_type == "csv":
          yield from pd.read_csv(file_path, chunksize=chunk_size)
@@ -68,3 +84,8 @@ def default_output_dir(path: str) -> Path:
      if path_obj.is_file():
          return path_obj.parent / f"{path_obj.stem}_preprocessed"
      return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
+ def read_yaml(path: str | Path):
+     with open(path, "r", encoding="utf-8") as file:
+         return yaml.safe_load(file) or {}
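
A short sketch of how the reworked I/O helpers fit together; the file and directory names are placeholders, not paths shipped with the package:

from nextrec.utils.file import read_table, read_yaml, resolve_file_paths

# Format is inferred from the suffix when data_format is omitted.
df = read_table("interactions.parquet")
df_csv = read_table("interactions.csv", data_format="csv")

# Directories must hold a single format; mixing CSV and Parquet raises ValueError.
paths, file_type = resolve_file_paths("data/train")
frames = [read_table(p, file_type) for p in paths]

config = read_yaml("config.yaml")  # returns {} for an empty file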
nextrec/utils/initializer.py CHANGED
@@ -5,32 +5,77 @@ Date: create on 13/11/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+ from typing import Any, Dict, Set, cast
+
  import torch.nn as nn
+ from torch.nn.init import _NonlinearityType
 
+ KNOWN_NONLINEARITIES: Set[str] = {
+     "linear",
+     "conv1d",
+     "conv2d",
+     "conv3d",
+     "conv_transpose1d",
+     "conv_transpose2d",
+     "conv_transpose3d",
+     "sigmoid",
+     "tanh",
+     "relu",
+     "leaky_relu",
+     "selu",
+     "gelu",
+ }
+
+
+ def resolve_nonlinearity(activation: str | _NonlinearityType) -> _NonlinearityType:
+     if isinstance(activation, str):
+         if activation in KNOWN_NONLINEARITIES:
+             return cast(_NonlinearityType, activation)
+         # Fall back to linear for custom activations (gain handled separately).
+         return "linear"
+     return activation
 
- def get_initializer(init_type='normal', activation='linear', param=None):
-     param = param or {}
 
+ def resolve_gain(activation: str | _NonlinearityType, param: Dict[str, Any]) -> float:
+     if "gain" in param:
+         return param["gain"]
+     nonlinearity = resolve_nonlinearity(activation)
      try:
-         gain = param.get('gain', nn.init.calculate_gain(activation, param.get('param', None)))
+         return nn.init.calculate_gain(nonlinearity, param.get("param"))
      except ValueError:
-         gain = 1.0  # for custom activations like 'dice'
-
+         return 1.0  # custom activation with no gain estimate available
+
+
+ def get_initializer(
+     init_type: str = "normal",
+     activation: str | _NonlinearityType = "linear",
+     param: Dict[str, Any] | None = None,
+ ):
+     param = param or {}
+     nonlinearity = resolve_nonlinearity(activation)
+     gain = resolve_gain(activation, param)
+
      def initializer_fn(tensor):
-         if init_type == 'xavier_uniform':
+         if init_type == "xavier_uniform":
              nn.init.xavier_uniform_(tensor, gain=gain)
-         elif init_type == 'xavier_normal':
+         elif init_type == "xavier_normal":
              nn.init.xavier_normal_(tensor, gain=gain)
-         elif init_type == 'kaiming_uniform':
-             nn.init.kaiming_uniform_(tensor, a=param.get('a', 0), nonlinearity=activation)
-         elif init_type == 'kaiming_normal':
-             nn.init.kaiming_normal_(tensor, a=param.get('a', 0), nonlinearity=activation)
-         elif init_type == 'orthogonal':
+         elif init_type == "kaiming_uniform":
+             nn.init.kaiming_uniform_(
+                 tensor, a=param.get("a", 0), nonlinearity=nonlinearity
+             )
+         elif init_type == "kaiming_normal":
+             nn.init.kaiming_normal_(
+                 tensor, a=param.get("a", 0), nonlinearity=nonlinearity
+             )
+         elif init_type == "orthogonal":
              nn.init.orthogonal_(tensor, gain=gain)
-         elif init_type == 'normal':
-             nn.init.normal_(tensor, mean=param.get('mean', 0.0), std=param.get('std', 0.0001))
-         elif init_type == 'uniform':
-             nn.init.uniform_(tensor, a=param.get('a', -0.05), b=param.get('b', 0.05))
+         elif init_type == "normal":
+             nn.init.normal_(
+                 tensor, mean=param.get("mean", 0.0), std=param.get("std", 0.0001)
+             )
+         elif init_type == "uniform":
+             nn.init.uniform_(tensor, a=param.get("a", -0.05), b=param.get("b", 0.05))
          else:
              raise ValueError(f"Unknown init_type: {init_type}")
      return tensor
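
A hedged usage sketch of the refactored initializer helper. It assumes get_initializer still returns the inner initializer_fn (that return statement falls outside this hunk, as in the previous version), and the nn.Linear layer is a stand-in module:

import torch.nn as nn

from nextrec.utils.initializer import get_initializer

layer = nn.Linear(64, 32)

# Custom activations (e.g. "dice") now resolve to a known nonlinearity / gain 1.0
# instead of being passed straight to torch.
init_fn = get_initializer(init_type="kaiming_normal", activation="relu", param={"a": 0})
init_fn(layer.weight)

# Gain can be overridden explicitly via param["gain"].
xavier_fn = get_initializer(init_type="xavier_uniform", activation="tanh", param={"gain": 1.0})
xavier_fn(layer.weight)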
nextrec/utils/optimizer.py CHANGED
@@ -8,16 +8,17 @@ Author: Yang Zhou, zyaztec@gmail.com
  import torch
  from typing import Iterable
 
+
  def get_optimizer(
      optimizer: str | torch.optim.Optimizer = "adam",
      params: Iterable[torch.nn.Parameter] | None = None,
-     **optimizer_params
+     **optimizer_params,
  ):
      if params is None:
          raise ValueError("params cannot be None. Please provide model parameters.")
 
-     if 'lr' not in optimizer_params:
-         optimizer_params['lr'] = 1e-3
+     if "lr" not in optimizer_params:
+         optimizer_params["lr"] = 1e-3
      if isinstance(optimizer, str):
          opt_name = optimizer.lower()
          if opt_name == "adam":
@@ -39,21 +40,36 @@
          raise TypeError(f"Invalid optimizer type: {type(optimizer)}")
      return optimizer_fn
 
+
  def get_scheduler(
-     scheduler: str | torch.optim.lr_scheduler._LRScheduler | torch.optim.lr_scheduler.LRScheduler | type[torch.optim.lr_scheduler._LRScheduler] | type[torch.optim.lr_scheduler.LRScheduler] | None,
+     scheduler: (
+         str
+         | torch.optim.lr_scheduler._LRScheduler
+         | torch.optim.lr_scheduler.LRScheduler
+         | type[torch.optim.lr_scheduler._LRScheduler]
+         | type[torch.optim.lr_scheduler.LRScheduler]
+         | None
+     ),
      optimizer,
-     **scheduler_params
+     **scheduler_params,
  ):
      if isinstance(scheduler, str):
          if scheduler == "step":
-             scheduler_fn = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_params)
+             scheduler_fn = torch.optim.lr_scheduler.StepLR(
+                 optimizer, **scheduler_params
+             )
          elif scheduler == "cosine":
-             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, **scheduler_params)
+             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(
+                 optimizer, **scheduler_params
+             )
          else:
              raise NotImplementedError(f"Unsupported scheduler: {scheduler}")
-     elif isinstance(scheduler, (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.LRScheduler)):
+     elif isinstance(
+         scheduler,
+         (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.LRScheduler),
+     ):
          scheduler_fn = scheduler
      else:
          raise TypeError(f"Invalid scheduler type: {type(scheduler)}")
-
+
      return scheduler_fn
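
A minimal sketch of the two helpers above; the nn.Linear model is a stand-in and the scheduler keyword arguments are the standard torch ones for the chosen scheduler:

import torch.nn as nn

from nextrec.utils.optimizer import get_optimizer, get_scheduler

model = nn.Linear(16, 1)

# lr defaults to 1e-3 when not supplied; extra kwargs pass through to the optimizer.
optimizer = get_optimizer("adam", params=model.parameters(), weight_decay=1e-5)

# "step" and "cosine" are the string options handled above.
scheduler = get_scheduler("cosine", optimizer, T_max=10)

for _ in range(10):
    optimizer.step()   # backward pass omitted in this sketch
    scheduler.step()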