nextrec 0.1.11-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +1 -2
- nextrec/basic/callback.py +1 -2
- nextrec/basic/features.py +39 -8
- nextrec/basic/layers.py +3 -4
- nextrec/basic/loggers.py +15 -10
- nextrec/basic/metrics.py +1 -2
- nextrec/basic/model.py +160 -125
- nextrec/basic/session.py +150 -0
- nextrec/data/__init__.py +13 -2
- nextrec/data/data_utils.py +74 -22
- nextrec/data/dataloader.py +513 -0
- nextrec/data/preprocessor.py +494 -134
- nextrec/loss/__init__.py +31 -24
- nextrec/loss/listwise.py +164 -0
- nextrec/loss/loss_utils.py +133 -106
- nextrec/loss/pairwise.py +105 -0
- nextrec/loss/pointwise.py +198 -0
- nextrec/models/match/dssm.py +26 -17
- nextrec/models/match/dssm_v2.py +20 -2
- nextrec/models/match/mind.py +18 -3
- nextrec/models/match/sdm.py +17 -2
- nextrec/models/match/youtube_dnn.py +23 -10
- nextrec/models/multi_task/esmm.py +8 -8
- nextrec/models/multi_task/mmoe.py +8 -8
- nextrec/models/multi_task/ple.py +8 -8
- nextrec/models/multi_task/share_bottom.py +8 -8
- nextrec/models/ranking/__init__.py +8 -0
- nextrec/models/ranking/afm.py +5 -4
- nextrec/models/ranking/autoint.py +6 -4
- nextrec/models/ranking/dcn.py +6 -4
- nextrec/models/ranking/deepfm.py +5 -4
- nextrec/models/ranking/dien.py +6 -4
- nextrec/models/ranking/din.py +6 -4
- nextrec/models/ranking/fibinet.py +6 -4
- nextrec/models/ranking/fm.py +6 -4
- nextrec/models/ranking/masknet.py +6 -4
- nextrec/models/ranking/pnn.py +6 -4
- nextrec/models/ranking/widedeep.py +6 -4
- nextrec/models/ranking/xdeepfm.py +6 -4
- nextrec/utils/__init__.py +7 -11
- nextrec/utils/embedding.py +2 -4
- nextrec/utils/initializer.py +4 -5
- nextrec/utils/optimizer.py +7 -8
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/METADATA +3 -3
- nextrec-0.2.2.dist-info/RECORD +53 -0
- nextrec/basic/dataloader.py +0 -447
- nextrec/loss/match_losses.py +0 -294
- nextrec/utils/common.py +0 -14
- nextrec-0.1.11.dist-info/RECORD +0 -51
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/WHEEL +0 -0
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/licenses/LICENSE +0 -0
nextrec/basic/session.py
ADDED
@@ -0,0 +1,150 @@
+"""Session and experiment utilities.
+
+This module centralizes session/experiment management so the rest of the
+framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
+
+Within that folder we keep model parameters, checkpoints, training metrics,
+evaluation metrics, and consolidated log output. When users do not provide an
+``experiment_id`` a timestamp-based identifier is generated once per process to
+avoid scattering files across multiple directories. Test runs are redirected to
+temporary folders so local trees are not polluted.
+
+Date: create on 23/11/2025
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+
+__all__ = [
+    "Session",
+    "resolve_save_path",
+    "create_session",
+]
+
+@dataclass(frozen=True)
+class Session:
+    """Encapsulate standard folders for a NextRec experiment."""
+
+    experiment_id: str
+    root: Path
+
+    @property
+    def logs_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def checkpoints_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def predictions_dir(self) -> Path:
+        return self._ensure_dir(self.root / "predictions")
+
+    @property
+    def processor_dir(self) -> Path:
+        return self._ensure_dir(self.root / "processor")
+
+    @property
+    def params_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def metrics_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    def save_text(self, name: str, content: str) -> Path:
+        """Convenience helper: write a text file under logs_dir."""
+        path = self.logs_dir / name
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(content, encoding="utf-8")
+        return path
+
+    @staticmethod
+    def _ensure_dir(path: Path) -> Path:
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+def create_session(experiment_id: str | Path | None = None) -> Session:
+    """Create a :class:`Session` instance with prepared directories."""
+
+    if experiment_id is not None and str(experiment_id).strip():
+        exp_id = str(experiment_id).strip()
+    else:
+        exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
+
+    if (
+        os.getenv("PYTEST_CURRENT_TEST")
+        or os.getenv("PYTEST_RUNNING")
+        or os.getenv("NEXTREC_TEST_MODE") == "1"
+    ):
+        session_path = Path(tempfile.gettempdir()) / "nextrec_logs" / exp_id
+    else:
+        # export NEXTREC_LOG_DIR=/data/nextrec/logs
+        base_dir = Path(os.getenv("NEXTREC_LOG_DIR", Path.cwd() / "nextrec_logs"))
+        session_path = base_dir / exp_id
+
+    session_path.mkdir(parents=True, exist_ok=True)
+    root = session_path.resolve()
+
+    return Session(experiment_id=exp_id, root=root)
+
+def resolve_save_path(
+    path: str | Path | None,
+    default_dir: str | Path,
+    default_name: str,
+    suffix: str,
+    add_timestamp: bool = False,
+) -> Path:
+    """
+    Normalize and create a save path.
+
+    - If ``path`` is ``None`` or has no suffix, place the file under
+      ``default_dir``.
+    - If ``path`` has no suffix, its stem is used as the file name; otherwise
+      ``default_name``.
+    - Relative paths with a suffix are also anchored under ``default_dir``.
+    - Enforces ``suffix`` (with leading dot) and optionally appends a
+      timestamp.
+    - Parent directories are created.
+    """
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None
+
+    normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
+
+    if path is not None and Path(path).suffix:
+        target = Path(path)
+        if not target.is_absolute():
+            target = Path(default_dir) / target
+        if target.suffix != normalized_suffix:
+            target = target.with_suffix(normalized_suffix)
+        if timestamp:
+            target = target.with_name(f"{target.stem}_{timestamp}{normalized_suffix}")
+        target.parent.mkdir(parents=True, exist_ok=True)
+        return target.resolve()
+
+    base_dir = Path(default_dir)
+    candidate = Path(path) if path is not None else None
+
+    if candidate is not None:
+        if candidate.exists() and candidate.is_dir():
+            base_dir = candidate
+            file_stem = default_name
+        else:
+            base_dir = candidate.parent if candidate.parent not in (Path("."), Path("")) else base_dir
+            file_stem = candidate.name or default_name
+    else:
+        file_stem = default_name
+
+    base_dir.mkdir(parents=True, exist_ok=True)
+    if timestamp:
+        file_stem = f"{file_stem}_{timestamp}"
+
+    return (base_dir / f"{file_stem}{normalized_suffix}").resolve()
+
+
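The session helpers above are self-contained, so the new layout can be exercised directly. A minimal usage sketch (not part of the diff; it assumes nextrec >= 0.2.2 is installed and that the module is importable as `nextrec.basic.session`, mirroring the file path shown above):

```python
from nextrec.basic.session import create_session, resolve_save_path

# Reuse a single experiment folder for all artifacts of this run.
session = create_session("demo_experiment")
print(session.root)             # <cwd>/nextrec_logs/demo_experiment (or under $NEXTREC_LOG_DIR)
print(session.predictions_dir)  # <root>/predictions, created on first access

# Write a small text artifact under the session folder.
notes = session.save_text("notes.txt", "baseline run\n")

# Normalize a checkpoint path: anchored under the session root, forced to the
# .pt suffix, with an optional timestamp appended to the file stem.
ckpt = resolve_save_path(
    path="best_model",
    default_dir=session.checkpoints_dir,
    default_name="model",
    suffix="pt",
    add_timestamp=True,
)
print(ckpt)  # e.g. <root>/best_model_20251123_101500.pt
```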
nextrec/data/__init__.py
CHANGED
@@ -4,16 +4,21 @@ Data utilities package for NextRec
 This package provides data processing and manipulation utilities.
 
 Date: create on 13/11/2025
-Author:
-Yang Zhou, zyaztec@gmail.com
+Author: Yang Zhou, zyaztec@gmail.com
 """
 
 from nextrec.data.data_utils import (
     collate_fn,
     get_column_data,
+    default_output_dir,
     split_dict_random,
     build_eval_candidates,
+    resolve_file_paths,
+    iter_file_chunks,
+    read_table,
+    load_dataframes,
 )
+from nextrec.basic.features import FeatureConfig
 
 # For backward compatibility, keep utils accessible
 from nextrec.data import data_utils
@@ -21,7 +26,13 @@ from nextrec.data import data_utils
 __all__ = [
     'collate_fn',
     'get_column_data',
+    'default_output_dir',
     'split_dict_random',
     'build_eval_candidates',
+    'resolve_file_paths',
+    'iter_file_chunks',
+    'read_table',
+    'load_dataframes',
+    'FeatureConfig',
     'data_utils',
 ]
nextrec/data/data_utils.py
CHANGED
@@ -1,30 +1,13 @@
-"""
-Data processing utilities for NextRec
-
-Date: create on 13/11/2025
-Author:
-Yang Zhou, zyaztec@gmail.com
-"""
+"""Data processing utilities for NextRec."""
 
 import torch
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq
+from pathlib import Path
 
 def collate_fn(batch):
-    """
-    Custom collate function for batching tuples of tensors.
-    Each element in batch is a tuple of tensors from FileDataset.
-
-    Examples:
-        # Single sample in batch
-        (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
-        # Batched output
-        (tensor([[1.0, 2.0], [3.0, 4.0]]),   # dense_features batch
-         tensor([[10, 20], [30, 40]]),       # sparse_features batch
-         tensor([[100, 200], [300, 400]]),   # sequence_features batch
-         tensor([1.0, 0.0])                  # labels batch)
-
-    """
+    """Collate a list of tensor tuples from ``FileDataset`` into batched tensors."""
     if not batch:
         return tuple()
 
@@ -33,7 +16,20 @@ def collate_fn(batch):
 
     for i in range(num_tensors):
         tensor_list = [item[i] for item in batch]
-
+        first = tensor_list[0]
+
+        if isinstance(first, torch.Tensor):
+            stacked = torch.cat(tensor_list, dim=0)
+        elif isinstance(first, np.ndarray):
+            stacked = np.concatenate(tensor_list, axis=0)
+        elif isinstance(first, list):
+            combined = []
+            for entry in tensor_list:
+                combined.extend(entry)
+            stacked = combined
+        else:
+            stacked = tensor_list
+
         result.append(stacked)
 
     return tuple(result)
@@ -53,6 +49,62 @@ def get_column_data(data: dict | pd.DataFrame, name: str):
     raise KeyError(f"Unsupported data type for extracting column {name}")
 
 
+def resolve_file_paths(path: str) -> tuple[list[str], str]:
+    """Resolve file or directory path into a sorted list of files and file type."""
+    path_obj = Path(path)
+
+    if path_obj.is_file():
+        file_type = path_obj.suffix.lower().lstrip(".")
+        assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+        return [str(path_obj)], file_type
+
+    if path_obj.is_dir():
+        collected_files = [p for p in path_obj.iterdir() if p.is_file()]
+        csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
+        parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+
+        if csv_files and parquet_files:
+            raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+        file_paths = csv_files if csv_files else parquet_files
+        if not file_paths:
+            raise ValueError(f"No CSV or Parquet files found in directory: {path}")
+        file_paths.sort()
+        file_type = "csv" if csv_files else "parquet"
+        return file_paths, file_type
+
+    raise ValueError(f"Invalid path: {path}")
+
+
+def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
+    """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
+    if file_type == "csv":
+        yield from pd.read_csv(file_path, chunksize=chunk_size)
+        return
+    parquet_file = pq.ParquetFile(file_path)
+    for batch in parquet_file.iter_batches(batch_size=chunk_size):
+        yield batch.to_pandas()
+
+
+def read_table(file_path: str, file_type: str) -> pd.DataFrame:
+    """Read a single CSV/Parquet file."""
+    if file_type == "csv":
+        return pd.read_csv(file_path)
+    return pd.read_parquet(file_path)
+
+
+def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
+    """Load multiple files of the same type into DataFrames."""
+    return [read_table(fp, file_type) for fp in file_paths]
+
+
+def default_output_dir(path: str) -> Path:
+    """Generate a default output directory path based on the input path."""
+    path_obj = Path(path)
+    if path_obj.is_file():
+        return path_obj.parent / f"{path_obj.stem}_preprocessed"
+    return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
 def split_dict_random(data_dict: dict, test_size: float=0.2, random_state:int|None=None):
     """Randomly split a dictionary of data into training and testing sets."""
     lengths = [len(v) for v in data_dict.values()]