nextrec 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nextrec/basic/summary.py CHANGED
@@ -13,10 +13,13 @@ import logging
 from typing import Any, Literal
 
 import numpy as np
+import pandas as pd
+import polars as pl
 from torch.utils.data import DataLoader
 
 from nextrec.basic.loggers import colorize, format_kv
-from nextrec.data.data_processing import extract_label_arrays, get_data_length
+from nextrec.data.data_processing import get_column_data, get_data_length
+from nextrec.utils.torch_utils import to_numpy
 from nextrec.utils.types import TaskTypeName
 
 
@@ -82,9 +85,23 @@ class SummarySet:
         if train_size is None:
             train_size = get_data_length(data)
 
-        labels = extract_label_arrays(dataset, self.target_columns)
-        if labels is None:
-            labels = extract_label_arrays(data, self.target_columns)
+        labels = None
+        if self.target_columns:
+            for source in (dataset, data):
+                if source is None:
+                    continue
+                label_source = source.labels if hasattr(source, "labels") else source  # type: ignore
+                if not isinstance(label_source, (dict, pd.DataFrame, pl.DataFrame)):
+                    continue
+                label_map = {}
+                for name in self.target_columns:
+                    column = get_column_data(label_source, name)  # type: ignore
+                    if column is None:
+                        continue
+                    label_map[name] = to_numpy(column)
+                labels = label_map or None
+                if labels:
+                    break
 
         summary = {}
         if train_size is not None:
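For context, 0.5.2 inlines the label collection that extract_label_arrays used to do: each candidate source is checked in turn, each target column is fetched with get_column_data and converted with to_numpy, and missing columns are skipped. A minimal self-contained sketch of the same pattern, using pandas/numpy stand-ins for those two helpers and a hypothetical frame:

import numpy as np
import pandas as pd

# Hypothetical label source and targets, mirroring the loop added in summary.py.
frame = pd.DataFrame({"click": [0, 1, 1], "like": [1, 0, 1]})
target_columns = ["click", "like", "watch_time"]  # "watch_time" is absent

label_map = {}
for name in target_columns:
    column = frame[name] if name in frame.columns else None  # stand-in for get_column_data
    if column is None:
        continue  # missing targets are skipped, not errors
    label_map[name] = np.asarray(column)  # stand-in for to_numpy

labels = label_map or None  # an empty map collapses to None, as in the diff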
nextrec/cli.py CHANGED
@@ -14,7 +14,7 @@ Examples:
     nextrec --mode=predict --predict_config=nextrec_cli_preset/predict_config.yaml
 
 Date: create on 06/12/2025
-Checkpoint: edit on 18/12/2025
+Checkpoint: edit on 29/01/2026
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
@@ -112,10 +112,10 @@ def train_model(train_config_path: str) -> None:
     # train data
     data_path = resolve_path(data_cfg["path"], config_dir)
     target = to_list(data_cfg["target"])
-    file_paths: List[str] = []
-    file_type: str | None = None
-    streaming_train_files: List[str] | None = None
-    streaming_valid_files: List[str] | None = None
+    file_paths = []
+    file_type = None
+    streaming_train_files = None
+    streaming_valid_files = None
 
     feature_cfg_path = resolve_path(
         cfg.get("feature_config", "feature_config.yaml"), config_dir
@@ -251,7 +251,6 @@ def train_model(train_config_path: str) -> None:
     processor.fit_from_files(
         file_paths=streaming_train_files or file_paths,
         file_type=file_type,
-        chunk_size=dataloader_chunk_size,
     )
     processed = None
     df = None  # type: ignore[assignment]
@@ -653,6 +652,7 @@ def predict_model(predict_config_path: str) -> None:
     streaming = bool(predict_cfg.get("streaming", True))
     chunk_size = int(predict_cfg.get("chunk_size", 20000))
     batch_size = int(predict_cfg.get("batch_size", 512))
+    num_processes = int(predict_cfg.get("num_processes", 1))
     effective_batch_size = chunk_size if streaming else batch_size
 
     log_cli_section("Data")
@@ -668,17 +668,35 @@ def predict_model(predict_config_path: str) -> None:
             ("Batch size", effective_batch_size),
             ("Chunk size", chunk_size),
             ("Streaming", streaming),
+            ("Num processes", num_processes),
         ]
     )
+    if num_processes > 1 and predict_cfg.get("num_workers", 0) != 0:
+        logger.info("")
+        logger.info(
+            "[NextRec CLI Info] Multi-process streaming enforces num_workers=0 for each shard."
+        )
     logger.info("")
-    pred_loader = rec_dataloader.create_dataloader(
-        data=str(data_path),
-        batch_size=1 if streaming else batch_size,
-        shuffle=False,
-        streaming=streaming,
-        chunk_size=chunk_size,
-        prefetch_factor=predict_cfg.get("prefetch_factor"),
-    )
+    if num_processes > 1:
+        if not streaming:
+            raise ValueError(
+                "[NextRec CLI Error] num_processes > 1 requires streaming=true."
+            )
+        if use_onnx:
+            raise ValueError(
+                "[NextRec CLI Error] num_processes > 1 is not supported with ONNX inference."
+            )
+        pred_data = str(data_path)
+    else:
+        pred_data = rec_dataloader.create_dataloader(
+            data=str(data_path),
+            batch_size=1 if streaming else batch_size,
+            shuffle=False,
+            streaming=streaming,
+            chunk_size=chunk_size,
+            num_workers=predict_cfg.get("num_workers", 0),
+            prefetch_factor=predict_cfg.get("prefetch_factor"),
+        )
 
     save_format = predict_cfg.get(
         "save_data_format", predict_cfg.get("save_format", "csv")
@@ -697,7 +715,7 @@ def predict_model(predict_config_path: str) -> None:
     if use_onnx:
         result = model.predict_onnx(
             onnx_path=onnx_path,
-            data=pred_loader,
+            data=pred_data,
             batch_size=effective_batch_size,
             include_ids=bool(id_columns),
             return_dataframe=False,
@@ -707,13 +725,14 @@ def predict_model(predict_config_path: str) -> None:
         )
     else:
         result = model.predict(
-            data=pred_loader,
+            data=pred_data,
             batch_size=effective_batch_size,
-            include_ids=bool(id_columns),
             return_dataframe=False,
             save_path=str(save_path),
             save_format=save_format,
             num_workers=predict_cfg.get("num_workers", 0),
+            num_processes=num_processes,
+            processor=processor,
         )
     duration = time.time() - start
     # When return_dataframe=False, result is the actual file path
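The predict path now branches on a num_processes config key: with more than one process it validates the streaming/ONNX constraints and hands the raw file path to model.predict, which shards the work itself; otherwise it builds a DataLoader as before. A minimal sketch of that gating, with stand-in values in place of the CLI's parsed config:

# Hypothetical config dict standing in for predict_cfg loaded from the YAML file.
predict_cfg = {"streaming": True, "num_processes": 4, "num_workers": 2}

streaming = bool(predict_cfg.get("streaming", True))
num_processes = int(predict_cfg.get("num_processes", 1))
use_onnx = False

if num_processes > 1:
    # Each shard runs in its own process, so per-shard loader workers are forced to 0.
    if not streaming:
        raise ValueError("num_processes > 1 requires streaming=true.")
    if use_onnx:
        raise ValueError("num_processes > 1 is not supported with ONNX inference.")
    pred_data = "/path/to/predict_data"  # raw path; sharding happens downstream
else:
    pred_data = None  # here the CLI would build a DataLoader instead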
nextrec/data/__init__.py CHANGED
@@ -1,52 +0,0 @@
-from nextrec.basic.features import FeatureSet
-from nextrec.data import data_utils
-from nextrec.data.batch_utils import batch_to_dict, collate_fn, stack_section
-from nextrec.data.data_processing import (
-    build_eval_candidates,
-    get_column_data,
-    get_user_ids,
-    split_dict_random,
-)
-from nextrec.data.dataloader import (
-    FileDataset,
-    RecDataLoader,
-    TensorDictDataset,
-    build_tensors_from_data,
-)
-from nextrec.data.preprocessor import DataProcessor
-from nextrec.utils.data import (
-    default_output_dir,
-    iter_file_chunks,
-    load_dataframes,
-    read_table,
-    resolve_file_paths,
-)
-
-__all__ = [
-    # Batch utilities
-    "collate_fn",
-    "batch_to_dict",
-    "stack_section",
-    # Data processing
-    "get_column_data",
-    "split_dict_random",
-    "build_eval_candidates",
-    "get_user_ids",
-    # File utilities
-    "resolve_file_paths",
-    "iter_file_chunks",
-    "read_table",
-    "load_dataframes",
-    "default_output_dir",
-    # DataLoader
-    "TensorDictDataset",
-    "FileDataset",
-    "RecDataLoader",
-    "build_tensors_from_data",
-    # Preprocessor
-    "DataProcessor",
-    # Features
-    "FeatureSet",
-    # Legacy module
-    "data_utils",
-]
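Emptying nextrec/data/__init__.py removes all of the package-root re-exports, so callers presumably import from the defining submodules instead. A before/after sketch, assuming the submodule layout shown in the removed file is otherwise unchanged:

# Before (0.5.0): names re-exported at the package root.
# from nextrec.data import DataProcessor, RecDataLoader, collate_fn

# From 0.5.2 on: import from the defining modules named in the removed __init__.
from nextrec.data.preprocessor import DataProcessor
from nextrec.data.dataloader import RecDataLoader
from nextrec.data.batch_utils import collate_fn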
nextrec/data/batch_utils.py CHANGED
@@ -64,7 +64,7 @@ def collate_fn(batch):
     first = batch[0]
     if isinstance(first, dict) and "features" in first:
         # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
-        if first.get("_already_batched") and len(batch) == 1:
+        if first.get("stream_mode") and len(batch) == 1:
             return {
                 "features": first.get("features", {}),
                 "labels": first.get("labels"),
nextrec/data/data_processing.py CHANGED
@@ -6,7 +6,6 @@ Checkpoint: edit on 25/12/2025
 Author: Yang Zhou, zyaztec@gmail.com
 """
 
-import hashlib
 from typing import Any
 
 import numpy as np
@@ -15,9 +14,6 @@ import torch
 import polars as pl
 
 
-from nextrec.utils.torch_utils import to_numpy
-
-
 def get_column_data(data: dict | pd.DataFrame | pl.DataFrame, name: str):
 
     if isinstance(data, dict):
@@ -32,8 +28,7 @@ def get_column_data(data: dict | pd.DataFrame | pl.DataFrame, name: str):
 
 
 def get_data_length(data: Any) -> int | None:
-    if data is None:
-        return None
+
     if isinstance(data, pd.DataFrame):
         return len(data)
     if isinstance(data, pl.DataFrame):
@@ -43,33 +38,9 @@ def get_data_length(data: Any) -> int | None:
             return None
         sample_key = next(iter(data))
         return len(data[sample_key])
-    try:
-        return len(data)
-    except TypeError:
-        return None
-
-
-def extract_label_arrays(
-    data: Any, target_columns: list[str]
-) -> dict[str, np.ndarray] | None:
-    if not target_columns or data is None:
-        return None
-
-    if isinstance(data, (dict, pd.DataFrame)):
-        label_source = data
-    elif hasattr(data, "labels"):
-        label_source = data.labels
     else:
         return None
 
-    labels: dict[str, np.ndarray] = {}
-    for name in target_columns:
-        column = get_column_data(label_source, name)
-        if column is None:
-            continue
-        labels[name] = to_numpy(column)
-    return labels or None
-
 
 def split_dict_random(data_dict, test_size=0.2, random_state=None):
 
@@ -202,8 +173,3 @@ def get_user_ids(
         return arr.reshape(arr.shape[0])
 
     return None
-
-
-def hash_md5_mod(value: str, hash_size: int) -> int:
-    digest = hashlib.md5(value.encode("utf-8")).digest()
-    return int.from_bytes(digest, byteorder="big", signed=False) % hash_size
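Two helpers leave data_processing.py here: extract_label_arrays (its logic now lives inline in summary.py) and the unused hash_md5_mod. get_data_length also tightens up: it now counts only pandas/polars frames and dicts of columns, where previously any object supporting len() was accepted via try/except. A standalone sketch of the resulting behavior, under that reading of the hunk:

from typing import Any

import pandas as pd
import polars as pl

def get_data_length_sketch(data: Any) -> int | None:
    # DataFrames report their row count directly.
    if isinstance(data, (pd.DataFrame, pl.DataFrame)):
        return len(data)
    # A dict of columns reports the length of its first column.
    if isinstance(data, dict):
        if not data:
            return None
        sample_key = next(iter(data))
        return len(data[sample_key])
    return None

assert get_data_length_sketch({"y": [1, 2, 3]}) == 3
assert get_data_length_sketch([1, 2, 3]) is None  # plain sequences no longer counted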
nextrec/data/data_utils.py CHANGED
@@ -15,9 +15,7 @@ from nextrec.data.data_processing import (
     split_dict_random,
 )
 from nextrec.utils.data import (
-    default_output_dir,
     iter_file_chunks,
-    load_dataframes,
     read_table,
     resolve_file_paths,
 )
@@ -36,6 +34,4 @@ __all__ = [
     "resolve_file_paths",
    "iter_file_chunks",
     "read_table",
-    "load_dataframes",
-    "default_output_dir",
 ]