nextrec 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nextrec-0.2.2 → nextrec-0.2.4}/.github/workflows/publish.yml +2 -2
- {nextrec-0.2.2 → nextrec-0.2.4}/.gitignore +1 -1
- {nextrec-0.2.2 → nextrec-0.2.4}/.readthedocs.yaml +3 -3
- {nextrec-0.2.2 → nextrec-0.2.4}/PKG-INFO +2 -2
- {nextrec-0.2.2 → nextrec-0.2.4}/README.md +1 -1
- {nextrec-0.2.2 → nextrec-0.2.4}/README_zh.md +1 -1
- nextrec-0.2.4/docs/rtd/conf.py +39 -0
- nextrec-0.2.4/docs/rtd/index.md +157 -0
- nextrec-0.2.4/docs/rtd/requirements.txt +3 -0
- nextrec-0.2.4/docs/zh//345/277/253/351/200/237/344/270/212/346/211/213.md +97 -0
- nextrec-0.2.4/nextrec/__version__.py +1 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/features.py +2 -1
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/model.py +2 -2
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/__init__.py +2 -4
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/dataloader.py +3 -3
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/preprocessor.py +2 -2
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/autoint.py +51 -7
- nextrec-0.2.4/nextrec/models/ranking/masknet.py +319 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/pyproject.toml +1 -1
- {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/movielen_ranking_deepfm.py +6 -2
- nextrec-0.2.2/docs/conf.py +0 -42
- nextrec-0.2.2/docs/index.rst +0 -172
- nextrec-0.2.2/docs/requirements.txt +0 -2
- nextrec-0.2.2/nextrec/__version__.py +0 -1
- nextrec-0.2.2/nextrec/models/ranking/masknet.py +0 -127
- {nextrec-0.2.2 → nextrec-0.2.4}/.github/workflows/tests.yml +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/CODE_OF_CONDUCT.md +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/CONTRIBUTING.md +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/LICENSE +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/MANIFEST.in +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/dataset/match_task.csv +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/dataset/movielens_100k.csv +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/dataset/multitask_task.csv +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/dataset/ranking_task.csv +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/Makefile +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/make.bat +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/modules.rst +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.basic.rst +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.data.rst +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.loss.rst +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.rst +0 -0
- {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.utils.rst +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/activation.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/callback.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/layers.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/loggers.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/metrics.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/session.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/data_utils.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/listwise.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/loss_utils.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/pairwise.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/pointwise.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/generative/hstu.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/generative/tiger.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/dssm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/dssm_v2.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/mind.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/sdm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/youtube_dnn.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/esmm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/mmoe.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/ple.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/share_bottom.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/afm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/dcn.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/deepfm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/dien.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/din.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/fibinet.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/fm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/pnn.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/widedeep.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/xdeepfm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/embedding.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/initializer.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/optimizer.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/pytest.ini +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/requirements.txt +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/__init__.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/conftest.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/run_tests.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_data_preprocessor.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_dataloader.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_layers.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_losses.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_match_models.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_multitask_models.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_ranking_models.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test/test_utils.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/test_requirements.txt +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_match_dssm.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_multitask.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_ranking_din.py +0 -0
- {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/movielen_match_dssm.py +0 -0
|
@@ -8,7 +8,7 @@ on:
|
|
|
8
8
|
workflow_dispatch:
|
|
9
9
|
|
|
10
10
|
jobs:
|
|
11
|
-
# dev
|
|
11
|
+
# dev -> TestPyPI
|
|
12
12
|
publish-to-testpypi:
|
|
13
13
|
if: github.ref == 'refs/heads/dev'
|
|
14
14
|
runs-on: ubuntu-latest
|
|
@@ -36,7 +36,7 @@ jobs:
|
|
|
36
36
|
run: |
|
|
37
37
|
twine upload --verbose --repository testpypi dist/*
|
|
38
38
|
|
|
39
|
-
# main
|
|
39
|
+
# main -> PyPI
|
|
40
40
|
publish-to-pypi:
|
|
41
41
|
if: github.ref == 'refs/heads/main'
|
|
42
42
|
runs-on: ubuntu-latest
|
|
@@ -12,12 +12,12 @@ build:
|
|
|
12
12
|
|
|
13
13
|
# Build documentation in the "docs/" directory with Sphinx
|
|
14
14
|
sphinx:
|
|
15
|
-
configuration: docs/conf.py
|
|
15
|
+
configuration: docs/rtd/conf.py
|
|
16
16
|
|
|
17
17
|
# Optionally, but recommended,
|
|
18
18
|
# declare the Python requirements required to build your documentation
|
|
19
19
|
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
|
20
20
|
python:
|
|
21
21
|
install:
|
|
22
|
-
- requirements: docs/requirements.txt
|
|
23
|
-
|
|
22
|
+
- requirements: docs/rtd/requirements.txt
|
|
23
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nextrec
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
|
|
5
5
|
Project-URL: Homepage, https://github.com/zerolovesea/NextRec
|
|
6
6
|
Project-URL: Repository, https://github.com/zerolovesea/NextRec
|
|
@@ -61,7 +61,7 @@ Description-Content-Type: text/markdown
|
|
|
61
61
|

|
|
62
62
|

|
|
63
63
|

|
|
64
|
-

|
|
65
65
|
|
|
66
66
|
English | [中文版](README_zh.md)
|
|
67
67
|
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|

|
|
6
6
|

|
|
7
7
|

|
|
8
|
-

|
|
9
9
|
|
|
10
10
|
English | [中文版](README_zh.md)
|
|
11
11
|
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|

|
|
6
6
|

|
|
7
7
|

|
|
8
|
-

|
|
9
9
|
|
|
10
10
|
[English Version](README.md) | 中文版
|
|
11
11
|
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Sphinx configuration for building docs on Read the Docs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
|
9
|
+
sys.path.insert(0, str(PROJECT_ROOT / "nextrec"))
|
|
10
|
+
|
|
11
|
+
project = "NextRec"
|
|
12
|
+
copyright = "2025, Yang Zhou"
|
|
13
|
+
author = "Yang Zhou"
|
|
14
|
+
release = "0.2.4"
|
|
15
|
+
|
|
16
|
+
extensions = [
|
|
17
|
+
"myst_parser",
|
|
18
|
+
"sphinx.ext.autodoc",
|
|
19
|
+
"sphinx.ext.napoleon",
|
|
20
|
+
"sphinx_rtd_theme",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
source_suffix = {
|
|
24
|
+
".rst": "restructuredtext",
|
|
25
|
+
".md": "markdown",
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
templates_path = ["_templates"]
|
|
29
|
+
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
|
|
30
|
+
|
|
31
|
+
html_theme = "sphinx_rtd_theme"
|
|
32
|
+
html_static_path = ["_static"]
|
|
33
|
+
|
|
34
|
+
autodoc_default_options = {
|
|
35
|
+
"members": True,
|
|
36
|
+
"undoc-members": True,
|
|
37
|
+
"special-members": "__init__, __iter__",
|
|
38
|
+
"private-members": True,
|
|
39
|
+
}
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# NextRec Documentation
|
|
2
|
+
|
|
3
|
+
NextRec is a unified recommendation framework built on PyTorch. It offers modular feature definitions, a reproducible data processing pipeline, and a standard training engine that already powers ranking, retrieval, multi-task, and emerging generative recommendation models.
|
|
4
|
+
|
|
5
|
+
## What you get
|
|
6
|
+
- Unified interface for ranking, retrieval, multi-task, and early generative recommenders (TIGER, HSTU in progress).
|
|
7
|
+
- Ready-to-use feature abstractions: `DenseFeature`, `SparseFeature`, `SequenceFeature`.
|
|
8
|
+
- End-to-end training loop with `compile`, `fit`, `evaluate`, `predict`, checkpoints, metrics, and early stopping.
|
|
9
|
+
- DataProcessor for repeatable numeric/sparse/sequence/target handling with save/load support.
|
|
10
|
+
- GPU/MPS ready; tutorials and runnable scripts under `tutorials/`.
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
Using uv (recommended):
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone https://github.com/zerolovesea/NextRec.git
|
|
17
|
+
cd NextRec
|
|
18
|
+
pip install uv
|
|
19
|
+
uv sync
|
|
20
|
+
source .venv/bin/activate
|
|
21
|
+
uv pip install -e .
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Using pip:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
git clone https://github.com/zerolovesea/NextRec.git
|
|
28
|
+
cd NextRec
|
|
29
|
+
pip install -r requirements.txt
|
|
30
|
+
pip install -r test_requirements.txt
|
|
31
|
+
pip install -e .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## 5-minute quick start (DeepFM)
|
|
35
|
+
Train and predict on MovieLens-style data:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
import pandas as pd
|
|
39
|
+
from nextrec.models.ranking.deepfm import DeepFM
|
|
40
|
+
from nextrec.basic.features import DenseFeature, SparseFeature
|
|
41
|
+
|
|
42
|
+
df = pd.read_csv("dataset/movielens_100k.csv")
|
|
43
|
+
|
|
44
|
+
dense_features = [DenseFeature("age")]
|
|
45
|
+
sparse_features = [
|
|
46
|
+
SparseFeature("user_id", vocab_size=df["user_id"].max() + 1, embedding_dim=4),
|
|
47
|
+
SparseFeature("item_id", vocab_size=df["item_id"].max() + 1, embedding_dim=4),
|
|
48
|
+
SparseFeature("gender", vocab_size=df["gender"].max() + 1, embedding_dim=4),
|
|
49
|
+
SparseFeature("occupation", vocab_size=df["occupation"].max() + 1, embedding_dim=4),
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
model = DeepFM(
|
|
53
|
+
dense_features=dense_features,
|
|
54
|
+
sparse_features=sparse_features,
|
|
55
|
+
target="label",
|
|
56
|
+
device="cpu",
|
|
57
|
+
session_id="deepfm_demo",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
model.compile(
|
|
61
|
+
optimizer="adam",
|
|
62
|
+
optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
|
|
63
|
+
loss="bce",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
model.fit(
|
|
67
|
+
train_data=df,
|
|
68
|
+
metrics=["auc", "recall", "precision"],
|
|
69
|
+
epochs=5,
|
|
70
|
+
batch_size=512,
|
|
71
|
+
shuffle=True,
|
|
72
|
+
verbose=1,
|
|
73
|
+
validation_split=0.1,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
preds = model.predict(df)
|
|
77
|
+
print(preds[:5])
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Core API guide
|
|
81
|
+
Feature definitions (`nextrec.basic.features`):
|
|
82
|
+
|
|
83
|
+
- `DenseFeature(name, embedding_dim=1)` for continuous values.
|
|
84
|
+
- `SparseFeature(name, vocab_size, embedding_dim=auto, padding_idx=None, l1_reg=0.0, l2_reg=1e-5, trainable=True)` for categorical ids.
|
|
85
|
+
- `SequenceFeature(name, vocab_size, max_len=20, combiner="mean", padding_idx=None, l1_reg=0.0, l2_reg=1e-5, trainable=True)` for histories with pooling.
|
|
86
|
+
|
|
87
|
+
Data processing (`nextrec.data.preprocessor.DataProcessor`):
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from nextrec.data.preprocessor import DataProcessor
|
|
91
|
+
|
|
92
|
+
processor = DataProcessor()
|
|
93
|
+
processor.add_numeric_feature("age", scaler="standard")
|
|
94
|
+
processor.add_sparse_feature("user_id", encode_method="label")
|
|
95
|
+
processor.add_sequence_feature("item_history", encode_method="hash", hash_size=5000, max_len=50, pad_value=0)
|
|
96
|
+
processor.add_target("label", target_type="binary")
|
|
97
|
+
|
|
98
|
+
processor.fit(train_df) # learns scalers/encoders
|
|
99
|
+
train_arr = processor.transform(train_df) # dict -> numpy arrays
|
|
100
|
+
vocab_sizes = processor.get_vocab_sizes() # useful for embedding dims
|
|
101
|
+
processor.save("processor.pkl") # persist for serving
|
|
102
|
+
processor = DataProcessor.load("processor.pkl")
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Training workflow (`nextrec.basic.model.BaseModel` interface)
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
model.compile(
|
|
109
|
+
optimizer="adam", # str, class, or instance
|
|
110
|
+
optimizer_params={"lr": 1e-3},
|
|
111
|
+
scheduler="steplr", # optional torch scheduler name/class/instance
|
|
112
|
+
scheduler_params={"step_size": 3, "gamma": 0.5},
|
|
113
|
+
loss="bce", # per-task loss or list
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
model.fit(
|
|
117
|
+
train_data=train_df_or_loader, # dict, DataFrame, or DataLoader
|
|
118
|
+
valid_data=valid_df_or_loader, # optional validation split
|
|
119
|
+
metrics=["auc", "logloss"], # or {"label": ["auc", "logloss"]}
|
|
120
|
+
epochs=10,
|
|
121
|
+
batch_size=256,
|
|
122
|
+
shuffle=True,
|
|
123
|
+
verbose=1,
|
|
124
|
+
validation_split=0.1, # auto split when valid_data is None
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
scores = model.evaluate(valid_df_or_loader) # returns metric dict
|
|
128
|
+
preds = model.predict(test_df_or_loader) # numpy array or dict
|
|
129
|
+
model.save_weights("checkpoint.model")
|
|
130
|
+
model.load_weights("checkpoint.model", map_location="cpu")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Model zoo (`nextrec.models`)
|
|
134
|
+
- Ranking: FM, AFM, DeepFM, Wide&Deep, xDeepFM, FiBiNET, PNN, AutoInt, DCN, DIN, DIEN, MaskNet.
|
|
135
|
+
- Retrieval: DSSM, DSSM v2 (pairwise), YouTube DNN, MIND, SDM.
|
|
136
|
+
- Multi-task: MMOE, PLE, ESMM, ShareBottom.
|
|
137
|
+
- Generative (in progress): TIGER, HSTU.
|
|
138
|
+
|
|
139
|
+
## Tutorials and scripts
|
|
140
|
+
- Ready-to-run examples live in `tutorials/` (e.g., `movielen_ranking_deepfm.py`, `example_multitask.py`).
|
|
141
|
+
- Datasets used in samples live in `dataset/`. Check `README.md` and `README_zh.md` for dataset prep and more examples.
|
|
142
|
+
|
|
143
|
+
## Contents
|
|
144
|
+
|
|
145
|
+
```{toctree}
|
|
146
|
+
:maxdepth: 2
|
|
147
|
+
:caption: Contents
|
|
148
|
+
|
|
149
|
+
modules
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## API reference stub
|
|
153
|
+
|
|
154
|
+
```{automodule} nextrec
|
|
155
|
+
:members:
|
|
156
|
+
:noindex:
|
|
157
|
+
```
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
> 本文演示如何用 NextRec 从零到一训练并构建一个可上线的推荐模型。示例基于仓库自带的 `dataset/movielens_100k.csv`和`dataset/match_task.csv`实现。
|
|
2
|
+
|
|
3
|
+
## 1. 环境与数据准备
|
|
4
|
+
|
|
5
|
+
- 依赖:Python 3.10+、PyTorch 1.10+。
|
|
6
|
+
- 安装:`pip install nextrec`(或仓库根目录 `pip install -e .` 以开发模式安装)。
|
|
7
|
+
- 数据格式:CSV 或 Parquet 均可,通常包含用户特征、物品特征、行为序列及监督标签(如 `label`、`click`)。
|
|
8
|
+
|
|
9
|
+
## 2. 关于特征
|
|
10
|
+
|
|
11
|
+
在上手之前,先介绍一些推荐系统的概念。在推荐系统中,通常会处理多种类型的输入信号,在经过一系列的变换之后转化为向量输入网络:
|
|
12
|
+
|
|
13
|
+
- 稠密特征(数值型):连续或可序数化的数值,如年龄、价格、时长、打分;常见做法是标准化/归一化或对数变换。
|
|
14
|
+
- 稀疏特征(类别/ID):高基数离散字段,如用户 ID、物品 ID、性别、职业、设备类型;通常需要索引化后,在一个embedding lookup matrix中进行嵌入。
|
|
15
|
+
- 序列特征(行为序列):可变长的历史行为,如用户的浏览/点击/购买列表。这类特征表征了用户的行为和兴趣变化,通常我们需要截断、padding,嵌入后通过不同聚合方式(如 mean/sum/attention)将其变为定长向量。
|
|
16
|
+
- 上下文特征:时间、地理、曝光位置等环境信息,可是稠密也可能是稀疏,常与主特征交互。
|
|
17
|
+
- 多模态特征:文本、图片、视频等经过预训练模型得到的向量,可直接作为稠密输入,或与 ID 交互建模。
|
|
18
|
+
|
|
19
|
+
通常一个标准的训练数据格式如下所示:
|
|
20
|
+
|
|
21
|
+
```text
|
|
22
|
+
user_id,item_id,gender,age,occupation,history_seq,label
|
|
23
|
+
1024,501,1,28,3,"[12,45,18,77]",1
|
|
24
|
+
2048,777,0,35,5,"[8,99]",0
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## 3. 训练一个排序模型(DeepFM)
|
|
28
|
+
|
|
29
|
+
接下来,我们通过一个简单的模型,指导大家如何使用NextRec在movielens数据集上训练一个DeepFM模型。首先,需要将不同的特征进行定义。
|
|
30
|
+
|
|
31
|
+
对于稀疏特征,我们需要定义词表大小`vocab_size`和嵌入层大小`embedding_dim`,嵌入层id`embedding_name`,对于稠密特征,需要定义是否需要线性变换及变换后的维度。
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from sklearn.model_selection import train_test_split
|
|
36
|
+
|
|
37
|
+
from nextrec.basic.features import DenseFeature, SparseFeature
|
|
38
|
+
from nextrec.models.ranking.deepfm import DeepFM
|
|
39
|
+
|
|
40
|
+
df = pd.read_csv("dataset/movielens_100k.csv")
|
|
41
|
+
|
|
42
|
+
dense_features = [DenseFeature('age')]
|
|
43
|
+
sparse_features = [
|
|
44
|
+
SparseFeature('user_id', vocab_size=df['user_id'].max() + 1, embedding_dim=16),
|
|
45
|
+
SparseFeature('item_id', vocab_size=df['item_id'].max() + 1, embedding_dim=16),
|
|
46
|
+
SparseFeature('gender', vocab_size=df['gender'].max() + 1, embedding_dim=4),
|
|
47
|
+
SparseFeature('occupation', vocab_size=df['occupation'].max() + 1, embedding_dim=8),
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=2024)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
在定义特征后,我只需要实例化需要的模型,随后为模型配置所需要的训练参数。在训练时,模型内部会组装dataloader并进行训练,并可以输出需要的训练指标。
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
model = DeepFM(
|
|
57
|
+
dense_features=dense_features,
|
|
58
|
+
sparse_features=sparse_features,
|
|
59
|
+
mlp_params={"dims": [256, 128], "activation": "relu", "dropout": 0.2},
|
|
60
|
+
target='label',
|
|
61
|
+
device='cpu',
|
|
62
|
+
session_id="movielens_deepfm" # 通过设置session id,来管理不同实验的日志
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
model.compile(
|
|
66
|
+
optimizer="adam",
|
|
67
|
+
optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
|
|
68
|
+
loss='binary_crossentropy',
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
model.fit(
|
|
72
|
+
train_data=train_df,
|
|
73
|
+
valid_data=valid_df,
|
|
74
|
+
metrics=['auc', 'recall','precision'],
|
|
75
|
+
epochs=1,
|
|
76
|
+
batch_size=512,
|
|
77
|
+
shuffle=True
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
- `metrics` 支持 `auc`/`logloss`/`accuracy`/`gauc` 等,使用 GAUC 时传入 `user_id_column='user_id'`。
|
|
82
|
+
- 训练会自动早停,并在 `session_id` 对应目录下保存最佳权重。
|
|
83
|
+
|
|
84
|
+
## 4. 推理与评估
|
|
85
|
+
|
|
86
|
+
训练完成后,用户可以进行批量预测。NextRec支持不同的推理数据格式,包括csv、parquet、文件路径、字典、dataframe,以及符合要求的dataloader。
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
# 批量预测
|
|
90
|
+
preds = model.predict(valid_df, batch_size=512)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
- 保存预测:`model.predict(..., save_path="outputs/preds", save_format="csv")`。
|
|
94
|
+
- 评估接口:`model.evaluate(valid_df, metrics=['auc', 'gauc'], user_id_column='user_id')`。
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
现在你已完成从数据预处理到训练、评估、保存与加载的全流程,可以替换为自己的数据和模型配置,快速构建可上线的推荐系统。
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.4"
|
|
@@ -83,7 +83,7 @@ class DenseFeature(BaseFeature):
|
|
|
83
83
|
self.embedding_dim = embedding_dim
|
|
84
84
|
|
|
85
85
|
|
|
86
|
-
class
|
|
86
|
+
class FeatureSpecMixin:
|
|
87
87
|
"""
|
|
88
88
|
Mixin that normalizes dense/sparse/sequence feature lists and target/id columns.
|
|
89
89
|
"""
|
|
@@ -116,3 +116,4 @@ class FeatureConfig:
|
|
|
116
116
|
if isinstance(value, str):
|
|
117
117
|
return [value]
|
|
118
118
|
return list(value)
|
|
119
|
+
|
|
@@ -19,7 +19,7 @@ from typing import Union, Literal
|
|
|
19
19
|
from torch.utils.data import DataLoader, TensorDataset
|
|
20
20
|
|
|
21
21
|
from nextrec.basic.callback import EarlyStopper
|
|
22
|
-
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature,
|
|
22
|
+
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
|
|
23
23
|
from nextrec.basic.metrics import configure_metrics, evaluate_metrics
|
|
24
24
|
|
|
25
25
|
from nextrec.loss import get_loss_fn, get_loss_kwargs
|
|
@@ -30,7 +30,7 @@ from nextrec.utils import get_optimizer, get_scheduler
|
|
|
30
30
|
from nextrec.basic.session import resolve_save_path, create_session
|
|
31
31
|
|
|
32
32
|
|
|
33
|
-
class BaseModel(
|
|
33
|
+
class BaseModel(FeatureSpecMixin, nn.Module):
|
|
34
34
|
@property
|
|
35
35
|
def model_name(self) -> str:
|
|
36
36
|
raise NotImplementedError
|
|
@@ -18,9 +18,7 @@ from nextrec.data.data_utils import (
|
|
|
18
18
|
read_table,
|
|
19
19
|
load_dataframes,
|
|
20
20
|
)
|
|
21
|
-
from nextrec.basic.features import
|
|
22
|
-
|
|
23
|
-
# For backward compatibility, keep utils accessible
|
|
21
|
+
from nextrec.basic.features import FeatureSpecMixin
|
|
24
22
|
from nextrec.data import data_utils
|
|
25
23
|
|
|
26
24
|
__all__ = [
|
|
@@ -33,6 +31,6 @@ __all__ = [
|
|
|
33
31
|
'iter_file_chunks',
|
|
34
32
|
'read_table',
|
|
35
33
|
'load_dataframes',
|
|
36
|
-
'
|
|
34
|
+
'FeatureSpecMixin',
|
|
37
35
|
'data_utils',
|
|
38
36
|
]
|
|
@@ -17,7 +17,7 @@ from typing import Iterator, Literal, Union, Optional
|
|
|
17
17
|
|
|
18
18
|
from torch.utils.data import DataLoader, TensorDataset, IterableDataset
|
|
19
19
|
from nextrec.data.preprocessor import DataProcessor
|
|
20
|
-
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature,
|
|
20
|
+
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
|
|
21
21
|
|
|
22
22
|
from nextrec.basic.loggers import colorize
|
|
23
23
|
from nextrec.data import (
|
|
@@ -28,7 +28,7 @@ from nextrec.data import (
|
|
|
28
28
|
)
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
class FileDataset(
|
|
31
|
+
class FileDataset(FeatureSpecMixin, IterableDataset):
|
|
32
32
|
"""
|
|
33
33
|
Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
|
|
34
34
|
|
|
@@ -164,7 +164,7 @@ class FileDataset(FeatureConfig, IterableDataset):
|
|
|
164
164
|
)
|
|
165
165
|
|
|
166
166
|
|
|
167
|
-
class RecDataLoader(
|
|
167
|
+
class RecDataLoader(FeatureSpecMixin):
|
|
168
168
|
"""
|
|
169
169
|
Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
|
|
170
170
|
|
|
@@ -31,9 +31,9 @@ from nextrec.data.data_utils import (
|
|
|
31
31
|
default_output_dir,
|
|
32
32
|
)
|
|
33
33
|
from nextrec.basic.session import create_session, resolve_save_path
|
|
34
|
-
from nextrec.basic.features import
|
|
34
|
+
from nextrec.basic.features import FeatureSpecMixin
|
|
35
35
|
|
|
36
|
-
class DataProcessor(
|
|
36
|
+
class DataProcessor(FeatureSpecMixin):
|
|
37
37
|
"""DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
|
|
38
38
|
|
|
39
39
|
Examples:
|
|
@@ -1,12 +1,57 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Date: create on 09/11/2025
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Checkpoint: edit on 24/11/2025
|
|
4
|
+
Author: Yang Zhou,zyaztec@gmail.com
|
|
5
5
|
Reference:
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
6
|
+
[1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
|
|
7
|
+
self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
|
|
8
|
+
on information and knowledge management. 2019: 1161-1170.
|
|
9
|
+
(https://arxiv.org/abs/1810.11921)
|
|
10
|
+
|
|
11
|
+
AutoInt is a CTR prediction model that leverages multi-head self-attention
|
|
12
|
+
to automatically learn high-order feature interactions in an explicit and
|
|
13
|
+
interpretable way. Instead of relying on manual feature engineering or
|
|
14
|
+
implicit MLP-based transformations, AutoInt models feature dependencies
|
|
15
|
+
by attending over all embedded fields and capturing their contextual
|
|
16
|
+
relationships.
|
|
17
|
+
|
|
18
|
+
In each Interacting Layer:
|
|
19
|
+
(1) Each field embedding is projected into multiple attention heads
|
|
20
|
+
(2) Scaled dot-product attention computes feature-to-feature interactions
|
|
21
|
+
(3) Outputs are aggregated and passed through residual connections
|
|
22
|
+
(4) Layer Normalization ensures stable optimization
|
|
23
|
+
|
|
24
|
+
By stacking multiple Interacting Layers, AutoInt progressively discovers
|
|
25
|
+
higher-order feature interactions, while maintaining transparency since
|
|
26
|
+
attention weights explicitly show which features interact.
|
|
27
|
+
|
|
28
|
+
Key Advantages:
|
|
29
|
+
- Explicit modeling of high-order feature interactions
|
|
30
|
+
- Multi-head attention enhances representation diversity
|
|
31
|
+
- Residual structure facilitates deep interaction learning
|
|
32
|
+
- Attention weights provide interpretability of feature relations
|
|
33
|
+
- Eliminates heavy manual feature engineering
|
|
34
|
+
|
|
35
|
+
AutoInt 是一个 CTR 预估模型,通过多头自注意力机制显式学习高阶特征交互,
|
|
36
|
+
并具有良好的可解释性。不同于依赖人工特征工程或 MLP 隐式建模的方法,
|
|
37
|
+
AutoInt 通过对所有特征 embedding 进行注意力计算,捕捉特征之间的上下文依赖关系。
|
|
38
|
+
|
|
39
|
+
在每个 Interacting Layer(交互层)中:
|
|
40
|
+
(1) 每个特征 embedding 通过投影分成多个注意力头
|
|
41
|
+
(2) 使用缩放点积注意力计算特征间交互权重
|
|
42
|
+
(3) 将多头输出进行聚合,并使用残差连接
|
|
43
|
+
(4) Layer Normalization 确保训练稳定性
|
|
44
|
+
|
|
45
|
+
通过堆叠多个交互层,AutoInt 能逐步学习更高阶的特征交互;
|
|
46
|
+
同时由于注意力权重可视化,模型具有明确的可解释能力,
|
|
47
|
+
能展示哪些特征之间的关系最重要。
|
|
48
|
+
|
|
49
|
+
主要优点:
|
|
50
|
+
- 显式建模高阶特征交互
|
|
51
|
+
- 多头机制增强表示能力
|
|
52
|
+
- 残差结构支持深层交互学习
|
|
53
|
+
- 注意力权重天然具备可解释性
|
|
54
|
+
- 减少繁重的人工特征工程工作
|
|
10
55
|
"""
|
|
11
56
|
|
|
12
57
|
import torch
|
|
@@ -80,7 +125,6 @@ class AutoInt(BaseModel):
|
|
|
80
125
|
|
|
81
126
|
# Project embeddings to attention embedding dimension
|
|
82
127
|
num_fields = len(self.interaction_features)
|
|
83
|
-
total_embedding_dim = sum([f.embedding_dim for f in self.interaction_features])
|
|
84
128
|
|
|
85
129
|
# If embeddings have different dimensions, project them to att_embedding_dim
|
|
86
130
|
self.need_projection = not all(f.embedding_dim == att_embedding_dim for f in self.interaction_features)
|