nextrec 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. {nextrec-0.2.2 → nextrec-0.2.4}/.github/workflows/publish.yml +2 -2
  2. {nextrec-0.2.2 → nextrec-0.2.4}/.gitignore +1 -1
  3. {nextrec-0.2.2 → nextrec-0.2.4}/.readthedocs.yaml +3 -3
  4. {nextrec-0.2.2 → nextrec-0.2.4}/PKG-INFO +2 -2
  5. {nextrec-0.2.2 → nextrec-0.2.4}/README.md +1 -1
  6. {nextrec-0.2.2 → nextrec-0.2.4}/README_zh.md +1 -1
  7. nextrec-0.2.4/docs/rtd/conf.py +39 -0
  8. nextrec-0.2.4/docs/rtd/index.md +157 -0
  9. nextrec-0.2.4/docs/rtd/requirements.txt +3 -0
  10. nextrec-0.2.4/docs/zh/快速上手.md +97 -0
  11. nextrec-0.2.4/nextrec/__version__.py +1 -0
  12. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/features.py +2 -1
  13. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/model.py +2 -2
  14. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/__init__.py +2 -4
  15. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/dataloader.py +3 -3
  16. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/preprocessor.py +2 -2
  17. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/autoint.py +51 -7
  18. nextrec-0.2.4/nextrec/models/ranking/masknet.py +319 -0
  19. {nextrec-0.2.2 → nextrec-0.2.4}/pyproject.toml +1 -1
  20. {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/movielen_ranking_deepfm.py +6 -2
  21. nextrec-0.2.2/docs/conf.py +0 -42
  22. nextrec-0.2.2/docs/index.rst +0 -172
  23. nextrec-0.2.2/docs/requirements.txt +0 -2
  24. nextrec-0.2.2/nextrec/__version__.py +0 -1
  25. nextrec-0.2.2/nextrec/models/ranking/masknet.py +0 -127
  26. {nextrec-0.2.2 → nextrec-0.2.4}/.github/workflows/tests.yml +0 -0
  27. {nextrec-0.2.2 → nextrec-0.2.4}/CODE_OF_CONDUCT.md +0 -0
  28. {nextrec-0.2.2 → nextrec-0.2.4}/CONTRIBUTING.md +0 -0
  29. {nextrec-0.2.2 → nextrec-0.2.4}/LICENSE +0 -0
  30. {nextrec-0.2.2 → nextrec-0.2.4}/MANIFEST.in +0 -0
  31. {nextrec-0.2.2 → nextrec-0.2.4}/dataset/match_task.csv +0 -0
  32. {nextrec-0.2.2 → nextrec-0.2.4}/dataset/movielens_100k.csv +0 -0
  33. {nextrec-0.2.2 → nextrec-0.2.4}/dataset/multitask_task.csv +0 -0
  34. {nextrec-0.2.2 → nextrec-0.2.4}/dataset/ranking_task.csv +0 -0
  35. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/Makefile +0 -0
  36. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/make.bat +0 -0
  37. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/modules.rst +0 -0
  38. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.basic.rst +0 -0
  39. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.data.rst +0 -0
  40. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.loss.rst +0 -0
  41. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.rst +0 -0
  42. {nextrec-0.2.2/docs → nextrec-0.2.4/docs/rtd}/nextrec.utils.rst +0 -0
  43. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/__init__.py +0 -0
  44. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/__init__.py +0 -0
  45. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/activation.py +0 -0
  46. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/callback.py +0 -0
  47. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/layers.py +0 -0
  48. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/loggers.py +0 -0
  49. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/metrics.py +0 -0
  50. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/basic/session.py +0 -0
  51. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/data/data_utils.py +0 -0
  52. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/__init__.py +0 -0
  53. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/listwise.py +0 -0
  54. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/loss_utils.py +0 -0
  55. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/pairwise.py +0 -0
  56. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/loss/pointwise.py +0 -0
  57. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/generative/hstu.py +0 -0
  58. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/generative/tiger.py +0 -0
  59. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/__init__.py +0 -0
  60. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/dssm.py +0 -0
  61. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/dssm_v2.py +0 -0
  62. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/mind.py +0 -0
  63. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/sdm.py +0 -0
  64. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/match/youtube_dnn.py +0 -0
  65. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/esmm.py +0 -0
  66. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/mmoe.py +0 -0
  67. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/ple.py +0 -0
  68. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/multi_task/share_bottom.py +0 -0
  69. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/__init__.py +0 -0
  70. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/afm.py +0 -0
  71. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/dcn.py +0 -0
  72. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/deepfm.py +0 -0
  73. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/dien.py +0 -0
  74. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/din.py +0 -0
  75. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/fibinet.py +0 -0
  76. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/fm.py +0 -0
  77. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/pnn.py +0 -0
  78. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/widedeep.py +0 -0
  79. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/models/ranking/xdeepfm.py +0 -0
  80. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/__init__.py +0 -0
  81. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/embedding.py +0 -0
  82. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/initializer.py +0 -0
  83. {nextrec-0.2.2 → nextrec-0.2.4}/nextrec/utils/optimizer.py +0 -0
  84. {nextrec-0.2.2 → nextrec-0.2.4}/pytest.ini +0 -0
  85. {nextrec-0.2.2 → nextrec-0.2.4}/requirements.txt +0 -0
  86. {nextrec-0.2.2 → nextrec-0.2.4}/test/__init__.py +0 -0
  87. {nextrec-0.2.2 → nextrec-0.2.4}/test/conftest.py +0 -0
  88. {nextrec-0.2.2 → nextrec-0.2.4}/test/run_tests.py +0 -0
  89. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_data_preprocessor.py +0 -0
  90. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_dataloader.py +0 -0
  91. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_layers.py +0 -0
  92. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_losses.py +0 -0
  93. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_match_models.py +0 -0
  94. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_multitask_models.py +0 -0
  95. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_ranking_models.py +0 -0
  96. {nextrec-0.2.2 → nextrec-0.2.4}/test/test_utils.py +0 -0
  97. {nextrec-0.2.2 → nextrec-0.2.4}/test_requirements.txt +0 -0
  98. {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_match_dssm.py +0 -0
  99. {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_multitask.py +0 -0
  100. {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/example_ranking_din.py +0 -0
  101. {nextrec-0.2.2 → nextrec-0.2.4}/tutorials/movielen_match_dssm.py +0 -0
@@ -8,7 +8,7 @@ on:
  workflow_dispatch:
 
  jobs:
- # dev 分支 -> TestPyPI
+ # dev -> TestPyPI
  publish-to-testpypi:
  if: github.ref == 'refs/heads/dev'
  runs-on: ubuntu-latest
@@ -36,7 +36,7 @@ jobs:
  run: |
  twine upload --verbose --repository testpypi dist/*
 
- # main 分支 -> 正式 PyPI
+ # main -> PyPI
  publish-to-pypi:
  if: github.ref == 'refs/heads/main'
  runs-on: ubuntu-latest
@@ -127,4 +127,4 @@ session/
  pypirc.template
 
  # Sphinx build
- docs/_build/
+ docs/rtd/_build/
@@ -12,12 +12,12 @@ build:
 
  # Build documentation in the "docs/" directory with Sphinx
  sphinx:
- configuration: docs/conf.py
+ configuration: docs/rtd/conf.py
 
  # Optionally, but recommended,
  # declare the Python requirements required to build your documentation
  # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
  python:
  install:
- - requirements: docs/requirements.txt
-
+ - requirements: docs/rtd/requirements.txt
+
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nextrec
- Version: 0.2.2
+ Version: 0.2.4
  Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
  Project-URL: Homepage, https://github.com/zerolovesea/NextRec
  Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -61,7 +61,7 @@ Description-Content-Type: text/markdown
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
- ![Version](https://img.shields.io/badge/Version-0.2.2-orange.svg)
+ ![Version](https://img.shields.io/badge/Version-0.2.4-orange.svg)
 
  English | [中文版](README_zh.md)
 
@@ -5,7 +5,7 @@
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
- ![Version](https://img.shields.io/badge/Version-0.2.2-orange.svg)
+ ![Version](https://img.shields.io/badge/Version-0.2.4-orange.svg)
 
  English | [中文版](README_zh.md)
 
@@ -5,7 +5,7 @@
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
- ![Version](https://img.shields.io/badge/Version-0.2.2-orange.svg)
+ ![Version](https://img.shields.io/badge/Version-0.2.4-orange.svg)
 
  [English Version](README.md) | 中文版
 
@@ -0,0 +1,39 @@
+ """Sphinx configuration for building docs on Read the Docs."""
+
+ from __future__ import annotations
+
+ import sys
+ from pathlib import Path
+
+ PROJECT_ROOT = Path(__file__).resolve().parents[2]
+ sys.path.insert(0, str(PROJECT_ROOT / "nextrec"))
+
+ project = "NextRec"
+ copyright = "2025, Yang Zhou"
+ author = "Yang Zhou"
+ release = "0.2.4"
+
+ extensions = [
+     "myst_parser",
+     "sphinx.ext.autodoc",
+     "sphinx.ext.napoleon",
+     "sphinx_rtd_theme",
+ ]
+
+ source_suffix = {
+     ".rst": "restructuredtext",
+     ".md": "markdown",
+ }
+
+ templates_path = ["_templates"]
+ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+ html_theme = "sphinx_rtd_theme"
+ html_static_path = ["_static"]
+
+ autodoc_default_options = {
+     "members": True,
+     "undoc-members": True,
+     "special-members": "__init__, __iter__",
+     "private-members": True,
+ }
@@ -0,0 +1,157 @@
+ # NextRec Documentation
+
+ NextRec is a unified recommendation framework built on PyTorch. It offers modular feature definitions, a reproducible data processing pipeline, and a standard training engine that already powers ranking, retrieval, multi-task, and emerging generative recommendation models.
+
+ ## What you get
+ - Unified interface for ranking, retrieval, multi-task, and early generative recommenders (TIGER, HSTU in progress).
+ - Ready-to-use feature abstractions: `DenseFeature`, `SparseFeature`, `SequenceFeature`.
+ - End-to-end training loop with `compile`, `fit`, `evaluate`, `predict`, checkpoints, metrics, and early stopping.
+ - DataProcessor for repeatable numeric/sparse/sequence/target handling with save/load support.
+ - GPU/MPS ready; tutorials and runnable scripts under `tutorials/`.
+
+ ## Installation
+ Using uv (recommended):
+
+ ```bash
+ git clone https://github.com/zerolovesea/NextRec.git
+ cd NextRec
+ pip install uv
+ uv sync
+ source .venv/bin/activate
+ uv pip install -e .
+ ```
+
+ Using pip:
+
+ ```bash
+ git clone https://github.com/zerolovesea/NextRec.git
+ cd NextRec
+ pip install -r requirements.txt
+ pip install -r test_requirements.txt
+ pip install -e .
+ ```
+
+ ## 5-minute quick start (DeepFM)
+ Train and predict on MovieLens-style data:
+
+ ```python
+ import pandas as pd
+ from nextrec.models.ranking.deepfm import DeepFM
+ from nextrec.basic.features import DenseFeature, SparseFeature
+
+ df = pd.read_csv("dataset/movielens_100k.csv")
+
+ dense_features = [DenseFeature("age")]
+ sparse_features = [
+     SparseFeature("user_id", vocab_size=df["user_id"].max() + 1, embedding_dim=4),
+     SparseFeature("item_id", vocab_size=df["item_id"].max() + 1, embedding_dim=4),
+     SparseFeature("gender", vocab_size=df["gender"].max() + 1, embedding_dim=4),
+     SparseFeature("occupation", vocab_size=df["occupation"].max() + 1, embedding_dim=4),
+ ]
+
+ model = DeepFM(
+     dense_features=dense_features,
+     sparse_features=sparse_features,
+     target="label",
+     device="cpu",
+     session_id="deepfm_demo",
+ )
+
+ model.compile(
+     optimizer="adam",
+     optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
+     loss="bce",
+ )
+
+ model.fit(
+     train_data=df,
+     metrics=["auc", "recall", "precision"],
+     epochs=5,
+     batch_size=512,
+     shuffle=True,
+     verbose=1,
+     validation_split=0.1,
+ )
+
+ preds = model.predict(df)
+ print(preds[:5])
+ ```
+
+ ## Core API guide
+ Feature definitions (`nextrec.basic.features`):
+
+ - `DenseFeature(name, embedding_dim=1)` for continuous values.
+ - `SparseFeature(name, vocab_size, embedding_dim=auto, padding_idx=None, l1_reg=0.0, l2_reg=1e-5, trainable=True)` for categorical ids.
+ - `SequenceFeature(name, vocab_size, max_len=20, combiner="mean", padding_idx=None, l1_reg=0.0, l2_reg=1e-5, trainable=True)` for histories with pooling.
+
+ Data processing (`nextrec.data.preprocessor.DataProcessor`):
+
+ ```python
+ from nextrec.data.preprocessor import DataProcessor
+
+ processor = DataProcessor()
+ processor.add_numeric_feature("age", scaler="standard")
+ processor.add_sparse_feature("user_id", encode_method="label")
+ processor.add_sequence_feature("item_history", encode_method="hash", hash_size=5000, max_len=50, pad_value=0)
+ processor.add_target("label", target_type="binary")
+
+ processor.fit(train_df)                         # learns scalers/encoders
+ train_arr = processor.transform(train_df)       # dict -> numpy arrays
+ vocab_sizes = processor.get_vocab_sizes()       # useful for embedding dims
+ processor.save("processor.pkl")                 # persist for serving
+ processor = DataProcessor.load("processor.pkl")
+ ```
+
+ ## Training workflow (`nextrec.basic.model.BaseModel` interface)
+
+ ```python
+ model.compile(
+     optimizer="adam",                  # str, class, or instance
+     optimizer_params={"lr": 1e-3},
+     scheduler="steplr",                # optional torch scheduler name/class/instance
+     scheduler_params={"step_size": 3, "gamma": 0.5},
+     loss="bce",                        # per-task loss or list
+ )
+
+ model.fit(
+     train_data=train_df_or_loader,     # dict, DataFrame, or DataLoader
+     valid_data=valid_df_or_loader,     # optional validation split
+     metrics=["auc", "logloss"],        # or {"label": ["auc", "logloss"]}
+     epochs=10,
+     batch_size=256,
+     shuffle=True,
+     verbose=1,
+     validation_split=0.1,              # auto split when valid_data is None
+ )
+
+ scores = model.evaluate(valid_df_or_loader)    # returns metric dict
+ preds = model.predict(test_df_or_loader)       # numpy array or dict
+ model.save_weights("checkpoint.model")
+ model.load_weights("checkpoint.model", map_location="cpu")
+ ```
+
+ ## Model zoo (`nextrec.models`)
+ - Ranking: FM, AFM, DeepFM, Wide&Deep, xDeepFM, FiBiNET, PNN, AutoInt, DCN, DIN, DIEN, MaskNet.
+ - Retrieval: DSSM, DSSM v2 (pairwise), YouTube DNN, MIND, SDM.
+ - Multi-task: MMOE, PLE, ESMM, ShareBottom.
+ - Generative (in progress): TIGER, HSTU.
+
+ ## Tutorials and scripts
+ - Ready-to-run examples live in `tutorials/` (e.g., `movielen_ranking_deepfm.py`, `example_multitask.py`).
+ - Datasets used in samples live in `dataset/`. Check `README.md` and `README_zh.md` for dataset prep and more examples.
+
+ ## Contents
+
+ ```{toctree}
+ :maxdepth: 2
+ :caption: Contents
+
+ modules
+ ```
+
+ ## API reference stub
+
+ ```{automodule} nextrec
+ :members:
+ :noindex:
+ ```
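
The feature signatures quoted in the Core API guide above can be exercised directly. The following is a minimal sketch based only on those documented signatures; argument names and defaults are taken from the new index.md and are not independently verified against the package.

```python
# Sketch: declaring one feature of each type, following the signatures listed in docs/rtd/index.md.
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature

dense_features = [DenseFeature("age")]  # continuous value
sparse_features = [SparseFeature("item_id", vocab_size=5000, embedding_dim=16)]  # categorical id
sequence_features = [
    # variable-length click history, mean-pooled into a fixed-length vector
    SequenceFeature("item_history", vocab_size=5000, max_len=50, combiner="mean"),
]
```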
@@ -0,0 +1,3 @@
+ sphinx-autodoc-typehints
+ sphinx_rtd_theme
+ myst-parser
@@ -0,0 +1,97 @@
+ > This guide shows how to train and build a production-ready recommendation model with NextRec from scratch. The examples are based on the bundled `dataset/movielens_100k.csv` and `dataset/match_task.csv`.
+
+ ## 1. Environment and data preparation
+
+ - Requirements: Python 3.10+, PyTorch 1.10+.
+ - Installation: `pip install nextrec` (or `pip install -e .` from the repository root for a development install).
+ - Data format: CSV or Parquet, typically containing user features, item features, behavior sequences, and a supervision label (e.g. `label`, `click`).
+
+ ## 2. About features
+
+ Before getting started, a few recommendation-system concepts. A recommender typically handles several types of input signals, which are transformed into vectors before being fed into the network:
+
+ - Dense features (numeric): continuous or ordinal values such as age, price, duration, or rating; the usual treatment is standardization/normalization or a log transform.
+ - Sparse features (categorical/ID): high-cardinality discrete fields such as user ID, item ID, gender, occupation, or device type; they are typically indexed and then embedded via an embedding lookup matrix.
+ - Sequence features (behavior sequences): variable-length histories such as a user's browse/click/purchase lists. They capture how a user's behavior and interests evolve; they are usually truncated, padded, embedded, and then pooled into a fixed-length vector (e.g. mean/sum/attention).
+ - Context features: environmental information such as time, location, and exposure position; they can be dense or sparse and often interact with the main features.
+ - Multimodal features: vectors produced by pretrained models from text, images, or video; they can be used directly as dense inputs or modeled jointly with IDs.
+
+ A standard training data layout looks like this:
+
+ ```text
+ user_id,item_id,gender,age,occupation,history_seq,label
+ 1024,501,1,28,3,"[12,45,18,77]",1
+ 2048,777,0,35,5,"[8,99]",0
+ ```
+
+ ## 3. Train a ranking model (DeepFM)
+
+ Next, we use a simple model to show how to train DeepFM on the MovieLens dataset with NextRec. The first step is to define the features.
+
+ For sparse features we specify the vocabulary size `vocab_size`, the embedding size `embedding_dim`, and the embedding id `embedding_name`; for dense features we specify whether a linear transform is applied and its output dimension.
+
+ ```python
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+
+ from nextrec.basic.features import DenseFeature, SparseFeature
+ from nextrec.models.ranking.deepfm import DeepFM
+
+ df = pd.read_csv("dataset/movielens_100k.csv")
+
+ dense_features = [DenseFeature('age')]
+ sparse_features = [
+     SparseFeature('user_id', vocab_size=df['user_id'].max() + 1, embedding_dim=16),
+     SparseFeature('item_id', vocab_size=df['item_id'].max() + 1, embedding_dim=16),
+     SparseFeature('gender', vocab_size=df['gender'].max() + 1, embedding_dim=4),
+     SparseFeature('occupation', vocab_size=df['occupation'].max() + 1, embedding_dim=8),
+ ]
+
+ train_df, valid_df = train_test_split(df, test_size=0.2, random_state=2024)
+ ```
+
+ After defining the features, we only need to instantiate the model and configure its training parameters. During training the model assembles the dataloader internally, runs the training loop, and reports the requested metrics.
+
+ ```python
+ model = DeepFM(
+     dense_features=dense_features,
+     sparse_features=sparse_features,
+     mlp_params={"dims": [256, 128], "activation": "relu", "dropout": 0.2},
+     target='label',
+     device='cpu',
+     session_id="movielens_deepfm"  # use a session id to organize logs from different experiments
+ )
+
+ model.compile(
+     optimizer="adam",
+     optimizer_params={"lr": 1e-3, "weight_decay": 1e-5},
+     loss='binary_crossentropy',
+ )
+
+ model.fit(
+     train_data=train_df,
+     valid_data=valid_df,
+     metrics=['auc', 'recall', 'precision'],
+     epochs=1,
+     batch_size=512,
+     shuffle=True
+ )
+ ```
+
+ - `metrics` supports `auc`/`logloss`/`accuracy`/`gauc`, among others; when using GAUC, pass `user_id_column='user_id'`.
+ - Training applies early stopping automatically and saves the best weights in the directory for the given `session_id`.
+
+ ## 4. Inference and evaluation
+
+ After training, you can run batch prediction. NextRec accepts several inference data formats, including CSV, Parquet, file paths, dicts, DataFrames, and compatible DataLoaders.
+
+ ```python
+ # batch prediction
+ preds = model.predict(valid_df, batch_size=512)
+ ```
+
+ - Save predictions: `model.predict(..., save_path="outputs/preds", save_format="csv")`.
+ - Evaluation: `model.evaluate(valid_df, metrics=['auc', 'gauc'], user_id_column='user_id')`.
+
+
+ You have now completed the full workflow from data preprocessing to training, evaluation, saving, and loading; swap in your own data and model configuration to quickly build a production-ready recommender.
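
The quick-start code above feeds raw columns straight into the model and leaves the `history_seq` column from the sample layout unused. Below is a hedged sketch of how that layout could be prepared with the `DataProcessor` API documented in docs/rtd/index.md; column names follow the sample CSV, and the scaler/encoder options are assumed from that doc rather than verified against the package internals.

```python
# Sketch: preprocessing the sample layout from section 2 with DataProcessor
# (API as documented in docs/rtd/index.md; illustrative only).
import pandas as pd
from nextrec.data.preprocessor import DataProcessor

df = pd.DataFrame({
    "user_id": [1024, 2048],
    "item_id": [501, 777],
    "gender": [1, 0],
    "age": [28, 35],
    "occupation": [3, 5],
    "history_seq": [[12, 45, 18, 77], [8, 99]],
    "label": [1, 0],
})

processor = DataProcessor()
processor.add_numeric_feature("age", scaler="standard")
for col in ["user_id", "item_id", "gender", "occupation"]:
    processor.add_sparse_feature(col, encode_method="label")
processor.add_sequence_feature("history_seq", encode_method="hash", hash_size=5000, max_len=20, pad_value=0)
processor.add_target("label", target_type="binary")

processor.fit(df)                          # learn scalers/encoders
arrays = processor.transform(df)           # dict of numpy arrays keyed by feature name
vocab_sizes = processor.get_vocab_sizes()  # feeds SparseFeature/SequenceFeature vocab_size
```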
@@ -0,0 +1 @@
+ __version__ = "0.2.4"
@@ -83,7 +83,7 @@ class DenseFeature(BaseFeature):
  self.embedding_dim = embedding_dim
 
 
- class FeatureConfig:
+ class FeatureSpecMixin:
  """
  Mixin that normalizes dense/sparse/sequence feature lists and target/id columns.
  """
@@ -116,3 +116,4 @@ class FeatureConfig:
  if isinstance(value, str):
  return [value]
  return list(value)
+
@@ -19,7 +19,7 @@ from typing import Union, Literal
  from torch.utils.data import DataLoader, TensorDataset
 
  from nextrec.basic.callback import EarlyStopper
- from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureConfig
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
  from nextrec.basic.metrics import configure_metrics, evaluate_metrics
 
  from nextrec.loss import get_loss_fn, get_loss_kwargs
@@ -30,7 +30,7 @@ from nextrec.utils import get_optimizer, get_scheduler
  from nextrec.basic.session import resolve_save_path, create_session
 
 
- class BaseModel(FeatureConfig, nn.Module):
+ class BaseModel(FeatureSpecMixin, nn.Module):
  @property
  def model_name(self) -> str:
  raise NotImplementedError
@@ -18,9 +18,7 @@ from nextrec.data.data_utils import (
  read_table,
  load_dataframes,
  )
- from nextrec.basic.features import FeatureConfig
-
- # For backward compatibility, keep utils accessible
+ from nextrec.basic.features import FeatureSpecMixin
  from nextrec.data import data_utils
 
  __all__ = [
@@ -33,6 +31,6 @@ __all__ = [
  'iter_file_chunks',
  'read_table',
  'load_dataframes',
- 'FeatureConfig',
+ 'FeatureSpecMixin',
  'data_utils',
  ]
@@ -17,7 +17,7 @@ from typing import Iterator, Literal, Union, Optional
 
  from torch.utils.data import DataLoader, TensorDataset, IterableDataset
  from nextrec.data.preprocessor import DataProcessor
- from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureConfig
+ from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSpecMixin
 
  from nextrec.basic.loggers import colorize
  from nextrec.data import (
@@ -28,7 +28,7 @@ from nextrec.data import (
  )
 
 
- class FileDataset(FeatureConfig, IterableDataset):
+ class FileDataset(FeatureSpecMixin, IterableDataset):
  """
  Iterable dataset that streams CSV/Parquet files in chunks and yields tensor tuples.
 
@@ -164,7 +164,7 @@ class FileDataset(FeatureConfig, IterableDataset):
  )
 
 
- class RecDataLoader(FeatureConfig):
+ class RecDataLoader(FeatureSpecMixin):
  """
  Convenience wrapper for building PyTorch ``DataLoader`` objects for recommendation models.
 
@@ -31,9 +31,9 @@ from nextrec.data.data_utils import (
  default_output_dir,
  )
  from nextrec.basic.session import create_session, resolve_save_path
- from nextrec.basic.features import FeatureConfig
+ from nextrec.basic.features import FeatureSpecMixin
 
- class DataProcessor(FeatureConfig):
+ class DataProcessor(FeatureSpecMixin):
  """DataProcessor for data preprocessing including numeric, sparse, sequence features and target processing.
 
  Examples:
@@ -1,12 +1,57 @@
  """
  Date: create on 09/11/2025
- Author:
- Yang Zhou,zyaztec@gmail.com
+ Checkpoint: edit on 24/11/2025
+ Author: Yang Zhou,zyaztec@gmail.com
  Reference:
- [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
- self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
- on information and knowledge management. 2019: 1161-1170.
- (https://arxiv.org/abs/1810.11921)
+ [1] Song W, Shi C, Xiao Z, et al. Autoint: Automatic feature interaction learning via
+ self-attentive neural networks[C]//Proceedings of the 28th ACM international conference
+ on information and knowledge management. 2019: 1161-1170.
+ (https://arxiv.org/abs/1810.11921)
+
+ AutoInt is a CTR prediction model that leverages multi-head self-attention
+ to automatically learn high-order feature interactions in an explicit and
+ interpretable way. Instead of relying on manual feature engineering or
+ implicit MLP-based transformations, AutoInt models feature dependencies
+ by attending over all embedded fields and capturing their contextual
+ relationships.
+
+ In each Interacting Layer:
+ (1) Each field embedding is projected into multiple attention heads
+ (2) Scaled dot-product attention computes feature-to-feature interactions
+ (3) Outputs are aggregated and passed through residual connections
+ (4) Layer Normalization ensures stable optimization
+
+ By stacking multiple Interacting Layers, AutoInt progressively discovers
+ higher-order feature interactions, while maintaining transparency since
+ attention weights explicitly show which features interact.
+
+ Key Advantages:
+ - Explicit modeling of high-order feature interactions
+ - Multi-head attention enhances representation diversity
+ - Residual structure facilitates deep interaction learning
+ - Attention weights provide interpretability of feature relations
+ - Eliminates heavy manual feature engineering
+
+ AutoInt 是一个 CTR 预估模型,通过多头自注意力机制显式学习高阶特征交互,
+ 并具有良好的可解释性。不同于依赖人工特征工程或 MLP 隐式建模的方法,
+ AutoInt 通过对所有特征 embedding 进行注意力计算,捕捉特征之间的上下文依赖关系。
+
+ 在每个 Interacting Layer(交互层)中:
+ (1) 每个特征 embedding 通过投影分成多个注意力头
+ (2) 使用缩放点积注意力计算特征间交互权重
+ (3) 将多头输出进行聚合,并使用残差连接
+ (4) Layer Normalization 确保训练稳定性
+
+ 通过堆叠多个交互层,AutoInt 能逐步学习更高阶的特征交互;
+ 同时由于注意力权重可视化,模型具有明确的可解释能力,
+ 能展示哪些特征之间的关系最重要。
+
+ 主要优点:
+ - 显式建模高阶特征交互
+ - 多头机制增强表示能力
+ - 残差结构支持深层交互学习
+ - 注意力权重天然具备可解释性
+ - 减少繁重的人工特征工程工作
  """
 
  import torch
@@ -80,7 +125,6 @@ class AutoInt(BaseModel):
 
  # Project embeddings to attention embedding dimension
  num_fields = len(self.interaction_features)
- total_embedding_dim = sum([f.embedding_dim for f in self.interaction_features])
 
  # If embeddings have different dimensions, project them to att_embedding_dim
  self.need_projection = not all(f.embedding_dim == att_embedding_dim for f in self.interaction_features)
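
To make the Interacting Layer described in the new docstring concrete, here is a minimal self-contained sketch using plain PyTorch: multi-head self-attention over field embeddings, a residual connection, and layer normalization. It only illustrates the mechanism; it is not the AutoInt implementation shipped in nextrec.

```python
# Sketch of one AutoInt-style Interacting Layer: multi-head self-attention over
# field embeddings, then residual connection and LayerNorm (illustrative only).
import torch
import torch.nn as nn


class InteractingLayer(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int = 2):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, fields: torch.Tensor) -> torch.Tensor:
        # fields: (batch, num_fields, embed_dim); each field attends to every other field
        out, _ = self.attn(fields, fields, fields)
        return self.norm(fields + out)  # residual + layer normalization


x = torch.randn(4, 6, 16)             # 4 samples, 6 fields, 16-dim embeddings
print(InteractingLayer(16)(x).shape)  # torch.Size([4, 6, 16])
```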