nextrec 0.3.4__tar.gz → 0.3.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nextrec-0.3.4 → nextrec-0.3.5}/PKG-INFO +3 -3
- {nextrec-0.3.4 → nextrec-0.3.5}/README.md +2 -2
- {nextrec-0.3.4 → nextrec-0.3.5}/README_zh.md +2 -2
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/conf.py +1 -1
- nextrec-0.3.5/nextrec/__version__.py +1 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/features.py +1 -1
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/model.py +4 -2
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/session.py +3 -10
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/__init__.py +47 -9
- nextrec-0.3.5/nextrec/data/batch_utils.py +80 -0
- nextrec-0.3.5/nextrec/data/data_processing.py +152 -0
- nextrec-0.3.5/nextrec/data/data_utils.py +35 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/dataloader.py +4 -2
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/preprocessor.py +6 -16
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/poso.py +1 -1
- nextrec-0.3.5/nextrec/utils/__init__.py +68 -0
- nextrec-0.3.5/nextrec/utils/device.py +37 -0
- nextrec-0.3.5/nextrec/utils/feature.py +13 -0
- nextrec-0.3.5/nextrec/utils/file.py +70 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/initializer.py +0 -8
- nextrec-0.3.5/nextrec/utils/model.py +22 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/optimizer.py +0 -19
- nextrec-0.3.5/nextrec/utils/tensor.py +61 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/pyproject.toml +1 -1
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_match_dssm.py +1 -1
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_multitask.py +1 -1
- nextrec-0.3.4/nextrec/__version__.py +0 -1
- nextrec-0.3.4/nextrec/data/data_utils.py +0 -268
- nextrec-0.3.4/nextrec/utils/__init__.py +0 -18
- nextrec-0.3.4/nextrec/utils/common.py +0 -60
- {nextrec-0.3.4 → nextrec-0.3.5}/.github/workflows/publish.yml +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/.github/workflows/tests.yml +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/.gitignore +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/.readthedocs.yaml +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/CODE_OF_CONDUCT.md +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/CONTRIBUTING.md +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/LICENSE +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/MANIFEST.in +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Feature Configuration.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Model Parameters.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Training Configuration.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Training logs.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/logo.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/mmoe_tutorial.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/nextrec_diagram_en.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/nextrec_diagram_zh.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/asserts/test data.png +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/dataset/ctcvr_task.csv +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/dataset/match_task.csv +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/dataset/movielens_100k.csv +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/dataset/multitask_task.csv +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/dataset/ranking_task.csv +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/en/Getting started guide.md +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/Makefile +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/index.md +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/make.bat +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/modules.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.basic.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.data.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.loss.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.utils.rst +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/requirements.txt +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/docs/zh/快速上手.md +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/activation.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/callback.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/layers.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/loggers.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/metrics.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/listwise.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/loss_utils.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/pairwise.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/pointwise.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/hstu.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/tiger.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/dssm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/dssm_v2.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/mind.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/sdm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/youtube_dnn.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/esmm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/mmoe.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/ple.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/share_bottom.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/afm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/autoint.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dcn.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dcn_v2.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/deepfm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dien.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/din.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/fibinet.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/fm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/masknet.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/pnn.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/widedeep.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/xdeepfm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/embedding.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/pytest.ini +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/requirements.txt +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/__init__.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/conftest.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/run_tests.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_layers.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_losses.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_match_models.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_multitask_models.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_preprocessor.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_ranking_models.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test/test_utils.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/test_requirements.txt +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_ranking_din.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/movielen_match_dssm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/movielen_ranking_deepfm.py +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/en/Hands on dataprocessor.ipynb +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/en/Hands on nextrec.ipynb +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/zh/Hands on dataprocessor.ipynb +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/zh/Hands on nextrec.ipynb +0 -0
- {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/run_all_tutorials.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nextrec
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.3.5
|
|
4
4
|
Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
|
|
5
5
|
Project-URL: Homepage, https://github.com/zerolovesea/NextRec
|
|
6
6
|
Project-URL: Repository, https://github.com/zerolovesea/NextRec
|
|
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
|
|
|
63
63
|

|
|
64
64
|

|
|
65
65
|

|
|
66
|
-

|
|
67
67
|
|
|
68
68
|
English | [中文文档](README_zh.md)
|
|
69
69
|
|
|
@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
|
|
|
110
110
|
- [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
|
|
111
111
|
- [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
|
|
112
112
|
|
|
113
|
-
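> Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.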
|
|
113
|
+
> Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
|
|
114
114
|
|
|
115
115
|
## 5-Minute Quick Start
|
|
116
116
|
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|

|
|
8
8
|

|
|
9
9
|

|
|
10
|
-

|
|
11
11
|
|
|
12
12
|
English | [中文文档](README_zh.md)
|
|
13
13
|
|
|
@@ -54,7 +54,7 @@ To dive deeper, Jupyter notebooks are available:
|
|
|
54
54
|
- [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
|
|
55
55
|
- [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
|
|
56
56
|
|
|
57
|
-
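> Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.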
|
|
57
|
+
> Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
|
|
58
58
|
|
|
59
59
|
## 5-Minute Quick Start
|
|
60
60
|
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|

|
|
8
8
|

|
|
9
9
|

|
|
10
|
-

|
|
11
11
|
|
|
12
12
|
[English Version](README.md) | 中文文档
|
|
13
13
|
|
|
@@ -54,7 +54,7 @@ NextRec采用模块化、低耦合的工程设计,使得推荐系统从数据
|
|
|
54
54
|
- [如何上手NextRec框架](/tutorials/notebooks/zh/Hands%20on%20nextrec.ipynb)
|
|
55
55
|
- [如何使用数据处理器进行数据预处理](/tutorials/notebooks/zh/Hands%20on%20dataprocessor.ipynb)
|
|
56
56
|
|
|
57
|
-
> 当前版本[0.3.4],召回模型模块尚不完善,可能存在一些兼容性问题或意外报错,如果遇到问题,欢迎开发者在Issue区提出问题。
|
|
57
|
+
> 当前版本[0.3.5],召回模型模块尚不完善,可能存在一些兼容性问题或意外报错,如果遇到问题,欢迎开发者在Issue区提出问题。
|
|
58
58
|
|
|
59
59
|
## 5分钟快速上手
|
|
60
60
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.5"
|
|
@@ -7,7 +7,7 @@ Author: Yang Zhou, zyaztec@gmail.com
|
|
|
7
7
|
"""
|
|
8
8
|
import torch
|
|
9
9
|
from nextrec.utils.embedding import get_auto_embedding_dim
|
|
10
|
-
from nextrec.utils.
|
|
10
|
+
from nextrec.utils.feature import normalize_to_list
|
|
11
11
|
|
|
12
12
|
class BaseFeature(object):
|
|
13
13
|
def __repr__(self):
|
|
@@ -31,10 +31,12 @@ from nextrec.basic.session import resolve_save_path, create_session
|
|
|
31
31
|
from nextrec.basic.metrics import configure_metrics, evaluate_metrics, check_user_id
|
|
32
32
|
|
|
33
33
|
from nextrec.data.dataloader import build_tensors_from_data
|
|
34
|
-
from nextrec.data.
|
|
34
|
+
from nextrec.data.data_processing import get_column_data, get_user_ids
|
|
35
|
+
from nextrec.data.batch_utils import collate_fn, batch_to_dict
|
|
35
36
|
|
|
36
37
|
from nextrec.loss import get_loss_fn, get_loss_kwargs
|
|
37
|
-
from nextrec.utils import get_optimizer, get_scheduler
|
|
38
|
+
from nextrec.utils import get_optimizer, get_scheduler
|
|
39
|
+
from nextrec.utils.tensor import to_tensor
|
|
38
40
|
|
|
39
41
|
from nextrec import __version__
|
|
40
42
|
|
|
@@ -1,14 +1,5 @@
|
|
|
1
1
|
"""Session and experiment utilities.
|
|
2
2
|
|
|
3
|
-
This module centralizes session/experiment management so the rest of the
|
|
4
|
-
framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
|
|
5
|
-
|
|
6
|
-
Within that folder we keep model parameters, checkpoints, training metrics,
|
|
7
|
-
evaluation metrics, and consolidated log output. When users do not provide an
|
|
8
|
-
``experiment_id`` a timestamp-based identifier is generated once per process to
|
|
9
|
-
avoid scattering files across multiple directories. Test runs are redirected to
|
|
10
|
-
temporary folders so local trees are not polluted.
|
|
11
|
-
|
|
12
3
|
Date: create on 23/11/2025
|
|
13
4
|
Author: Yang Zhou,zyaztec@gmail.com
|
|
14
5
|
"""
|
|
@@ -16,7 +7,7 @@ Author: Yang Zhou,zyaztec@gmail.com
|
|
|
16
7
|
import os
|
|
17
8
|
import tempfile
|
|
18
9
|
from dataclasses import dataclass
|
|
19
|
-
from datetime import datetime
|
|
10
|
+
from datetime import datetime, timezone
|
|
20
11
|
from pathlib import Path
|
|
21
12
|
|
|
22
13
|
__all__ = [
|
|
@@ -74,6 +65,7 @@ def create_session(experiment_id: str | Path | None = None) -> Session:
|
|
|
74
65
|
if experiment_id is not None and str(experiment_id).strip():
|
|
75
66
|
exp_id = str(experiment_id).strip()
|
|
76
67
|
else:
|
|
68
|
+
# Use local time for session naming
|
|
77
69
|
exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
|
|
78
70
|
|
|
79
71
|
if (
|
|
@@ -111,6 +103,7 @@ def resolve_save_path(
|
|
|
111
103
|
timestamp.
|
|
112
104
|
- Parent directories are created.
|
|
113
105
|
"""
|
|
106
|
+
# Use local time for file timestamps
|
|
114
107
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None
|
|
115
108
|
|
|
116
109
|
normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
|
|
@@ -1,48 +1,86 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Data utilities package for NextRec
|
|
3
3
|
|
|
4
|
-
This package provides data processing and manipulation utilities
|
|
4
|
+
This package provides data processing and manipulation utilities organized by category:
|
|
5
|
+
- batch_utils: Batch collation and processing
|
|
6
|
+
- data_processing: Data manipulation and user ID extraction
|
|
7
|
+
- data_utils: Legacy module (re-exports from specialized modules)
|
|
8
|
+
- dataloader: Dataset and DataLoader implementations
|
|
9
|
+
- preprocessor: Data preprocessing pipeline
|
|
5
10
|
|
|
6
11
|
Date: create on 13/11/2025
|
|
12
|
+
Last update: 03/12/2025 (refactored)
|
|
7
13
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
8
14
|
"""
|
|
9
15
|
|
|
10
|
-
|
|
11
|
-
|
|
16
|
+
# Batch utilities
|
|
17
|
+
from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
|
|
18
|
+
|
|
19
|
+
# Data processing utilities
|
|
20
|
+
from nextrec.data.data_processing import (
|
|
12
21
|
get_column_data,
|
|
13
|
-
default_output_dir,
|
|
14
22
|
split_dict_random,
|
|
15
23
|
build_eval_candidates,
|
|
24
|
+
get_user_ids,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# File utilities (from utils package)
|
|
28
|
+
from nextrec.utils.file import (
|
|
16
29
|
resolve_file_paths,
|
|
17
30
|
iter_file_chunks,
|
|
18
31
|
read_table,
|
|
19
32
|
load_dataframes,
|
|
33
|
+
default_output_dir,
|
|
20
34
|
)
|
|
21
|
-
|
|
22
|
-
|
|
35
|
+
|
|
36
|
+
# DataLoader components
|
|
23
37
|
from nextrec.data.dataloader import (
|
|
24
38
|
TensorDictDataset,
|
|
25
39
|
FileDataset,
|
|
26
40
|
RecDataLoader,
|
|
27
41
|
build_tensors_from_data,
|
|
28
42
|
)
|
|
43
|
+
|
|
44
|
+
# Preprocessor
|
|
29
45
|
from nextrec.data.preprocessor import DataProcessor
|
|
30
46
|
|
|
47
|
+
# Feature definitions
|
|
48
|
+
from nextrec.basic.features import FeatureSet
|
|
49
|
+
|
|
50
|
+
# Legacy module (for backward compatibility)
|
|
51
|
+
from nextrec.data import data_utils
|
|
52
|
+
|
|
31
53
|
__all__ = [
|
|
54
|
+
# Batch utilities
|
|
32
55
|
'collate_fn',
|
|
56
|
+
'batch_to_dict',
|
|
57
|
+
'stack_section',
|
|
58
|
+
|
|
59
|
+
# Data processing
|
|
33
60
|
'get_column_data',
|
|
34
|
-
'default_output_dir',
|
|
35
61
|
'split_dict_random',
|
|
36
62
|
'build_eval_candidates',
|
|
63
|
+
'get_user_ids',
|
|
64
|
+
|
|
65
|
+
# File utilities
|
|
37
66
|
'resolve_file_paths',
|
|
38
67
|
'iter_file_chunks',
|
|
39
68
|
'read_table',
|
|
40
69
|
'load_dataframes',
|
|
41
|
-
'
|
|
42
|
-
|
|
70
|
+
'default_output_dir',
|
|
71
|
+
|
|
72
|
+
# DataLoader
|
|
43
73
|
'TensorDictDataset',
|
|
44
74
|
'FileDataset',
|
|
45
75
|
'RecDataLoader',
|
|
46
76
|
'build_tensors_from_data',
|
|
77
|
+
|
|
78
|
+
# Preprocessor
|
|
47
79
|
'DataProcessor',
|
|
80
|
+
|
|
81
|
+
# Features
|
|
82
|
+
'FeatureSet',
|
|
83
|
+
|
|
84
|
+
# Legacy module
|
|
85
|
+
'data_utils',
|
|
48
86
|
]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Batch collation utilities for NextRec
|
|
3
|
+
|
|
4
|
+
Date: create on 03/12/2025
|
|
5
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import numpy as np
|
|
10
|
+
from typing import Any, Mapping
|
|
11
|
+
|
|
12
|
+
def stack_section(batch: list[dict], section: str):
|
|
13
|
+
entries = [item.get(section) for item in batch if item.get(section) is not None]
|
|
14
|
+
if not entries:
|
|
15
|
+
return None
|
|
16
|
+
merged: dict = {}
|
|
17
|
+
for name in entries[0]: # type: ignore
|
|
18
|
+
tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
|
|
19
|
+
merged[name] = torch.stack(tensors, dim=0)
|
|
20
|
+
return merged
|
|
21
|
+
|
|
22
|
+
def collate_fn(batch):
|
|
23
|
+
"""
|
|
24
|
+
Collate a list of sample dicts into the unified batch format:
|
|
25
|
+
{
|
|
26
|
+
"features": {name: Tensor(B, ...)},
|
|
27
|
+
"labels": {target: Tensor(B, ...)} or None,
|
|
28
|
+
"ids": {id_name: Tensor(B, ...)} or None,
|
|
29
|
+
}
|
|
30
|
+
Args: batch: List of samples from DataLoader
|
|
31
|
+
|
|
32
|
+
Returns: dict: Batched data in unified format
|
|
33
|
+
"""
|
|
34
|
+
if not batch:
|
|
35
|
+
return {"features": {}, "labels": None, "ids": None}
|
|
36
|
+
|
|
37
|
+
first = batch[0]
|
|
38
|
+
if isinstance(first, dict) and "features" in first:
|
|
39
|
+
# Streaming dataset yields already-batched chunks; avoid adding an extra dim.
|
|
40
|
+
if first.get("_already_batched") and len(batch) == 1:
|
|
41
|
+
return {
|
|
42
|
+
"features": first.get("features", {}),
|
|
43
|
+
"labels": first.get("labels"),
|
|
44
|
+
"ids": first.get("ids"),
|
|
45
|
+
}
|
|
46
|
+
return {
|
|
47
|
+
"features": stack_section(batch, "features") or {},
|
|
48
|
+
"labels": stack_section(batch, "labels"),
|
|
49
|
+
"ids": stack_section(batch, "ids"),
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
# Fallback: stack tuples/lists of tensors
|
|
53
|
+
num_tensors = len(first)
|
|
54
|
+
result = []
|
|
55
|
+
for i in range(num_tensors):
|
|
56
|
+
tensor_list = [item[i] for item in batch]
|
|
57
|
+
first_item = tensor_list[0]
|
|
58
|
+
if isinstance(first_item, torch.Tensor):
|
|
59
|
+
stacked = torch.cat(tensor_list, dim=0)
|
|
60
|
+
elif isinstance(first_item, np.ndarray):
|
|
61
|
+
stacked = np.concatenate(tensor_list, axis=0)
|
|
62
|
+
elif isinstance(first_item, list):
|
|
63
|
+
combined = []
|
|
64
|
+
for entry in tensor_list:
|
|
65
|
+
combined.extend(entry)
|
|
66
|
+
stacked = combined
|
|
67
|
+
else:
|
|
68
|
+
stacked = tensor_list
|
|
69
|
+
result.append(stacked)
|
|
70
|
+
return tuple(result)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
|
|
74
|
+
if not (isinstance(batch_data, Mapping) and "features" in batch_data):
|
|
75
|
+
raise TypeError("[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader.")
|
|
76
|
+
return {
|
|
77
|
+
"features": batch_data.get("features", {}),
|
|
78
|
+
"labels": batch_data.get("labels"),
|
|
79
|
+
"ids": batch_data.get("ids") if include_ids else None,
|
|
80
|
+
}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data processing utilities for NextRec
|
|
3
|
+
|
|
4
|
+
Date: create on 03/12/2025
|
|
5
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from typing import Any, Mapping
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_column_data(data: dict | pd.DataFrame, name: str):
|
|
15
|
+
if isinstance(data, dict):
|
|
16
|
+
return data[name] if name in data else None
|
|
17
|
+
elif isinstance(data, pd.DataFrame):
|
|
18
|
+
if name not in data.columns:
|
|
19
|
+
return None
|
|
20
|
+
return data[name].values
|
|
21
|
+
else:
|
|
22
|
+
if hasattr(data, name):
|
|
23
|
+
return getattr(data, name)
|
|
24
|
+
raise KeyError(f"Unsupported data type for extracting column {name}")
|
|
25
|
+
|
|
26
|
+
def split_dict_random(
|
|
27
|
+
data_dict: dict,
|
|
28
|
+
test_size: float = 0.2,
|
|
29
|
+
random_state: int | None = None
|
|
30
|
+
):
|
|
31
|
+
lengths = [len(v) for v in data_dict.values()]
|
|
32
|
+
if len(set(lengths)) != 1:
|
|
33
|
+
raise ValueError(f"Length mismatch: {lengths}")
|
|
34
|
+
|
|
35
|
+
n = lengths[0]
|
|
36
|
+
rng = np.random.default_rng(random_state)
|
|
37
|
+
perm = rng.permutation(n)
|
|
38
|
+
cut = int(round(n * (1 - test_size)))
|
|
39
|
+
train_idx, test_idx = perm[:cut], perm[cut:]
|
|
40
|
+
|
|
41
|
+
def take(v, idx):
|
|
42
|
+
if isinstance(v, np.ndarray):
|
|
43
|
+
return v[idx]
|
|
44
|
+
elif isinstance(v, pd.Series):
|
|
45
|
+
return v.iloc[idx].to_numpy()
|
|
46
|
+
else:
|
|
47
|
+
v_arr = np.asarray(v, dtype=object)
|
|
48
|
+
return v_arr[idx]
|
|
49
|
+
|
|
50
|
+
train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
|
|
51
|
+
test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
|
|
52
|
+
return train_dict, test_dict
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def build_eval_candidates(
|
|
56
|
+
df_all: pd.DataFrame,
|
|
57
|
+
user_col: str,
|
|
58
|
+
item_col: str,
|
|
59
|
+
label_col: str,
|
|
60
|
+
user_features: pd.DataFrame,
|
|
61
|
+
item_features: pd.DataFrame,
|
|
62
|
+
num_pos_per_user: int = 5,
|
|
63
|
+
num_neg_per_pos: int = 50,
|
|
64
|
+
random_seed: int = 2025,
|
|
65
|
+
) -> pd.DataFrame:
|
|
66
|
+
"""
|
|
67
|
+
Build evaluation candidates with positive and negative samples for each user.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
df_all: Full interaction DataFrame
|
|
71
|
+
user_col: Name of the user ID column
|
|
72
|
+
item_col: Name of the item ID column
|
|
73
|
+
label_col: Name of the label column
|
|
74
|
+
user_features: DataFrame containing user features
|
|
75
|
+
item_features: DataFrame containing item features
|
|
76
|
+
num_pos_per_user: Number of positive samples per user (default: 5)
|
|
77
|
+
num_neg_per_pos: Number of negative samples per positive (default: 50)
|
|
78
|
+
random_seed: Random seed for reproducibility (default: 2025)
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
pd.DataFrame: Evaluation candidates with features
|
|
82
|
+
"""
|
|
83
|
+
rng = np.random.default_rng(random_seed)
|
|
84
|
+
|
|
85
|
+
users = df_all[user_col].unique()
|
|
86
|
+
all_items = item_features[item_col].unique()
|
|
87
|
+
rows = []
|
|
88
|
+
user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}
|
|
89
|
+
|
|
90
|
+
for u in users:
|
|
91
|
+
df_user = df_all[df_all[user_col] == u]
|
|
92
|
+
pos_items = df_user[df_user[label_col] == 1][item_col].unique()
|
|
93
|
+
if len(pos_items) == 0:
|
|
94
|
+
continue
|
|
95
|
+
pos_items = pos_items[:num_pos_per_user]
|
|
96
|
+
seen_items = set(user_hist_items[u])
|
|
97
|
+
neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
|
|
98
|
+
if len(neg_pool) == 0:
|
|
99
|
+
continue
|
|
100
|
+
for pos in pos_items:
|
|
101
|
+
if len(neg_pool) <= num_neg_per_pos:
|
|
102
|
+
neg_items = neg_pool
|
|
103
|
+
else:
|
|
104
|
+
neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
|
|
105
|
+
rows.append((u, pos, 1))
|
|
106
|
+
for ni in neg_items:
|
|
107
|
+
rows.append((u, ni, 0))
|
|
108
|
+
|
|
109
|
+
eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
|
|
110
|
+
eval_df = eval_df.merge(user_features, on=user_col, how='left')
|
|
111
|
+
eval_df = eval_df.merge(item_features, on=item_col, how='left')
|
|
112
|
+
return eval_df
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_user_ids(
|
|
116
|
+
data: Any,
|
|
117
|
+
id_columns: list[str] | str | None = None
|
|
118
|
+
) -> np.ndarray | None:
|
|
119
|
+
"""
|
|
120
|
+
Extract user IDs from various data structures.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
data: Data source (DataFrame, dict, or batch dict)
|
|
124
|
+
id_columns: List or single ID column name(s) (default: None)
|
|
125
|
+
|
|
126
|
+
Returns:
|
|
127
|
+
np.ndarray | None: User IDs as numpy array, or None if not found
|
|
128
|
+
"""
|
|
129
|
+
id_columns = (
|
|
130
|
+
id_columns if isinstance(id_columns, list)
|
|
131
|
+
else [id_columns] if isinstance(id_columns, str)
|
|
132
|
+
else []
|
|
133
|
+
)
|
|
134
|
+
if not id_columns:
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
main_id = id_columns[0]
|
|
138
|
+
if isinstance(data, pd.DataFrame) and main_id in data.columns:
|
|
139
|
+
arr = np.asarray(data[main_id].values)
|
|
140
|
+
return arr.reshape(arr.shape[0])
|
|
141
|
+
|
|
142
|
+
if isinstance(data, dict):
|
|
143
|
+
ids_container = data.get("ids")
|
|
144
|
+
if isinstance(ids_container, dict) and main_id in ids_container:
|
|
145
|
+
val = ids_container[main_id]
|
|
146
|
+
val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
|
|
147
|
+
return val.reshape(val.shape[0])
|
|
148
|
+
if main_id in data:
|
|
149
|
+
arr = np.asarray(data[main_id])
|
|
150
|
+
return arr.reshape(arr.shape[0])
|
|
151
|
+
|
|
152
|
+
return None
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data processing utilities for NextRec (Refactored)
|
|
3
|
+
|
|
4
|
+
This module now re-exports functions from specialized submodules:
|
|
5
|
+
- batch_utils: collate_fn, batch_to_dict
|
|
6
|
+
- data_processing: get_column_data, split_dict_random, build_eval_candidates, get_user_ids
|
|
7
|
+
- nextrec.utils.file_utils: resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
|
|
8
|
+
|
|
9
|
+
Date: create on 27/10/2025
|
|
10
|
+
Last update: 03/12/2025 (refactored)
|
|
11
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
# Import from new organized modules
|
|
15
|
+
from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
|
|
16
|
+
from nextrec.data.data_processing import get_column_data, split_dict_random, build_eval_candidates, get_user_ids
|
|
17
|
+
from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
# Batch utilities
|
|
21
|
+
'collate_fn',
|
|
22
|
+
'batch_to_dict',
|
|
23
|
+
'stack_section',
|
|
24
|
+
# Data processing
|
|
25
|
+
'get_column_data',
|
|
26
|
+
'split_dict_random',
|
|
27
|
+
'build_eval_candidates',
|
|
28
|
+
'get_user_ids',
|
|
29
|
+
# File utilities
|
|
30
|
+
'resolve_file_paths',
|
|
31
|
+
'iter_file_chunks',
|
|
32
|
+
'read_table',
|
|
33
|
+
'load_dataframes',
|
|
34
|
+
'default_output_dir',
|
|
35
|
+
]
|
|
@@ -20,8 +20,10 @@ from nextrec.data.preprocessor import DataProcessor
|
|
|
20
20
|
from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
|
|
21
21
|
|
|
22
22
|
from nextrec.basic.loggers import colorize
|
|
23
|
-
from nextrec.data import get_column_data
|
|
24
|
-
from nextrec.
|
|
23
|
+
from nextrec.data.data_processing import get_column_data
|
|
24
|
+
from nextrec.data.batch_utils import collate_fn
|
|
25
|
+
from nextrec.utils.file import resolve_file_paths, read_table
|
|
26
|
+
from nextrec.utils.tensor import to_tensor
|
|
25
27
|
|
|
26
28
|
class TensorDictDataset(Dataset):
|
|
27
29
|
"""Dataset returning sample-level dicts matching the unified batch schema."""
|
|
@@ -16,24 +16,14 @@ import pandas as pd
|
|
|
16
16
|
import tqdm
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
from typing import Dict, Union, Optional, Literal, Any
|
|
19
|
-
from sklearn.preprocessing import (
|
|
20
|
-
|
|
21
|
-
MinMaxScaler,
|
|
22
|
-
RobustScaler,
|
|
23
|
-
MaxAbsScaler,
|
|
24
|
-
LabelEncoder
|
|
25
|
-
)
|
|
19
|
+
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
|
|
20
|
+
|
|
26
21
|
|
|
27
|
-
from nextrec.basic.loggers import setup_logger, colorize
|
|
28
|
-
from nextrec.data.data_utils import (
|
|
29
|
-
resolve_file_paths,
|
|
30
|
-
iter_file_chunks,
|
|
31
|
-
read_table,
|
|
32
|
-
load_dataframes,
|
|
33
|
-
default_output_dir,
|
|
34
|
-
)
|
|
35
|
-
from nextrec.basic.session import resolve_save_path
|
|
36
22
|
from nextrec.basic.features import FeatureSet
|
|
23
|
+
from nextrec.basic.loggers import colorize
|
|
24
|
+
from nextrec.basic.session import resolve_save_path
|
|
25
|
+
from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
|
|
26
|
+
|
|
37
27
|
from nextrec.__version__ import __version__
|
|
38
28
|
|
|
39
29
|
|
|
@@ -46,7 +46,7 @@ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
|
|
|
46
46
|
from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
|
|
47
47
|
from nextrec.basic.activation import activation_layer
|
|
48
48
|
from nextrec.basic.model import BaseModel
|
|
49
|
-
from nextrec.utils.
|
|
49
|
+
from nextrec.utils.model import merge_features
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
class POSOGate(nn.Module):
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities package for NextRec
|
|
3
|
+
|
|
4
|
+
This package provides various utility functions organized by category:
|
|
5
|
+
- optimizer: Optimizer and scheduler utilities
|
|
6
|
+
- initializer: Weight initialization utilities
|
|
7
|
+
- embedding: Embedding dimension calculation
|
|
8
|
+
- device_utils: Device management and selection
|
|
9
|
+
- tensor_utils: Tensor operations and conversions
|
|
10
|
+
- file_utils: File I/O operations
|
|
11
|
+
- model_utils: Model-related utilities
|
|
12
|
+
- feature_utils: Feature processing utilities
|
|
13
|
+
|
|
14
|
+
Date: create on 13/11/2025
|
|
15
|
+
Last update: 03/12/2025 (refactored)
|
|
16
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from .optimizer import get_optimizer, get_scheduler
|
|
20
|
+
from .initializer import get_initializer
|
|
21
|
+
from .embedding import get_auto_embedding_dim
|
|
22
|
+
from .device import resolve_device, get_device_info
|
|
23
|
+
from .tensor import to_tensor, stack_tensors, concat_tensors, pad_sequence_tensors
|
|
24
|
+
from .file import resolve_file_paths, read_table, load_dataframes, iter_file_chunks, default_output_dir
|
|
25
|
+
from .model import merge_features, get_mlp_output_dim
|
|
26
|
+
from .feature import normalize_to_list
|
|
27
|
+
from . import optimizer, initializer, embedding
|
|
28
|
+
|
|
29
|
+
__all__ = [
|
|
30
|
+
# Optimizer & Scheduler
|
|
31
|
+
'get_optimizer',
|
|
32
|
+
'get_scheduler',
|
|
33
|
+
|
|
34
|
+
# Initializer
|
|
35
|
+
'get_initializer',
|
|
36
|
+
|
|
37
|
+
# Embedding
|
|
38
|
+
'get_auto_embedding_dim',
|
|
39
|
+
|
|
40
|
+
# Device utilities
|
|
41
|
+
'resolve_device',
|
|
42
|
+
'get_device_info',
|
|
43
|
+
|
|
44
|
+
# Tensor utilities
|
|
45
|
+
'to_tensor',
|
|
46
|
+
'stack_tensors',
|
|
47
|
+
'concat_tensors',
|
|
48
|
+
'pad_sequence_tensors',
|
|
49
|
+
|
|
50
|
+
# File utilities
|
|
51
|
+
'resolve_file_paths',
|
|
52
|
+
'read_table',
|
|
53
|
+
'load_dataframes',
|
|
54
|
+
'iter_file_chunks',
|
|
55
|
+
'default_output_dir',
|
|
56
|
+
|
|
57
|
+
# Model utilities
|
|
58
|
+
'merge_features',
|
|
59
|
+
'get_mlp_output_dim',
|
|
60
|
+
|
|
61
|
+
# Feature utilities
|
|
62
|
+
'normalize_to_list',
|
|
63
|
+
|
|
64
|
+
# Module exports
|
|
65
|
+
'optimizer',
|
|
66
|
+
'initializer',
|
|
67
|
+
'embedding',
|
|
68
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Device management utilities for NextRec
|
|
3
|
+
|
|
4
|
+
Date: create on 03/12/2025
|
|
5
|
+
Author: Yang Zhou, zyaztec@gmail.com
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import platform
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def resolve_device() -> str:
|
|
13
|
+
if torch.cuda.is_available():
|
|
14
|
+
return "cuda"
|
|
15
|
+
if torch.backends.mps.is_available():
|
|
16
|
+
mac_ver = platform.mac_ver()[0]
|
|
17
|
+
try:
|
|
18
|
+
major, minor = (int(x) for x in mac_ver.split(".")[:2])
|
|
19
|
+
except Exception:
|
|
20
|
+
major, minor = 0, 0
|
|
21
|
+
if major >= 14:
|
|
22
|
+
return "mps"
|
|
23
|
+
return "cpu"
|
|
24
|
+
|
|
25
|
+
def get_device_info() -> dict:
|
|
26
|
+
info = {
|
|
27
|
+
'cuda_available': torch.cuda.is_available(),
|
|
28
|
+
'cuda_device_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
|
|
29
|
+
'mps_available': torch.backends.mps.is_available(),
|
|
30
|
+
'current_device': resolve_device(),
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
if torch.cuda.is_available():
|
|
34
|
+
info['cuda_device_name'] = torch.cuda.get_device_name(0)
|
|
35
|
+
info['cuda_capability'] = torch.cuda.get_device_capability(0)
|
|
36
|
+
|
|
37
|
+
return info
|