nextrec 0.3.4__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {nextrec-0.3.4 → nextrec-0.3.5}/PKG-INFO +3 -3
  2. {nextrec-0.3.4 → nextrec-0.3.5}/README.md +2 -2
  3. {nextrec-0.3.4 → nextrec-0.3.5}/README_zh.md +2 -2
  4. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/conf.py +1 -1
  5. nextrec-0.3.5/nextrec/__version__.py +1 -0
  6. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/features.py +1 -1
  7. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/model.py +4 -2
  8. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/session.py +3 -10
  9. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/__init__.py +47 -9
  10. nextrec-0.3.5/nextrec/data/batch_utils.py +80 -0
  11. nextrec-0.3.5/nextrec/data/data_processing.py +152 -0
  12. nextrec-0.3.5/nextrec/data/data_utils.py +35 -0
  13. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/dataloader.py +4 -2
  14. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/data/preprocessor.py +6 -16
  15. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/poso.py +1 -1
  16. nextrec-0.3.5/nextrec/utils/__init__.py +68 -0
  17. nextrec-0.3.5/nextrec/utils/device.py +37 -0
  18. nextrec-0.3.5/nextrec/utils/feature.py +13 -0
  19. nextrec-0.3.5/nextrec/utils/file.py +70 -0
  20. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/initializer.py +0 -8
  21. nextrec-0.3.5/nextrec/utils/model.py +22 -0
  22. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/optimizer.py +0 -19
  23. nextrec-0.3.5/nextrec/utils/tensor.py +61 -0
  24. {nextrec-0.3.4 → nextrec-0.3.5}/pyproject.toml +1 -1
  25. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_match_dssm.py +1 -1
  26. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_multitask.py +1 -1
  27. nextrec-0.3.4/nextrec/__version__.py +0 -1
  28. nextrec-0.3.4/nextrec/data/data_utils.py +0 -268
  29. nextrec-0.3.4/nextrec/utils/__init__.py +0 -18
  30. nextrec-0.3.4/nextrec/utils/common.py +0 -60
  31. {nextrec-0.3.4 → nextrec-0.3.5}/.github/workflows/publish.yml +0 -0
  32. {nextrec-0.3.4 → nextrec-0.3.5}/.github/workflows/tests.yml +0 -0
  33. {nextrec-0.3.4 → nextrec-0.3.5}/.gitignore +0 -0
  34. {nextrec-0.3.4 → nextrec-0.3.5}/.readthedocs.yaml +0 -0
  35. {nextrec-0.3.4 → nextrec-0.3.5}/CODE_OF_CONDUCT.md +0 -0
  36. {nextrec-0.3.4 → nextrec-0.3.5}/CONTRIBUTING.md +0 -0
  37. {nextrec-0.3.4 → nextrec-0.3.5}/LICENSE +0 -0
  38. {nextrec-0.3.4 → nextrec-0.3.5}/MANIFEST.in +0 -0
  39. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Feature Configuration.png +0 -0
  40. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Model Parameters.png +0 -0
  41. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Training Configuration.png +0 -0
  42. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/Training logs.png +0 -0
  43. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/logo.png +0 -0
  44. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/mmoe_tutorial.png +0 -0
  45. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/nextrec_diagram_en.png +0 -0
  46. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/nextrec_diagram_zh.png +0 -0
  47. {nextrec-0.3.4 → nextrec-0.3.5}/asserts/test data.png +0 -0
  48. {nextrec-0.3.4 → nextrec-0.3.5}/dataset/ctcvr_task.csv +0 -0
  49. {nextrec-0.3.4 → nextrec-0.3.5}/dataset/match_task.csv +0 -0
  50. {nextrec-0.3.4 → nextrec-0.3.5}/dataset/movielens_100k.csv +0 -0
  51. {nextrec-0.3.4 → nextrec-0.3.5}/dataset/multitask_task.csv +0 -0
  52. {nextrec-0.3.4 → nextrec-0.3.5}/dataset/ranking_task.csv +0 -0
  53. {nextrec-0.3.4 → nextrec-0.3.5}/docs/en/Getting started guide.md +0 -0
  54. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/Makefile +0 -0
  55. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/index.md +0 -0
  56. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/make.bat +0 -0
  57. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/modules.rst +0 -0
  58. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.basic.rst +0 -0
  59. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.data.rst +0 -0
  60. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.loss.rst +0 -0
  61. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.rst +0 -0
  62. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/nextrec.utils.rst +0 -0
  63. {nextrec-0.3.4 → nextrec-0.3.5}/docs/rtd/requirements.txt +0 -0
  64. {nextrec-0.3.4 → nextrec-0.3.5}/docs/zh//345/277/253/351/200/237/344/270/212/346/211/213.md" +0 -0
  65. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/__init__.py +0 -0
  66. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/__init__.py +0 -0
  67. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/activation.py +0 -0
  68. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/callback.py +0 -0
  69. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/layers.py +0 -0
  70. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/loggers.py +0 -0
  71. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/basic/metrics.py +0 -0
  72. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/__init__.py +0 -0
  73. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/listwise.py +0 -0
  74. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/loss_utils.py +0 -0
  75. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/pairwise.py +0 -0
  76. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/loss/pointwise.py +0 -0
  77. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/__init__.py +0 -0
  78. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/hstu.py +0 -0
  79. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/generative/tiger.py +0 -0
  80. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/__init__.py +0 -0
  81. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/dssm.py +0 -0
  82. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/dssm_v2.py +0 -0
  83. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/mind.py +0 -0
  84. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/sdm.py +0 -0
  85. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/match/youtube_dnn.py +0 -0
  86. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/esmm.py +0 -0
  87. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/mmoe.py +0 -0
  88. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/ple.py +0 -0
  89. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/multi_task/share_bottom.py +0 -0
  90. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/__init__.py +0 -0
  91. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/afm.py +0 -0
  92. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/autoint.py +0 -0
  93. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dcn.py +0 -0
  94. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dcn_v2.py +0 -0
  95. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/deepfm.py +0 -0
  96. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/dien.py +0 -0
  97. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/din.py +0 -0
  98. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/fibinet.py +0 -0
  99. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/fm.py +0 -0
  100. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/masknet.py +0 -0
  101. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/pnn.py +0 -0
  102. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/widedeep.py +0 -0
  103. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/models/ranking/xdeepfm.py +0 -0
  104. {nextrec-0.3.4 → nextrec-0.3.5}/nextrec/utils/embedding.py +0 -0
  105. {nextrec-0.3.4 → nextrec-0.3.5}/pytest.ini +0 -0
  106. {nextrec-0.3.4 → nextrec-0.3.5}/requirements.txt +0 -0
  107. {nextrec-0.3.4 → nextrec-0.3.5}/test/__init__.py +0 -0
  108. {nextrec-0.3.4 → nextrec-0.3.5}/test/conftest.py +0 -0
  109. {nextrec-0.3.4 → nextrec-0.3.5}/test/run_tests.py +0 -0
  110. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_layers.py +0 -0
  111. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_losses.py +0 -0
  112. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_match_models.py +0 -0
  113. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_multitask_models.py +0 -0
  114. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_preprocessor.py +0 -0
  115. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_ranking_models.py +0 -0
  116. {nextrec-0.3.4 → nextrec-0.3.5}/test/test_utils.py +0 -0
  117. {nextrec-0.3.4 → nextrec-0.3.5}/test_requirements.txt +0 -0
  118. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/example_ranking_din.py +0 -0
  119. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/movielen_match_dssm.py +0 -0
  120. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/movielen_ranking_deepfm.py +0 -0
  121. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/en/Hands on dataprocessor.ipynb +0 -0
  122. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/en/Hands on nextrec.ipynb +0 -0
  123. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/zh/Hands on dataprocessor.ipynb +0 -0
  124. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/notebooks/zh/Hands on nextrec.ipynb +0 -0
  125. {nextrec-0.3.4 → nextrec-0.3.5}/tutorials/run_all_tutorials.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nextrec
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
5
5
  Project-URL: Homepage, https://github.com/zerolovesea/NextRec
6
6
  Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
63
63
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
64
64
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
65
65
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
66
- ![Version](https://img.shields.io/badge/Version-0.3.4-orange.svg)
66
+ ![Version](https://img.shields.io/badge/Version-0.3.5-orange.svg)
67
67
 
68
68
  English | [中文文档](README_zh.md)
69
69
 
@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
110
110
  - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
111
111
  - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
112
112
 
113
- > Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
113
+ > Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
114
114
 
115
115
  ## 5-Minute Quick Start
116
116
 
@@ -7,7 +7,7 @@
7
7
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
8
8
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
9
9
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
10
- ![Version](https://img.shields.io/badge/Version-0.3.4-orange.svg)
10
+ ![Version](https://img.shields.io/badge/Version-0.3.5-orange.svg)
11
11
 
12
12
  English | [中文文档](README_zh.md)
13
13
 
@@ -54,7 +54,7 @@ To dive deeper, Jupyter notebooks are available:
54
54
  - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
55
55
  - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
56
56
 
57
- > Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
57
+ > Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
58
58
 
59
59
  ## 5-Minute Quick Start
60
60
 
@@ -7,7 +7,7 @@
7
7
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
8
8
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
9
9
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
10
- ![Version](https://img.shields.io/badge/Version-0.3.4-orange.svg)
10
+ ![Version](https://img.shields.io/badge/Version-0.3.5-orange.svg)
11
11
 
12
12
  [English Version](README.md) | 中文文档
13
13
 
@@ -54,7 +54,7 @@ NextRec采用模块化、低耦合的工程设计,使得推荐系统从数据
54
54
  - [如何上手NextRec框架](/tutorials/notebooks/zh/Hands%20on%20nextrec.ipynb)
55
55
  - [如何使用数据处理器进行数据预处理](/tutorials/notebooks/zh/Hands%20on%20dataprocessor.ipynb)
56
56
 
57
- > 当前版本[0.3.4],召回模型模块尚不完善,可能存在一些兼容性问题或意外报错,如果遇到问题,欢迎开发者在Issue区提出问题。
57
+ > 当前版本[0.3.5],召回模型模块尚不完善,可能存在一些兼容性问题或意外报错,如果遇到问题,欢迎开发者在Issue区提出问题。
58
58
 
59
59
  ## 5分钟快速上手
60
60
 
@@ -11,7 +11,7 @@ sys.path.insert(0, str(PROJECT_ROOT / "nextrec"))
11
11
  project = "NextRec"
12
12
  copyright = "2025, Yang Zhou"
13
13
  author = "Yang Zhou"
14
- release = "0.3.4"
14
+ release = "0.3.5"
15
15
 
16
16
  extensions = [
17
17
  "myst_parser",
@@ -0,0 +1 @@
1
+ __version__ = "0.3.5"
@@ -7,7 +7,7 @@ Author: Yang Zhou, zyaztec@gmail.com
7
7
  """
8
8
  import torch
9
9
  from nextrec.utils.embedding import get_auto_embedding_dim
10
- from nextrec.utils.common import normalize_to_list
10
+ from nextrec.utils.feature import normalize_to_list
11
11
 
12
12
  class BaseFeature(object):
13
13
  def __repr__(self):
@@ -31,10 +31,12 @@ from nextrec.basic.session import resolve_save_path, create_session
31
31
  from nextrec.basic.metrics import configure_metrics, evaluate_metrics, check_user_id
32
32
 
33
33
  from nextrec.data.dataloader import build_tensors_from_data
34
- from nextrec.data.data_utils import get_column_data, collate_fn, batch_to_dict, get_user_ids
34
+ from nextrec.data.data_processing import get_column_data, get_user_ids
35
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict
35
36
 
36
37
  from nextrec.loss import get_loss_fn, get_loss_kwargs
37
- from nextrec.utils import get_optimizer, get_scheduler, to_tensor
38
+ from nextrec.utils import get_optimizer, get_scheduler
39
+ from nextrec.utils.tensor import to_tensor
38
40
 
39
41
  from nextrec import __version__
40
42
 
@@ -1,14 +1,5 @@
1
1
  """Session and experiment utilities.
2
2
 
3
- This module centralizes session/experiment management so the rest of the
4
- framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
5
-
6
- Within that folder we keep model parameters, checkpoints, training metrics,
7
- evaluation metrics, and consolidated log output. When users do not provide an
8
- ``experiment_id`` a timestamp-based identifier is generated once per process to
9
- avoid scattering files across multiple directories. Test runs are redirected to
10
- temporary folders so local trees are not polluted.
11
-
12
3
  Date: create on 23/11/2025
13
4
  Author: Yang Zhou,zyaztec@gmail.com
14
5
  """
@@ -16,7 +7,7 @@ Author: Yang Zhou,zyaztec@gmail.com
16
7
  import os
17
8
  import tempfile
18
9
  from dataclasses import dataclass
19
- from datetime import datetime
10
+ from datetime import datetime, timezone
20
11
  from pathlib import Path
21
12
 
22
13
  __all__ = [
@@ -74,6 +65,7 @@ def create_session(experiment_id: str | Path | None = None) -> Session:
74
65
  if experiment_id is not None and str(experiment_id).strip():
75
66
  exp_id = str(experiment_id).strip()
76
67
  else:
68
+ # Use local time for session naming
77
69
  exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
78
70
 
79
71
  if (
@@ -111,6 +103,7 @@ def resolve_save_path(
111
103
  timestamp.
112
104
  - Parent directories are created.
113
105
  """
106
+ # Use local time for file timestamps
114
107
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None
115
108
 
116
109
  normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
@@ -1,48 +1,86 @@
1
1
  """
2
2
  Data utilities package for NextRec
3
3
 
4
- This package provides data processing and manipulation utilities.
4
+ This package provides data processing and manipulation utilities organized by category:
5
+ - batch_utils: Batch collation and processing
6
+ - data_processing: Data manipulation and user ID extraction
7
+ - data_utils: Legacy module (re-exports from specialized modules)
8
+ - dataloader: Dataset and DataLoader implementations
9
+ - preprocessor: Data preprocessing pipeline
5
10
 
6
11
  Date: create on 13/11/2025
12
+ Last update: 03/12/2025 (refactored)
7
13
  Author: Yang Zhou, zyaztec@gmail.com
8
14
  """
9
15
 
10
- from nextrec.data.data_utils import (
11
- collate_fn,
16
+ # Batch utilities
17
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
18
+
19
+ # Data processing utilities
20
+ from nextrec.data.data_processing import (
12
21
  get_column_data,
13
- default_output_dir,
14
22
  split_dict_random,
15
23
  build_eval_candidates,
24
+ get_user_ids,
25
+ )
26
+
27
+ # File utilities (from utils package)
28
+ from nextrec.utils.file import (
16
29
  resolve_file_paths,
17
30
  iter_file_chunks,
18
31
  read_table,
19
32
  load_dataframes,
33
+ default_output_dir,
20
34
  )
21
- from nextrec.basic.features import FeatureSet
22
- from nextrec.data import data_utils
35
+
36
+ # DataLoader components
23
37
  from nextrec.data.dataloader import (
24
38
  TensorDictDataset,
25
39
  FileDataset,
26
40
  RecDataLoader,
27
41
  build_tensors_from_data,
28
42
  )
43
+
44
+ # Preprocessor
29
45
  from nextrec.data.preprocessor import DataProcessor
30
46
 
47
+ # Feature definitions
48
+ from nextrec.basic.features import FeatureSet
49
+
50
+ # Legacy module (for backward compatibility)
51
+ from nextrec.data import data_utils
52
+
31
53
  __all__ = [
54
+ # Batch utilities
32
55
  'collate_fn',
56
+ 'batch_to_dict',
57
+ 'stack_section',
58
+
59
+ # Data processing
33
60
  'get_column_data',
34
- 'default_output_dir',
35
61
  'split_dict_random',
36
62
  'build_eval_candidates',
63
+ 'get_user_ids',
64
+
65
+ # File utilities
37
66
  'resolve_file_paths',
38
67
  'iter_file_chunks',
39
68
  'read_table',
40
69
  'load_dataframes',
41
- 'FeatureSet',
42
- 'data_utils',
70
+ 'default_output_dir',
71
+
72
+ # DataLoader
43
73
  'TensorDictDataset',
44
74
  'FileDataset',
45
75
  'RecDataLoader',
46
76
  'build_tensors_from_data',
77
+
78
+ # Preprocessor
47
79
  'DataProcessor',
80
+
81
+ # Features
82
+ 'FeatureSet',
83
+
84
+ # Legacy module
85
+ 'data_utils',
48
86
  ]
@@ -0,0 +1,80 @@
1
+ """
2
+ Batch collation utilities for NextRec
3
+
4
+ Date: create on 03/12/2025
5
+ Author: Yang Zhou, zyaztec@gmail.com
6
+ """
7
+
8
+ import torch
9
+ import numpy as np
10
+ from typing import Any, Mapping
11
+
12
def stack_section(batch: list[dict], section: str):
    """Stack one named sub-dict of tensors across a batch of sample dicts.

    Each sample may carry a ``section`` dict (e.g. "features", "labels",
    "ids") mapping tensor names to tensors. Returns a dict with each name
    stacked along a new leading batch dimension, or None when no sample
    carries the section. Names are taken from the first sample that has
    the section; samples missing a name are simply skipped for it.
    """
    present = [sample[section] for sample in batch if sample.get(section) is not None]
    if not present:
        return None
    stacked: dict = {}
    for name in present[0]:
        stacked[name] = torch.stack([part[name] for part in present if name in part], dim=0)
    return stacked
21
+
22
def collate_fn(batch):
    """
    Collate a list of sample dicts into the unified batch format:
    {
        "features": {name: Tensor(B, ...)},
        "labels": {target: Tensor(B, ...)} or None,
        "ids": {id_name: Tensor(B, ...)} or None,
    }
    Args: batch: List of samples from DataLoader

    Returns: dict: Batched data in unified format
    """
    if not batch:
        return {"features": {}, "labels": None, "ids": None}

    sample = batch[0]
    if isinstance(sample, dict) and "features" in sample:
        # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
        if len(batch) == 1 and sample.get("_already_batched"):
            return {
                "features": sample.get("features", {}),
                "labels": sample.get("labels"),
                "ids": sample.get("ids"),
            }
        return {
            "features": stack_section(batch, "features") or {},
            "labels": stack_section(batch, "labels"),
            "ids": stack_section(batch, "ids"),
        }

    # Fallback: samples are tuples/lists; combine them position-wise.
    width = len(sample)
    collated = []
    for pos in range(width):
        column = [entry[pos] for entry in batch]
        head = column[0]
        if isinstance(head, torch.Tensor):
            # Tensors are concatenated (not stacked): each entry already
            # carries its own leading batch dimension.
            collated.append(torch.cat(column, dim=0))
        elif isinstance(head, np.ndarray):
            collated.append(np.concatenate(column, axis=0))
        elif isinstance(head, list):
            flat = []
            for piece in column:
                flat.extend(piece)
            collated.append(flat)
        else:
            # Unknown element type: keep the raw per-sample values.
            collated.append(column)
    return tuple(collated)
71
+
72
+
73
def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
    """Normalize an already-collated batch into the unified dict schema.

    Args:
        batch_data: Mapping with a "features" key, as produced by collate_fn.
        include_ids: When False, the "ids" entry is forced to None.

    Raises:
        TypeError: If batch_data is not a mapping carrying "features".
    """
    is_unified = isinstance(batch_data, Mapping) and "features" in batch_data
    if not is_unified:
        raise TypeError("[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader.")
    ids = batch_data.get("ids") if include_ids else None
    return {
        "features": batch_data.get("features", {}),
        "labels": batch_data.get("labels"),
        "ids": ids,
    }
@@ -0,0 +1,152 @@
1
+ """
2
+ Data processing utilities for NextRec
3
+
4
+ Date: create on 03/12/2025
5
+ Author: Yang Zhou, zyaztec@gmail.com
6
+ """
7
+
8
+ import torch
9
+ import numpy as np
10
+ import pandas as pd
11
+ from typing import Any, Mapping
12
+
13
+
14
+ def get_column_data(data: dict | pd.DataFrame, name: str):
15
+ if isinstance(data, dict):
16
+ return data[name] if name in data else None
17
+ elif isinstance(data, pd.DataFrame):
18
+ if name not in data.columns:
19
+ return None
20
+ return data[name].values
21
+ else:
22
+ if hasattr(data, name):
23
+ return getattr(data, name)
24
+ raise KeyError(f"Unsupported data type for extracting column {name}")
25
+
26
+ def split_dict_random(
27
+ data_dict: dict,
28
+ test_size: float = 0.2,
29
+ random_state: int | None = None
30
+ ):
31
+ lengths = [len(v) for v in data_dict.values()]
32
+ if len(set(lengths)) != 1:
33
+ raise ValueError(f"Length mismatch: {lengths}")
34
+
35
+ n = lengths[0]
36
+ rng = np.random.default_rng(random_state)
37
+ perm = rng.permutation(n)
38
+ cut = int(round(n * (1 - test_size)))
39
+ train_idx, test_idx = perm[:cut], perm[cut:]
40
+
41
+ def take(v, idx):
42
+ if isinstance(v, np.ndarray):
43
+ return v[idx]
44
+ elif isinstance(v, pd.Series):
45
+ return v.iloc[idx].to_numpy()
46
+ else:
47
+ v_arr = np.asarray(v, dtype=object)
48
+ return v_arr[idx]
49
+
50
+ train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
51
+ test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
52
+ return train_dict, test_dict
53
+
54
+
55
def build_eval_candidates(
    df_all: pd.DataFrame,
    user_col: str,
    item_col: str,
    label_col: str,
    user_features: pd.DataFrame,
    item_features: pd.DataFrame,
    num_pos_per_user: int = 5,
    num_neg_per_pos: int = 50,
    random_seed: int = 2025,
) -> pd.DataFrame:
    """
    Build evaluation candidates with positive and negative samples for each user.

    Args:
        df_all: Full interaction DataFrame
        user_col: Name of the user ID column
        item_col: Name of the item ID column
        label_col: Name of the label column
        user_features: DataFrame containing user features
        item_features: DataFrame containing item features
        num_pos_per_user: Number of positive samples per user (default: 5)
        num_neg_per_pos: Number of negative samples per positive (default: 50)
        random_seed: Random seed for reproducibility (default: 2025)

    Returns:
        pd.DataFrame: Evaluation candidates with features
    """
    rng = np.random.default_rng(random_seed)

    users = df_all[user_col].unique()
    # Candidate item universe comes from item_features, not from df_all.
    all_items = item_features[item_col].unique()
    rows = []
    # NOTE(review): one boolean-mask scan of df_all per user here plus a
    # second scan inside the loop below is O(users * rows); a single
    # groupby would be faster — left as-is to preserve behavior exactly.
    user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}

    for u in users:
        df_user = df_all[df_all[user_col] == u]
        # Positives: items this user interacted with under label == 1.
        pos_items = df_user[df_user[label_col] == 1][item_col].unique()
        if len(pos_items) == 0:
            continue
        # Cap positives; .unique() preserves first-occurrence order, so this
        # takes the user's earliest-seen positives.
        pos_items = pos_items[:num_pos_per_user]
        # Negatives are sampled only from items the user has never seen
        # (regardless of label).
        seen_items = set(user_hist_items[u])
        neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
        if len(neg_pool) == 0:
            continue
        for pos in pos_items:
            if len(neg_pool) <= num_neg_per_pos:
                # Pool smaller than requested: use every unseen item.
                neg_items = neg_pool
            else:
                neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
            rows.append((u, pos, 1))
            for ni in neg_items:
                rows.append((u, ni, 0))

    eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
    # Left joins keep every candidate row even when a user/item has no
    # feature record (their feature columns become NaN).
    eval_df = eval_df.merge(user_features, on=user_col, how='left')
    eval_df = eval_df.merge(item_features, on=item_col, how='left')
    return eval_df
113
+
114
+
115
+ def get_user_ids(
116
+ data: Any,
117
+ id_columns: list[str] | str | None = None
118
+ ) -> np.ndarray | None:
119
+ """
120
+ Extract user IDs from various data structures.
121
+
122
+ Args:
123
+ data: Data source (DataFrame, dict, or batch dict)
124
+ id_columns: List or single ID column name(s) (default: None)
125
+
126
+ Returns:
127
+ np.ndarray | None: User IDs as numpy array, or None if not found
128
+ """
129
+ id_columns = (
130
+ id_columns if isinstance(id_columns, list)
131
+ else [id_columns] if isinstance(id_columns, str)
132
+ else []
133
+ )
134
+ if not id_columns:
135
+ return None
136
+
137
+ main_id = id_columns[0]
138
+ if isinstance(data, pd.DataFrame) and main_id in data.columns:
139
+ arr = np.asarray(data[main_id].values)
140
+ return arr.reshape(arr.shape[0])
141
+
142
+ if isinstance(data, dict):
143
+ ids_container = data.get("ids")
144
+ if isinstance(ids_container, dict) and main_id in ids_container:
145
+ val = ids_container[main_id]
146
+ val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
147
+ return val.reshape(val.shape[0])
148
+ if main_id in data:
149
+ arr = np.asarray(data[main_id])
150
+ return arr.reshape(arr.shape[0])
151
+
152
+ return None
@@ -0,0 +1,35 @@
1
+ """
2
+ Data processing utilities for NextRec (Refactored)
3
+
4
+ This module now re-exports functions from specialized submodules:
5
+ - batch_utils: collate_fn, batch_to_dict
6
+ - data_processing: get_column_data, split_dict_random, build_eval_candidates, get_user_ids
7
+ - nextrec.utils.file_utils: resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
8
+
9
+ Date: create on 27/10/2025
10
+ Last update: 03/12/2025 (refactored)
11
+ Author: Yang Zhou, zyaztec@gmail.com
12
+ """
13
+
14
+ # Import from new organized modules
15
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
16
+ from nextrec.data.data_processing import get_column_data, split_dict_random, build_eval_candidates, get_user_ids
17
+ from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
18
+
19
+ __all__ = [
20
+ # Batch utilities
21
+ 'collate_fn',
22
+ 'batch_to_dict',
23
+ 'stack_section',
24
+ # Data processing
25
+ 'get_column_data',
26
+ 'split_dict_random',
27
+ 'build_eval_candidates',
28
+ 'get_user_ids',
29
+ # File utilities
30
+ 'resolve_file_paths',
31
+ 'iter_file_chunks',
32
+ 'read_table',
33
+ 'load_dataframes',
34
+ 'default_output_dir',
35
+ ]
@@ -20,8 +20,10 @@ from nextrec.data.preprocessor import DataProcessor
20
20
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet
21
21
 
22
22
  from nextrec.basic.loggers import colorize
23
- from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
24
- from nextrec.utils import to_tensor
23
+ from nextrec.data.data_processing import get_column_data
24
+ from nextrec.data.batch_utils import collate_fn
25
+ from nextrec.utils.file import resolve_file_paths, read_table
26
+ from nextrec.utils.tensor import to_tensor
25
27
 
26
28
  class TensorDictDataset(Dataset):
27
29
  """Dataset returning sample-level dicts matching the unified batch schema."""
@@ -16,24 +16,14 @@ import pandas as pd
16
16
  import tqdm
17
17
  from pathlib import Path
18
18
  from typing import Dict, Union, Optional, Literal, Any
19
- from sklearn.preprocessing import (
20
- StandardScaler,
21
- MinMaxScaler,
22
- RobustScaler,
23
- MaxAbsScaler,
24
- LabelEncoder
25
- )
19
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
20
+
26
21
 
27
- from nextrec.basic.loggers import setup_logger, colorize
28
- from nextrec.data.data_utils import (
29
- resolve_file_paths,
30
- iter_file_chunks,
31
- read_table,
32
- load_dataframes,
33
- default_output_dir,
34
- )
35
- from nextrec.basic.session import resolve_save_path
36
22
  from nextrec.basic.features import FeatureSet
23
+ from nextrec.basic.loggers import colorize
24
+ from nextrec.basic.session import resolve_save_path
25
+ from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
26
+
37
27
  from nextrec.__version__ import __version__
38
28
 
39
29
 
@@ -46,7 +46,7 @@ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
46
46
  from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
47
47
  from nextrec.basic.activation import activation_layer
48
48
  from nextrec.basic.model import BaseModel
49
- from nextrec.utils.common import merge_features
49
+ from nextrec.utils.model import merge_features
50
50
 
51
51
 
52
52
  class POSOGate(nn.Module):
@@ -0,0 +1,68 @@
1
+ """
2
+ Utilities package for NextRec
3
+
4
+ This package provides various utility functions organized by category:
5
+ - optimizer: Optimizer and scheduler utilities
6
+ - initializer: Weight initialization utilities
7
+ - embedding: Embedding dimension calculation
8
+ - device_utils: Device management and selection
9
+ - tensor_utils: Tensor operations and conversions
10
+ - file_utils: File I/O operations
11
+ - model_utils: Model-related utilities
12
+ - feature_utils: Feature processing utilities
13
+
14
+ Date: create on 13/11/2025
15
+ Last update: 03/12/2025 (refactored)
16
+ Author: Yang Zhou, zyaztec@gmail.com
17
+ """
18
+
19
+ from .optimizer import get_optimizer, get_scheduler
20
+ from .initializer import get_initializer
21
+ from .embedding import get_auto_embedding_dim
22
+ from .device import resolve_device, get_device_info
23
+ from .tensor import to_tensor, stack_tensors, concat_tensors, pad_sequence_tensors
24
+ from .file import resolve_file_paths, read_table, load_dataframes, iter_file_chunks, default_output_dir
25
+ from .model import merge_features, get_mlp_output_dim
26
+ from .feature import normalize_to_list
27
+ from . import optimizer, initializer, embedding
28
+
29
+ __all__ = [
30
+ # Optimizer & Scheduler
31
+ 'get_optimizer',
32
+ 'get_scheduler',
33
+
34
+ # Initializer
35
+ 'get_initializer',
36
+
37
+ # Embedding
38
+ 'get_auto_embedding_dim',
39
+
40
+ # Device utilities
41
+ 'resolve_device',
42
+ 'get_device_info',
43
+
44
+ # Tensor utilities
45
+ 'to_tensor',
46
+ 'stack_tensors',
47
+ 'concat_tensors',
48
+ 'pad_sequence_tensors',
49
+
50
+ # File utilities
51
+ 'resolve_file_paths',
52
+ 'read_table',
53
+ 'load_dataframes',
54
+ 'iter_file_chunks',
55
+ 'default_output_dir',
56
+
57
+ # Model utilities
58
+ 'merge_features',
59
+ 'get_mlp_output_dim',
60
+
61
+ # Feature utilities
62
+ 'normalize_to_list',
63
+
64
+ # Module exports
65
+ 'optimizer',
66
+ 'initializer',
67
+ 'embedding',
68
+ ]
@@ -0,0 +1,37 @@
1
+ """
2
+ Device management utilities for NextRec
3
+
4
+ Date: create on 03/12/2025
5
+ Author: Yang Zhou, zyaztec@gmail.com
6
+ """
7
+
8
+ import torch
9
+ import platform
10
+
11
+
12
+ def resolve_device() -> str:
13
+ if torch.cuda.is_available():
14
+ return "cuda"
15
+ if torch.backends.mps.is_available():
16
+ mac_ver = platform.mac_ver()[0]
17
+ try:
18
+ major, minor = (int(x) for x in mac_ver.split(".")[:2])
19
+ except Exception:
20
+ major, minor = 0, 0
21
+ if major >= 14:
22
+ return "mps"
23
+ return "cpu"
24
+
25
+ def get_device_info() -> dict:
26
+ info = {
27
+ 'cuda_available': torch.cuda.is_available(),
28
+ 'cuda_device_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
29
+ 'mps_available': torch.backends.mps.is_available(),
30
+ 'current_device': resolve_device(),
31
+ }
32
+
33
+ if torch.cuda.is_available():
34
+ info['cuda_device_name'] = torch.cuda.get_device_name(0)
35
+ info['cuda_capability'] = torch.cuda.get_device_capability(0)
36
+
37
+ return info