nextrec 0.3.5__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nextrec-0.3.5 → nextrec-0.3.6}/.gitignore +1 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/PKG-INFO +3 -3
- {nextrec-0.3.5 → nextrec-0.3.6}/README.md +2 -2
- {nextrec-0.3.5 → nextrec-0.3.6}/README_zh.md +2 -2
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/conf.py +1 -1
- nextrec-0.3.6/nextrec/__init__.py +11 -0
- nextrec-0.3.6/nextrec/__version__.py +1 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/loggers.py +1 -1
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/model.py +16 -13
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/session.py +4 -2
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/__init__.py +0 -25
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/dataloader.py +15 -10
- nextrec-0.3.6/nextrec/models/generative/tiger.py +0 -0
- nextrec-0.3.6/nextrec/models/match/__init__.py +0 -0
- nextrec-0.3.6/nextrec/models/multi_task/__init__.py +0 -0
- nextrec-0.3.6/nextrec/models/ranking/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/device.py +2 -1
- {nextrec-0.3.5 → nextrec-0.3.6}/pyproject.toml +1 -1
- nextrec-0.3.5/nextrec/__init__.py +0 -41
- nextrec-0.3.5/nextrec/__version__.py +0 -1
- nextrec-0.3.5/nextrec/models/generative/__init__.py +0 -5
- nextrec-0.3.5/nextrec/models/match/__init__.py +0 -13
- nextrec-0.3.5/nextrec/models/ranking/__init__.py +0 -27
- {nextrec-0.3.5 → nextrec-0.3.6}/.github/workflows/publish.yml +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/.github/workflows/tests.yml +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/.readthedocs.yaml +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/CODE_OF_CONDUCT.md +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/CONTRIBUTING.md +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/LICENSE +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/MANIFEST.in +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/Feature Configuration.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/Model Parameters.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/Training Configuration.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/Training logs.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/logo.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/mmoe_tutorial.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/nextrec_diagram_en.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/nextrec_diagram_zh.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/asserts/test data.png +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/dataset/ctcvr_task.csv +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/dataset/match_task.csv +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/dataset/movielens_100k.csv +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/dataset/multitask_task.csv +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/dataset/ranking_task.csv +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/en/Getting started guide.md +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/Makefile +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/index.md +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/make.bat +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/modules.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/nextrec.basic.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/nextrec.data.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/nextrec.loss.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/nextrec.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/nextrec.utils.rst +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/rtd/requirements.txt +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/docs/zh/快速上手.md +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/activation.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/callback.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/features.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/layers.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/metrics.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/batch_utils.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/data_processing.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/data_utils.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/preprocessor.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/loss/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/loss/listwise.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/loss/loss_utils.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/loss/pairwise.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/loss/pointwise.py +0 -0
- /nextrec-0.3.5/nextrec/models/generative/tiger.py → /nextrec-0.3.6/nextrec/models/generative/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/generative/hstu.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/match/dssm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/match/dssm_v2.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/match/mind.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/match/sdm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/match/youtube_dnn.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/multi_task/esmm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/multi_task/mmoe.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/multi_task/ple.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/multi_task/poso.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/multi_task/share_bottom.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/afm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/autoint.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/dcn.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/dcn_v2.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/deepfm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/dien.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/din.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/fibinet.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/fm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/masknet.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/pnn.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/widedeep.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/models/ranking/xdeepfm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/embedding.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/feature.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/file.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/initializer.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/model.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/optimizer.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/nextrec/utils/tensor.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/pytest.ini +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/requirements.txt +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/__init__.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/conftest.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/run_tests.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_layers.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_losses.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_match_models.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_multitask_models.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_preprocessor.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_ranking_models.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test/test_utils.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/test_requirements.txt +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/example_match_dssm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/example_multitask.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/example_ranking_din.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/movielen_match_dssm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/movielen_ranking_deepfm.py +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/notebooks/en/Hands on dataprocessor.ipynb +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/notebooks/en/Hands on nextrec.ipynb +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/notebooks/zh/Hands on dataprocessor.ipynb +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/notebooks/zh/Hands on nextrec.ipynb +0 -0
- {nextrec-0.3.5 → nextrec-0.3.6}/tutorials/run_all_tutorials.py +0 -0
{nextrec-0.3.5 → nextrec-0.3.6}/PKG-INFO +3 -3

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nextrec
-Version: 0.3.5
+Version: 0.3.6
 Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
 Project-URL: Homepage, https://github.com/zerolovesea/NextRec
 Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
 
 
 
-
+
 
 English | [中文文档](README_zh.md)
 
@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
 - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
 - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
 
-> Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
+> Current version [0.3.6]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
 
 ## 5-Minute Quick Start
 
```
{nextrec-0.3.5 → nextrec-0.3.6}/README.md +2 -2

```diff
@@ -7,7 +7,7 @@
 
 
 
-
+
 
 English | [中文文档](README_zh.md)
 
@@ -54,7 +54,7 @@ To dive deeper, Jupyter notebooks are available:
 - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
 - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)
 
-> Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
+> Current version [0.3.6]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
 
 ## 5-Minute Quick Start
 
```
{nextrec-0.3.5 → nextrec-0.3.6}/README_zh.md +2 -2

```diff
@@ -7,7 +7,7 @@
 
 
 
-
+
 
 [English Version](README.md) | 中文文档
 
@@ -54,7 +54,7 @@ NextRec采用模块化、低耦合的工程设计，使得推荐系统从数据
 - [如何上手NextRec框架](/tutorials/notebooks/zh/Hands%20on%20nextrec.ipynb)
 - [如何使用数据处理器进行数据预处理](/tutorials/notebooks/zh/Hands%20on%20dataprocessor.ipynb)
 
-> 当前版本[0.3.5]，召回模型模块尚不完善，可能存在一些兼容性问题或意外报错，如果遇到问题，欢迎开发者在Issue区提出问题。
+> 当前版本[0.3.6]，召回模型模块尚不完善，可能存在一些兼容性问题或意外报错，如果遇到问题，欢迎开发者在Issue区提出问题。
 
 ## 5分钟快速上手
 
```
nextrec-0.3.6/nextrec/__version__.py +1 -0

```diff
@@ -0,0 +1 @@
+__version__ = "0.3.6"
```
{nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/loggers.py +1 -1

```diff
@@ -99,7 +99,7 @@ def setup_logger(session_id: str | os.PathLike | None = None):
     session = create_session(str(session_id) if session_id is not None else None)
     log_dir = session.logs_dir
     log_dir.mkdir(parents=True, exist_ok=True)
-    log_file = log_dir / f"{session.experiment_id}.log"
+    log_file = log_dir / f"{session.log_basename}.log"
 
     console_format = '%(message)s'
     file_format = '%(asctime)s - %(levelname)s - %(message)s'
```
{nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/model.py +16 -13

```diff
@@ -155,7 +155,7 @@ class BaseModel(FeatureSet, nn.Module):
             raise ValueError("[BaseModel-input Error] Labels are required but none were found in the input batch.")
         return X_input, y
 
-    def handle_validation_split(self, train_data: dict | pd.DataFrame, validation_split: float, batch_size: int, shuffle: bool,) -> tuple[DataLoader, dict | pd.DataFrame]:
+    def handle_validation_split(self, train_data: dict | pd.DataFrame, validation_split: float, batch_size: int, shuffle: bool, num_workers: int = 0,) -> tuple[DataLoader, dict | pd.DataFrame]:
         """This function will split training data into training and validation sets when: 1. valid_data is None; 2. validation_split is provided."""
         if not (0 < validation_split < 1):
             raise ValueError(f"[BaseModel-validation Error] validation_split must be between 0 and 1, got {validation_split}")
@@ -184,7 +184,7 @@ class BaseModel(FeatureSet, nn.Module):
             arr = np.asarray(value)
             train_split[key] = arr[train_indices]
             valid_split[key] = arr[valid_indices]
-        train_loader = self.prepare_data_loader(train_split, batch_size=batch_size, shuffle=shuffle)
+        train_loader = self.prepare_data_loader(train_split, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
         logging.info(f"Split data: {len(train_indices)} training samples, {len(valid_indices)} validation samples")
         return train_loader, valid_split
 
@@ -265,14 +265,14 @@ class BaseModel(FeatureSet, nn.Module):
             task_losses.append(task_loss)
         return torch.stack(task_losses).sum()
 
-    def prepare_data_loader(self, data: dict | pd.DataFrame | DataLoader, batch_size: int = 32, shuffle: bool = True,):
+    def prepare_data_loader(self, data: dict | pd.DataFrame | DataLoader, batch_size: int = 32, shuffle: bool = True, num_workers: int = 0,) -> DataLoader:
         if isinstance(data, DataLoader):
             return data
         tensors = build_tensors_from_data(data=data, raw_data=data, features=self.all_features, target_columns=self.target_columns, id_columns=self.id_columns,)
         if tensors is None:
             raise ValueError("[BaseModel-prepare_data_loader Error] No data available to create DataLoader.")
         dataset = TensorDictDataset(tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
+        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, num_workers=num_workers)
 
     def fit(self,
             train_data: dict | pd.DataFrame | DataLoader,
@@ -281,6 +281,7 @@ class BaseModel(FeatureSet, nn.Module):
             epochs:int=1, shuffle:bool=True, batch_size:int=32,
             user_id_column: str | None = None,
             validation_split: float | None = None,
+            num_workers: int = 0,
             tensorboard: bool = True,):
         self.to(self.device)
         if not self.logger_initialized:
@@ -297,11 +298,11 @@ class BaseModel(FeatureSet, nn.Module):
         self.best_metric = float('-inf') if self.best_metrics_mode == 'max' else float('inf')
 
         if validation_split is not None and valid_data is None:
-            train_loader, valid_data = self.handle_validation_split(train_data=train_data, validation_split=validation_split, batch_size=batch_size, shuffle=shuffle,) # type: ignore
+            train_loader, valid_data = self.handle_validation_split(train_data=train_data, validation_split=validation_split, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers) # type: ignore
         else:
-            train_loader = (train_data if isinstance(train_data, DataLoader) else self.prepare_data_loader(train_data, batch_size=batch_size, shuffle=shuffle))
+            train_loader = (train_data if isinstance(train_data, DataLoader) else self.prepare_data_loader(train_data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers))
 
-        valid_loader, valid_user_ids = self.prepare_validation_data(valid_data=valid_data, batch_size=batch_size, needs_user_ids=self.needs_user_ids, user_id_column=user_id_column)
+        valid_loader, valid_user_ids = self.prepare_validation_data(valid_data=valid_data, batch_size=batch_size, needs_user_ids=self.needs_user_ids, user_id_column=user_id_column, num_workers=num_workers)
         try:
             self.steps_per_epoch = len(train_loader)
             is_streaming = False
@@ -388,7 +389,7 @@ class BaseModel(FeatureSet, nn.Module):
             self.training_logger.log_metrics(train_log_payload, step=epoch + 1, split="train")
             if valid_loader is not None:
                 # pass user_ids only if needed for GAUC metric
-                val_metrics = self.evaluate(valid_loader, user_ids=valid_user_ids if self.needs_user_ids else None) # {'auc': 0.75, 'logloss': 0.45} or {'auc_target1': 0.75, 'logloss_target1': 0.45, 'mse_target2': 3.2}
+                val_metrics = self.evaluate(valid_loader, user_ids=valid_user_ids if self.needs_user_ids else None, num_workers=num_workers) # {'auc': 0.75, 'logloss': 0.45} or {'auc_target1': 0.75, 'logloss_target1': 0.45, 'mse_target2': 3.2}
                 if self.nums_task == 1:
                     metrics_str = ", ".join([f"{k}={v:.4f}" for k, v in val_metrics.items()])
                     logging.info(colorize(f" Epoch {epoch + 1}/{epochs} - Valid: {metrics_str}", color="cyan"))
@@ -513,12 +514,12 @@ class BaseModel(FeatureSet, nn.Module):
             return avg_loss, metrics_dict
         return avg_loss
 
-    def prepare_validation_data(self, valid_data: dict | pd.DataFrame | DataLoader | None, batch_size: int, needs_user_ids: bool, user_id_column: str | None = 'user_id') -> tuple[DataLoader | None, np.ndarray | None]:
+    def prepare_validation_data(self, valid_data: dict | pd.DataFrame | DataLoader | None, batch_size: int, needs_user_ids: bool, user_id_column: str | None = 'user_id', num_workers: int = 0,) -> tuple[DataLoader | None, np.ndarray | None]:
         if valid_data is None:
             return None, None
         if isinstance(valid_data, DataLoader):
             return valid_data, None
-        valid_loader = self.prepare_data_loader(valid_data, batch_size=batch_size, shuffle=False)
+        valid_loader = self.prepare_data_loader(valid_data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
         valid_user_ids = None
         if needs_user_ids:
             if user_id_column is None:
@@ -531,7 +532,8 @@ class BaseModel(FeatureSet, nn.Module):
                  metrics: list[str] | dict[str, list[str]] | None = None,
                  batch_size: int = 32,
                  user_ids: np.ndarray | None = None,
-                 user_id_column: str = 'user_id'
+                 user_id_column: str = 'user_id',
+                 num_workers: int = 0,) -> dict:
         self.eval()
         eval_metrics = metrics if metrics is not None else self.metrics
         if eval_metrics is None:
@@ -543,7 +545,7 @@ class BaseModel(FeatureSet, nn.Module):
         else:
             if user_ids is None and needs_user_ids:
                 user_ids = get_user_ids(data=data, id_columns=user_id_column)
-            data_loader = self.prepare_data_loader(data, batch_size=batch_size, shuffle=False)
+            data_loader = self.prepare_data_loader(data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
         y_true_list = []
         y_pred_list = []
         collected_user_ids = []
@@ -603,6 +605,7 @@ class BaseModel(FeatureSet, nn.Module):
                 include_ids: bool | None = None,
                 return_dataframe: bool = True,
                 streaming_chunk_size: int = 10000,
+                num_workers: int = 0,
                 ) -> pd.DataFrame | np.ndarray:
         self.eval()
         if include_ids is None:
@@ -615,7 +618,7 @@ class BaseModel(FeatureSet, nn.Module):
             rec_loader = RecDataLoader(dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target=self.target_columns, id_columns=self.id_columns,)
             data_loader = rec_loader.create_dataloader(data=data, batch_size=batch_size, shuffle=False, load_full=False, chunk_size=streaming_chunk_size,)
         elif not isinstance(data, DataLoader):
-            data_loader = self.prepare_data_loader(data, batch_size=batch_size, shuffle=False,)
+            data_loader = self.prepare_data_loader(data, batch_size=batch_size, shuffle=False, num_workers=num_workers)
         else:
             data_loader = data
 
```
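The thread running through all of the model.py hunks is a new `num_workers` parameter, forwarded from the public entry points (`fit`, `evaluate`, `predict`) through `prepare_data_loader` and `prepare_validation_data` to every `DataLoader` the model builds; the default of 0 keeps the previous single-process loading. A minimal usage sketch under that reading; the features and column names are illustrative, borrowed from the quick-start docstring removed elsewhere in this diff, and may not match any bundled dataset:

```python
import pandas as pd

from nextrec.basic.features import DenseFeature, SparseFeature
from nextrec.models.ranking.deepfm import DeepFM

# Illustrative features and columns (assumptions, not part of this diff).
df_train = pd.read_csv("dataset/ranking_task.csv")
dense_features = [DenseFeature("age")]
sparse_features = [SparseFeature("category", vocab_size=100, embedding_dim=16)]

model = DeepFM(dense_features=dense_features, sparse_features=sparse_features, targets=["label"])

# New in 0.3.6: num_workers reaches the internal DataLoaders, enabling
# multi-process data loading; num_workers=0 preserves the old behavior.
model.fit(train_data=df_train, validation_split=0.1, batch_size=256, num_workers=4)
predictions = model.predict(df_train, batch_size=512, num_workers=4)
```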
{nextrec-0.3.5 → nextrec-0.3.6}/nextrec/basic/session.py +4 -2

```diff
@@ -22,6 +22,7 @@ class Session:
 
     experiment_id: str
     root: Path
+    log_basename: str  # The base name for log files, without path separators
 
     @property
     def logs_dir(self) -> Path:
@@ -60,7 +61,6 @@ class Session:
         return path
 
 def create_session(experiment_id: str | Path | None = None) -> Session:
-    """Create a :class:`Session` instance with prepared directories."""
 
     if experiment_id is not None and str(experiment_id).strip():
         exp_id = str(experiment_id).strip()
@@ -68,6 +68,8 @@ def create_session(experiment_id: str | Path | None = None) -> Session:
         # Use local time for session naming
         exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
 
+    log_basename = Path(exp_id).name if exp_id else exp_id
+
     if (
         os.getenv("PYTEST_CURRENT_TEST")
         or os.getenv("PYTEST_RUNNING")
@@ -82,7 +84,7 @@ def create_session(experiment_id: str | Path | None = None) -> Session:
     session_path.mkdir(parents=True, exist_ok=True)
     root = session_path.resolve()
 
-    return Session(experiment_id=exp_id, root=root)
+    return Session(experiment_id=exp_id, root=root, log_basename=log_basename)
 
 def resolve_save_path(
     path: str | os.PathLike | Path | None,
```
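Read together with the loggers.py hunk above, the new `log_basename` field guards against an `experiment_id` containing path separators: `Path(exp_id).name` keeps only the final component, so the log file name composed in `setup_logger` always lands directly inside `logs_dir`. A small sketch of that behavior (the directory name below is an assumption):

```python
from pathlib import Path

exp_id = "experiments/run_01"  # an experiment_id containing a path separator
log_basename = Path(exp_id).name if exp_id else exp_id
assert log_basename == "run_01"

# loggers.py now interpolates log_basename instead of the raw id
# (presumably session.experiment_id before this change), so the log
# file cannot stray into a nested or invalid path:
log_file = Path("logs") / f"{log_basename}.log"  # logs/run_01.log
```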
{nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/__init__.py +0 -25

```diff
@@ -1,22 +1,4 @@
-"""
-Data utilities package for NextRec
-
-This package provides data processing and manipulation utilities organized by category:
-- batch_utils: Batch collation and processing
-- data_processing: Data manipulation and user ID extraction
-- data_utils: Legacy module (re-exports from specialized modules)
-- dataloader: Dataset and DataLoader implementations
-- preprocessor: Data preprocessing pipeline
-
-Date: create on 13/11/2025
-Last update: 03/12/2025 (refactored)
-Author: Yang Zhou, zyaztec@gmail.com
-"""
-
-# Batch utilities
 from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
-
-# Data processing utilities
 from nextrec.data.data_processing import (
     get_column_data,
     split_dict_random,
@@ -24,7 +6,6 @@ from nextrec.data.data_processing import (
     get_user_ids,
 )
 
-# File utilities (from utils package)
 from nextrec.utils.file import (
     resolve_file_paths,
     iter_file_chunks,
@@ -33,7 +14,6 @@ from nextrec.utils.file import (
     default_output_dir,
 )
 
-# DataLoader components
 from nextrec.data.dataloader import (
     TensorDictDataset,
     FileDataset,
@@ -41,13 +21,8 @@ from nextrec.data.dataloader import (
     build_tensors_from_data,
 )
 
-# Preprocessor
 from nextrec.data.preprocessor import DataProcessor
-
-# Feature definitions
 from nextrec.basic.features import FeatureSet
-
-# Legacy module (for backward compatibility)
 from nextrec.data import data_utils
 
 __all__ = [
```
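Despite the -25 count, the data/__init__.py change looks purely cosmetic: the module docstring and section comments are dropped while every re-export survives, so the public import surface should be unchanged. A quick sanity check using only names visible in the hunks above:

```python
# These imports resolve identically in 0.3.5 and 0.3.6, since the
# refactor removed only comments and the module docstring.
from nextrec.data import (
    collate_fn,         # batch collation, re-exported from batch_utils
    get_user_ids,       # user ID extraction, from data_processing
    TensorDictDataset,  # in-memory dataset, from dataloader
    DataProcessor,      # preprocessing pipeline, from preprocessor
    data_utils,         # legacy module kept for backward compatibility
)
```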
{nextrec-0.3.5 → nextrec-0.3.6}/nextrec/data/dataloader.py +15 -10

```diff
@@ -126,20 +126,22 @@ class RecDataLoader(FeatureSet):
                           batch_size: int = 32,
                           shuffle: bool = True,
                           load_full: bool = True,
-                          chunk_size: int = 10000
+                          chunk_size: int = 10000,
+                          num_workers: int = 0) -> DataLoader:
         if isinstance(data, DataLoader):
             return data
         elif isinstance(data, (str, os.PathLike)):
-            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size)
+            return self.create_from_path(path=data, batch_size=batch_size, shuffle=shuffle, load_full=load_full, chunk_size=chunk_size, num_workers=num_workers)
         elif isinstance(data, (dict, pd.DataFrame)):
-            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle)
+            return self.create_from_memory(data=data, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
         else:
             raise ValueError(f"[RecDataLoader Error] Unsupported data type: {type(data)}")
 
     def create_from_memory(self,
                            data: dict | pd.DataFrame,
                            batch_size: int,
-                           shuffle: bool
+                           shuffle: bool,
+                           num_workers: int = 0) -> DataLoader:
         raw_data = data
 
         if self.processor is not None:
@@ -150,14 +152,15 @@ class RecDataLoader(FeatureSet):
         if tensors is None:
             raise ValueError("[RecDataLoader Error] No valid tensors could be built from the provided data.")
         dataset = TensorDictDataset(tensors)
-        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
+        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn, num_workers=num_workers)
 
     def create_from_path(self,
                          path: str,
                          batch_size: int,
                          shuffle: bool,
                          load_full: bool,
-                         chunk_size: int = 10000
+                         chunk_size: int = 10000,
+                         num_workers: int = 0) -> DataLoader:
         file_paths, file_type = resolve_file_paths(str(Path(path)))
         # Load full data into memory
         if load_full:
@@ -169,6 +172,7 @@ class RecDataLoader(FeatureSet):
                 except OSError:
                     pass
                 try:
+                    df = read_table(file_path, file_type=file_type)
                     dfs.append(df)
                 except MemoryError as exc:
                     raise MemoryError(f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using load_full=False with streaming.") from exc
@@ -176,22 +180,23 @@ class RecDataLoader(FeatureSet):
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
                 raise MemoryError(f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use load_full=False to stream or reduce chunk_size.") from exc
-            return self.create_from_memory(combined_df, batch_size, shuffle,)
+            return self.create_from_memory(combined_df, batch_size, shuffle, num_workers=num_workers)
         else:
-            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle)
+            return self.load_files_streaming(file_paths, file_type, batch_size, chunk_size, shuffle, num_workers=num_workers)
 
     def load_files_streaming(self,
                              file_paths: list[str],
                              file_type: str,
                              batch_size: int,
                              chunk_size: int,
-                             shuffle: bool
+                             shuffle: bool,
+                             num_workers: int = 0) -> DataLoader:
         if shuffle:
             logging.info("[RecDataLoader Info] Shuffle is ignored in streaming mode (IterableDataset).")
         if batch_size != 1:
             logging.info("[RecDataLoader Info] Streaming mode enforces batch_size=1; tune chunk_size to control memory/throughput.")
         dataset = FileDataset(file_paths=file_paths, dense_features=self.dense_features, sparse_features=self.sparse_features, sequence_features=self.sequence_features, target_columns=self.target_columns, id_columns=self.id_columns, chunk_size=chunk_size, file_type=file_type, processor=self.processor)
-        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn)
+        return DataLoader(dataset, batch_size=1, collate_fn=collate_fn, num_workers=num_workers)
 
 def normalize_sequence_column(column, feature: SequenceFeature) -> np.ndarray:
     if isinstance(column, pd.Series):
```
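RecDataLoader gets the same plumbing: `create_dataloader` accepts `num_workers` and forwards it to the in-memory path (`create_from_memory`) and to the file-based paths (`create_from_path` and `load_files_streaming`), where streaming still forces `batch_size=1` and ignores `shuffle`. A usage sketch under that reading; the constructor arguments mirror the call visible in `BaseModel.predict` above, while the concrete features and file path are assumptions:

```python
from nextrec.basic.features import DenseFeature, SparseFeature
from nextrec.data.dataloader import RecDataLoader

# Feature and column definitions here are illustrative assumptions.
rec_loader = RecDataLoader(
    dense_features=[DenseFeature("age")],
    sparse_features=[SparseFeature("category", vocab_size=100, embedding_dim=16)],
    sequence_features=[],
    target=["label"],
    id_columns=["user_id"],
)

# Full load: one shuffled DataLoader backed by 4 worker processes.
loader = rec_loader.create_dataloader(data="dataset/ranking_task.csv", batch_size=256, shuffle=True, load_full=True, num_workers=4)

# Streaming: chunked reads via FileDataset; batch_size is forced to 1
# and shuffle is ignored, while num_workers is still passed through.
stream = rec_loader.create_dataloader(data="dataset/ranking_task.csv", batch_size=1, shuffle=False, load_full=False, chunk_size=10000, num_workers=2)
```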
nextrec-0.3.5/nextrec/__init__.py +0 -41

```diff
@@ -1,41 +0,0 @@
-"""
-NextRec - A Unified Deep Learning Framework for Recommender Systems
-===================================================================
-
-NextRec provides a comprehensive suite of recommendation models including:
-- Ranking models (CTR prediction)
-- Matching models (retrieval)
-- Multi-task learning models
-- Generative recommendation models
-
-Quick Start
------------
->>> from nextrec.basic.features import DenseFeature, SparseFeature
->>> from nextrec.models.ranking.deepfm import DeepFM
->>>
->>> # Define features
->>> dense_features = [DenseFeature('age')]
->>> sparse_features = [SparseFeature('category', vocab_size=100, embedding_dim=16)]
->>>
->>> # Build model
->>> model = DeepFM(
-...     dense_features=dense_features,
-...     sparse_features=sparse_features,
-...     targets=['label']
-... )
->>>
->>> # Train model
->>> model.fit(train_data=df_train, valid_data=df_valid)
-"""
-
-from nextrec.__version__ import __version__
-
-__all__ = [
-    '__version__',
-]
-
-# Package metadata
-__author__ = "zerolovesea"
-__email__ = "zyaztec@gmail.com"
-__license__ = "Apache 2.0"
-__url__ = "https://github.com/zerolovesea/NextRec"
```
nextrec-0.3.5/nextrec/__version__.py +0 -1

```diff
@@ -1 +0,0 @@
-__version__ = "0.3.5"
```
nextrec-0.3.5/nextrec/models/ranking/__init__.py +0 -27

```diff
@@ -1,27 +0,0 @@
-from .fm import FM
-from .afm import AFM
-from .masknet import MaskNet
-from .pnn import PNN
-from .deepfm import DeepFM
-from .autoint import AutoInt
-from .widedeep import WideDeep
-from .xdeepfm import xDeepFM
-from .dcn import DCN
-from .fibinet import FiBiNET
-from .din import DIN
-from .dien import DIEN
-
-__all__ = [
-    'DeepFM',
-    'AutoInt',
-    'WideDeep',
-    'xDeepFM',
-    'DCN',
-    'DIN',
-    'DIEN',
-    'FM',
-    'AFM',
-    'MaskNet',
-    'PNN',
-    'FiBiNET',
-]
```