nextrec 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__init__.py +1 -1
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +10 -5
- nextrec/basic/callback.py +1 -0
- nextrec/basic/features.py +30 -22
- nextrec/basic/layers.py +220 -106
- nextrec/basic/loggers.py +62 -43
- nextrec/basic/metrics.py +268 -119
- nextrec/basic/model.py +1082 -400
- nextrec/basic/session.py +10 -3
- nextrec/cli.py +498 -0
- nextrec/data/__init__.py +19 -25
- nextrec/data/batch_utils.py +11 -3
- nextrec/data/data_processing.py +51 -45
- nextrec/data/data_utils.py +26 -15
- nextrec/data/dataloader.py +272 -95
- nextrec/data/preprocessor.py +320 -199
- nextrec/loss/listwise.py +17 -9
- nextrec/loss/loss_utils.py +7 -8
- nextrec/loss/pairwise.py +2 -0
- nextrec/loss/pointwise.py +30 -12
- nextrec/models/generative/hstu.py +103 -38
- nextrec/models/match/dssm.py +82 -68
- nextrec/models/match/dssm_v2.py +72 -57
- nextrec/models/match/mind.py +175 -107
- nextrec/models/match/sdm.py +104 -87
- nextrec/models/match/youtube_dnn.py +73 -59
- nextrec/models/multi_task/esmm.py +53 -37
- nextrec/models/multi_task/mmoe.py +64 -45
- nextrec/models/multi_task/ple.py +101 -48
- nextrec/models/multi_task/poso.py +113 -36
- nextrec/models/multi_task/share_bottom.py +48 -35
- nextrec/models/ranking/afm.py +72 -37
- nextrec/models/ranking/autoint.py +72 -55
- nextrec/models/ranking/dcn.py +55 -35
- nextrec/models/ranking/dcn_v2.py +64 -23
- nextrec/models/ranking/deepfm.py +32 -22
- nextrec/models/ranking/dien.py +155 -99
- nextrec/models/ranking/din.py +85 -57
- nextrec/models/ranking/fibinet.py +52 -32
- nextrec/models/ranking/fm.py +29 -23
- nextrec/models/ranking/masknet.py +91 -29
- nextrec/models/ranking/pnn.py +31 -28
- nextrec/models/ranking/widedeep.py +34 -26
- nextrec/models/ranking/xdeepfm.py +60 -38
- nextrec/utils/__init__.py +59 -34
- nextrec/utils/config.py +490 -0
- nextrec/utils/device.py +30 -20
- nextrec/utils/distributed.py +36 -9
- nextrec/utils/embedding.py +1 -0
- nextrec/utils/feature.py +1 -0
- nextrec/utils/file.py +32 -11
- nextrec/utils/initializer.py +61 -16
- nextrec/utils/optimizer.py +25 -9
- nextrec/utils/synthetic_data.py +283 -165
- nextrec/utils/tensor.py +24 -13
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/METADATA +4 -4
- nextrec-0.4.2.dist-info/RECORD +69 -0
- nextrec-0.4.2.dist-info/entry_points.txt +2 -0
- nextrec-0.4.1.dist-info/RECORD +0 -66
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
- {nextrec-0.4.1.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/utils/file.py
CHANGED
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
File I/O utilities for NextRec
|
|
3
3
|
|
|
4
4
|
Date: create on 03/12/2025
|
|
5
|
+
Checkpoint: edit on 06/12/2025
|
|
5
6
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
6
7
|
"""
|
|
7
8
|
|
|
9
|
+
import yaml
|
|
8
10
|
import pandas as pd
|
|
9
11
|
import pyarrow.parquet as pq
|
|
12
|
+
|
|
10
13
|
from pathlib import Path
|
|
11
14
|
from typing import Generator
|
|
12
15
|
|
|
@@ -14,7 +17,7 @@ from typing import Generator
|
|
|
14
17
|
def resolve_file_paths(path: str) -> tuple[list[str], str]:
|
|
15
18
|
"""
|
|
16
19
|
Resolve file or directory path into a sorted list of files and file type.
|
|
17
|
-
|
|
20
|
+
|
|
18
21
|
Args: path: Path to a file or directory
|
|
19
22
|
Returns: tuple: (list of file paths, file type)
|
|
20
23
|
"""
|
|
@@ -22,16 +25,23 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
|
|
|
22
25
|
|
|
23
26
|
if path_obj.is_file():
|
|
24
27
|
file_type = path_obj.suffix.lower().lstrip(".")
|
|
25
|
-
assert file_type in [
|
|
28
|
+
assert file_type in [
|
|
29
|
+
"csv",
|
|
30
|
+
"parquet",
|
|
31
|
+
], f"Unsupported file extension: {file_type}"
|
|
26
32
|
return [str(path_obj)], file_type
|
|
27
33
|
|
|
28
34
|
if path_obj.is_dir():
|
|
29
35
|
collected_files = [p for p in path_obj.iterdir() if p.is_file()]
|
|
30
36
|
csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
|
|
31
|
-
parquet_files = [
|
|
37
|
+
parquet_files = [
|
|
38
|
+
str(p) for p in collected_files if p.suffix.lower() == ".parquet"
|
|
39
|
+
]
|
|
32
40
|
|
|
33
41
|
if csv_files and parquet_files:
|
|
34
|
-
raise ValueError(
|
|
42
|
+
raise ValueError(
|
|
43
|
+
"Directory contains both CSV and Parquet files. Please keep a single format."
|
|
44
|
+
)
|
|
35
45
|
file_paths = csv_files if csv_files else parquet_files
|
|
36
46
|
if not file_paths:
|
|
37
47
|
raise ValueError(f"No CSV or Parquet files found in directory: {path}")
|
|
@@ -42,18 +52,24 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
|
|
|
42
52
|
raise ValueError(f"Invalid path: {path}")
|
|
43
53
|
|
|
44
54
|
|
|
45
|
-
def read_table(
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
55
|
+
def read_table(path: str | Path, data_format: str | None = None) -> pd.DataFrame:
|
|
56
|
+
data_path = Path(path)
|
|
57
|
+
fmt = data_format.lower() if data_format else data_path.suffix.lower().lstrip(".")
|
|
58
|
+
if data_path.is_dir() and not fmt:
|
|
59
|
+
fmt = "parquet"
|
|
60
|
+
if fmt in {"parquet", ""}:
|
|
61
|
+
return pd.read_parquet(data_path)
|
|
62
|
+
if fmt in {"csv", "txt"}:
|
|
63
|
+
return pd.read_csv(data_path)
|
|
64
|
+
raise ValueError(f"Unsupported data format: {data_path}")
|
|
65
|
+
|
|
49
66
|
|
|
50
67
|
def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
|
|
51
68
|
return [read_table(fp, file_type) for fp in file_paths]
|
|
52
69
|
|
|
70
|
+
|
|
53
71
|
def iter_file_chunks(
|
|
54
|
-
file_path: str,
|
|
55
|
-
file_type: str,
|
|
56
|
-
chunk_size: int
|
|
72
|
+
file_path: str, file_type: str, chunk_size: int
|
|
57
73
|
) -> Generator[pd.DataFrame, None, None]:
|
|
58
74
|
if file_type == "csv":
|
|
59
75
|
yield from pd.read_csv(file_path, chunksize=chunk_size)
|
|
@@ -68,3 +84,8 @@ def default_output_dir(path: str) -> Path:
|
|
|
68
84
|
if path_obj.is_file():
|
|
69
85
|
return path_obj.parent / f"{path_obj.stem}_preprocessed"
|
|
70
86
|
return path_obj.with_name(f"{path_obj.name}_preprocessed")
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def read_yaml(path: str | Path):
|
|
90
|
+
with open(path, "r", encoding="utf-8") as file:
|
|
91
|
+
return yaml.safe_load(file) or {}
|
nextrec/utils/initializer.py
CHANGED
|
@@ -5,32 +5,77 @@ Date: create on 13/11/2025
|
|
|
5
5
|
Author: Yang Zhou, zyaztec@gmail.com
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
from typing import Any, Dict, Set, cast
|
|
9
|
+
|
|
8
10
|
import torch.nn as nn
|
|
11
|
+
from torch.nn.init import _NonlinearityType
|
|
9
12
|
|
|
13
|
+
KNOWN_NONLINEARITIES: Set[str] = {
|
|
14
|
+
"linear",
|
|
15
|
+
"conv1d",
|
|
16
|
+
"conv2d",
|
|
17
|
+
"conv3d",
|
|
18
|
+
"conv_transpose1d",
|
|
19
|
+
"conv_transpose2d",
|
|
20
|
+
"conv_transpose3d",
|
|
21
|
+
"sigmoid",
|
|
22
|
+
"tanh",
|
|
23
|
+
"relu",
|
|
24
|
+
"leaky_relu",
|
|
25
|
+
"selu",
|
|
26
|
+
"gelu",
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def resolve_nonlinearity(activation: str | _NonlinearityType) -> _NonlinearityType:
|
|
31
|
+
if isinstance(activation, str):
|
|
32
|
+
if activation in KNOWN_NONLINEARITIES:
|
|
33
|
+
return cast(_NonlinearityType, activation)
|
|
34
|
+
# Fall back to linear for custom activations (gain handled separately).
|
|
35
|
+
return "linear"
|
|
36
|
+
return activation
|
|
10
37
|
|
|
11
|
-
def get_initializer(init_type='normal', activation='linear', param=None):
|
|
12
|
-
param = param or {}
|
|
13
38
|
|
|
39
|
+
def resolve_gain(activation: str | _NonlinearityType, param: Dict[str, Any]) -> float:
|
|
40
|
+
if "gain" in param:
|
|
41
|
+
return param["gain"]
|
|
42
|
+
nonlinearity = resolve_nonlinearity(activation)
|
|
14
43
|
try:
|
|
15
|
-
|
|
44
|
+
return nn.init.calculate_gain(nonlinearity, param.get("param"))
|
|
16
45
|
except ValueError:
|
|
17
|
-
|
|
18
|
-
|
|
46
|
+
return 1.0 # custom activation with no gain estimate available
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def get_initializer(
|
|
50
|
+
init_type: str = "normal",
|
|
51
|
+
activation: str | _NonlinearityType = "linear",
|
|
52
|
+
param: Dict[str, Any] | None = None,
|
|
53
|
+
):
|
|
54
|
+
param = param or {}
|
|
55
|
+
nonlinearity = resolve_nonlinearity(activation)
|
|
56
|
+
gain = resolve_gain(activation, param)
|
|
57
|
+
|
|
19
58
|
def initializer_fn(tensor):
|
|
20
|
-
if init_type ==
|
|
59
|
+
if init_type == "xavier_uniform":
|
|
21
60
|
nn.init.xavier_uniform_(tensor, gain=gain)
|
|
22
|
-
elif init_type ==
|
|
61
|
+
elif init_type == "xavier_normal":
|
|
23
62
|
nn.init.xavier_normal_(tensor, gain=gain)
|
|
24
|
-
elif init_type ==
|
|
25
|
-
nn.init.kaiming_uniform_(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
elif init_type ==
|
|
63
|
+
elif init_type == "kaiming_uniform":
|
|
64
|
+
nn.init.kaiming_uniform_(
|
|
65
|
+
tensor, a=param.get("a", 0), nonlinearity=nonlinearity
|
|
66
|
+
)
|
|
67
|
+
elif init_type == "kaiming_normal":
|
|
68
|
+
nn.init.kaiming_normal_(
|
|
69
|
+
tensor, a=param.get("a", 0), nonlinearity=nonlinearity
|
|
70
|
+
)
|
|
71
|
+
elif init_type == "orthogonal":
|
|
29
72
|
nn.init.orthogonal_(tensor, gain=gain)
|
|
30
|
-
elif init_type ==
|
|
31
|
-
nn.init.normal_(
|
|
32
|
-
|
|
33
|
-
|
|
73
|
+
elif init_type == "normal":
|
|
74
|
+
nn.init.normal_(
|
|
75
|
+
tensor, mean=param.get("mean", 0.0), std=param.get("std", 0.0001)
|
|
76
|
+
)
|
|
77
|
+
elif init_type == "uniform":
|
|
78
|
+
nn.init.uniform_(tensor, a=param.get("a", -0.05), b=param.get("b", 0.05))
|
|
34
79
|
else:
|
|
35
80
|
raise ValueError(f"Unknown init_type: {init_type}")
|
|
36
81
|
return tensor
|
nextrec/utils/optimizer.py
CHANGED
|
@@ -8,16 +8,17 @@ Author: Yang Zhou, zyaztec@gmail.com
|
|
|
8
8
|
import torch
|
|
9
9
|
from typing import Iterable
|
|
10
10
|
|
|
11
|
+
|
|
11
12
|
def get_optimizer(
|
|
12
13
|
optimizer: str | torch.optim.Optimizer = "adam",
|
|
13
14
|
params: Iterable[torch.nn.Parameter] | None = None,
|
|
14
|
-
**optimizer_params
|
|
15
|
+
**optimizer_params,
|
|
15
16
|
):
|
|
16
17
|
if params is None:
|
|
17
18
|
raise ValueError("params cannot be None. Please provide model parameters.")
|
|
18
19
|
|
|
19
|
-
if
|
|
20
|
-
optimizer_params[
|
|
20
|
+
if "lr" not in optimizer_params:
|
|
21
|
+
optimizer_params["lr"] = 1e-3
|
|
21
22
|
if isinstance(optimizer, str):
|
|
22
23
|
opt_name = optimizer.lower()
|
|
23
24
|
if opt_name == "adam":
|
|
@@ -39,21 +40,36 @@ def get_optimizer(
|
|
|
39
40
|
raise TypeError(f"Invalid optimizer type: {type(optimizer)}")
|
|
40
41
|
return optimizer_fn
|
|
41
42
|
|
|
43
|
+
|
|
42
44
|
def get_scheduler(
|
|
43
|
-
scheduler:
|
|
45
|
+
scheduler: (
|
|
46
|
+
str
|
|
47
|
+
| torch.optim.lr_scheduler._LRScheduler
|
|
48
|
+
| torch.optim.lr_scheduler.LRScheduler
|
|
49
|
+
| type[torch.optim.lr_scheduler._LRScheduler]
|
|
50
|
+
| type[torch.optim.lr_scheduler.LRScheduler]
|
|
51
|
+
| None
|
|
52
|
+
),
|
|
44
53
|
optimizer,
|
|
45
|
-
**scheduler_params
|
|
54
|
+
**scheduler_params,
|
|
46
55
|
):
|
|
47
56
|
if isinstance(scheduler, str):
|
|
48
57
|
if scheduler == "step":
|
|
49
|
-
scheduler_fn = torch.optim.lr_scheduler.StepLR(
|
|
58
|
+
scheduler_fn = torch.optim.lr_scheduler.StepLR(
|
|
59
|
+
optimizer, **scheduler_params
|
|
60
|
+
)
|
|
50
61
|
elif scheduler == "cosine":
|
|
51
|
-
scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(
|
|
62
|
+
scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(
|
|
63
|
+
optimizer, **scheduler_params
|
|
64
|
+
)
|
|
52
65
|
else:
|
|
53
66
|
raise NotImplementedError(f"Unsupported scheduler: {scheduler}")
|
|
54
|
-
elif isinstance(
|
|
67
|
+
elif isinstance(
|
|
68
|
+
scheduler,
|
|
69
|
+
(torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.LRScheduler),
|
|
70
|
+
):
|
|
55
71
|
scheduler_fn = scheduler
|
|
56
72
|
else:
|
|
57
73
|
raise TypeError(f"Invalid scheduler type: {type(scheduler)}")
|
|
58
|
-
|
|
74
|
+
|
|
59
75
|
return scheduler_fn
|