nextrec 0.3.6-py3-none-any.whl → 0.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. nextrec/__init__.py +1 -1
  2. nextrec/__version__.py +1 -1
  3. nextrec/basic/activation.py +10 -5
  4. nextrec/basic/callback.py +1 -0
  5. nextrec/basic/features.py +30 -22
  6. nextrec/basic/layers.py +244 -113
  7. nextrec/basic/loggers.py +62 -43
  8. nextrec/basic/metrics.py +268 -119
  9. nextrec/basic/model.py +1373 -443
  10. nextrec/basic/session.py +10 -3
  11. nextrec/cli.py +498 -0
  12. nextrec/data/__init__.py +19 -25
  13. nextrec/data/batch_utils.py +11 -3
  14. nextrec/data/data_processing.py +42 -24
  15. nextrec/data/data_utils.py +26 -15
  16. nextrec/data/dataloader.py +303 -96
  17. nextrec/data/preprocessor.py +320 -199
  18. nextrec/loss/listwise.py +17 -9
  19. nextrec/loss/loss_utils.py +7 -8
  20. nextrec/loss/pairwise.py +2 -0
  21. nextrec/loss/pointwise.py +30 -12
  22. nextrec/models/generative/hstu.py +106 -40
  23. nextrec/models/match/dssm.py +82 -69
  24. nextrec/models/match/dssm_v2.py +72 -58
  25. nextrec/models/match/mind.py +175 -108
  26. nextrec/models/match/sdm.py +104 -88
  27. nextrec/models/match/youtube_dnn.py +73 -60
  28. nextrec/models/multi_task/esmm.py +53 -39
  29. nextrec/models/multi_task/mmoe.py +70 -47
  30. nextrec/models/multi_task/ple.py +107 -50
  31. nextrec/models/multi_task/poso.py +121 -41
  32. nextrec/models/multi_task/share_bottom.py +54 -38
  33. nextrec/models/ranking/afm.py +172 -45
  34. nextrec/models/ranking/autoint.py +84 -61
  35. nextrec/models/ranking/dcn.py +59 -42
  36. nextrec/models/ranking/dcn_v2.py +64 -23
  37. nextrec/models/ranking/deepfm.py +36 -26
  38. nextrec/models/ranking/dien.py +158 -102
  39. nextrec/models/ranking/din.py +88 -60
  40. nextrec/models/ranking/fibinet.py +55 -35
  41. nextrec/models/ranking/fm.py +32 -26
  42. nextrec/models/ranking/masknet.py +95 -34
  43. nextrec/models/ranking/pnn.py +34 -31
  44. nextrec/models/ranking/widedeep.py +37 -29
  45. nextrec/models/ranking/xdeepfm.py +63 -41
  46. nextrec/utils/__init__.py +61 -32
  47. nextrec/utils/config.py +490 -0
  48. nextrec/utils/device.py +52 -12
  49. nextrec/utils/distributed.py +141 -0
  50. nextrec/utils/embedding.py +1 -0
  51. nextrec/utils/feature.py +1 -0
  52. nextrec/utils/file.py +32 -11
  53. nextrec/utils/initializer.py +61 -16
  54. nextrec/utils/optimizer.py +25 -9
  55. nextrec/utils/synthetic_data.py +531 -0
  56. nextrec/utils/tensor.py +24 -13
  57. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/METADATA +15 -5
  58. nextrec-0.4.2.dist-info/RECORD +69 -0
  59. nextrec-0.4.2.dist-info/entry_points.txt +2 -0
  60. nextrec-0.3.6.dist-info/RECORD +0 -64
  61. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/WHEEL +0 -0
  62. {nextrec-0.3.6.dist-info → nextrec-0.4.2.dist-info}/licenses/LICENSE +0 -0
nextrec/utils/distributed.py ADDED
@@ -0,0 +1,141 @@
+ """
+ Distributed utilities for NextRec.
+
+ Date: create on 04/12/2025
+ Checkpoint: edit on 05/12/2025
+ Author: Yang Zhou,zyaztec@gmail.com
+ """
+
+ import logging
+ import numpy as np
+ import torch
+ import torch.distributed as dist
+
+ from torch.utils.data import DataLoader, IterableDataset
+ from torch.utils.data.distributed import DistributedSampler
+ from nextrec.basic.loggers import colorize
+
+
+ def init_process_group(
+     distributed: bool, rank: int, world_size: int, device_id: int | None = None
+ ) -> None:
+     """
+     initialize distributed process group for multi-GPU training.
+
+     Args:
+         distributed: whether to enable distributed training
+         rank: global rank of the current process
+         world_size: total number of processes
+     """
+     if (not distributed) or (not dist.is_available()) or dist.is_initialized():
+         return
+     backend = "nccl" if device_id is not None else "gloo"
+     if backend == "nccl":
+         torch.cuda.set_device(device_id)
+     dist.init_process_group(
+         backend=backend, init_method="env://", rank=rank, world_size=world_size
+     )
+
+
+ def gather_numpy(self, array: np.ndarray | None) -> np.ndarray | None:
+     """
+     Gather numpy arrays (or None) across ranks. Uses all_gather_object to avoid
+     shape mismatches and ensures every rank participates even when local data is empty.
+     """
+     if not (self.distributed and dist.is_available() and dist.is_initialized()):
+         return array
+
+     world_size = dist.get_world_size()
+     gathered: list[np.ndarray | None] = [None for _ in range(world_size)]
+     dist.all_gather_object(gathered, array)
+     pieces: list[np.ndarray] = []
+     for item in gathered:
+         if item is None:
+             continue
+         item_np = np.asarray(item)
+         if item_np.size > 0:
+             pieces.append(item_np)
+     if not pieces:
+         return None
+     return np.concatenate(pieces, axis=0)
+
+
+ def add_distributed_sampler(
+     loader: DataLoader,
+     distributed: bool,
+     world_size: int,
+     rank: int,
+     shuffle: bool,
+     drop_last: bool,
+     default_batch_size: int,
+     is_main_process: bool = False,
+ ) -> tuple[DataLoader, DistributedSampler | None]:
+     """
+     add distributedsampler to a dataloader, this for distributed training
+     when each device has its own dataloader
+     """
+     # early return if not distributed
+     if not (distributed and dist.is_available() and dist.is_initialized()):
+         return loader, None
+     # return if already has DistributedSampler
+     if isinstance(loader.sampler, DistributedSampler):
+         return loader, loader.sampler
+     dataset = getattr(loader, "dataset", None)
+     if dataset is None:
+         return loader, None
+     if isinstance(dataset, IterableDataset):
+         if is_main_process:
+             logging.info(
+                 colorize(
+                     "[Distributed Info] Iterable/streaming DataLoader provided; DistributedSampler is skipped. Ensure dataset handles sharding per rank.",
+                     color="yellow",
+                 )
+             )
+         return loader, None
+     sampler = DistributedSampler(
+         dataset,
+         num_replicas=world_size,
+         rank=rank,
+         shuffle=shuffle,
+         drop_last=drop_last,
+     )
+     loader_kwargs = {
+         "batch_size": (
+             loader.batch_size if loader.batch_size is not None else default_batch_size
+         ),
+         "shuffle": False,
+         "sampler": sampler,
+         "num_workers": loader.num_workers,
+         "collate_fn": loader.collate_fn,
+         "drop_last": drop_last,
+     }
+     if getattr(loader, "pin_memory", False):
+         loader_kwargs["pin_memory"] = True
+         pin_memory_device = getattr(loader, "pin_memory_device", None)
+         if pin_memory_device:
+             loader_kwargs["pin_memory_device"] = pin_memory_device
+     timeout = getattr(loader, "timeout", None)
+     if timeout:
+         loader_kwargs["timeout"] = timeout
+     worker_init_fn = getattr(loader, "worker_init_fn", None)
+     if worker_init_fn is not None:
+         loader_kwargs["worker_init_fn"] = worker_init_fn
+     generator = getattr(loader, "generator", None)
+     if generator is not None:
+         loader_kwargs["generator"] = generator
+     if loader.num_workers > 0:
+         loader_kwargs["persistent_workers"] = getattr(
+             loader, "persistent_workers", False
+         )
+         prefetch_factor = getattr(loader, "prefetch_factor", None)
+         if prefetch_factor is not None:
+             loader_kwargs["prefetch_factor"] = prefetch_factor
+     distributed_loader = DataLoader(dataset, **loader_kwargs)
+     if is_main_process:
+         logging.info(
+             colorize(
+                 "[Distributed Info] Attached DistributedSampler to provided DataLoader",
+                 color="cyan",
+             )
+         )
+     return distributed_loader, sampler
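
A minimal usage sketch for the new module, assuming a torchrun-style launch; the environment-variable plumbing and the toy TensorDataset below are illustrative assumptions, not part of the package:

import os
import torch
from torch.utils.data import DataLoader, TensorDataset

from nextrec.utils.distributed import add_distributed_sampler, init_process_group

rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))
use_cuda = torch.cuda.is_available()

# NCCL is selected when a device id is passed, otherwise gloo.
init_process_group(
    distributed=world_size > 1,
    rank=rank,
    world_size=world_size,
    device_id=local_rank if use_cuda else None,
)

dataset = TensorDataset(torch.randn(1024, 8), torch.randint(0, 2, (1024,)))
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Re-wraps the loader with a DistributedSampler; shuffling moves to the sampler.
loader, sampler = add_distributed_sampler(
    loader,
    distributed=world_size > 1,
    world_size=world_size,
    rank=rank,
    shuffle=True,
    drop_last=False,
    default_batch_size=64,
    is_main_process=rank == 0,
)

for epoch in range(2):
    if sampler is not None:
        sampler.set_epoch(epoch)  # reshuffle consistently across ranks each epoch
    for features, labels in loader:
        pass  # training step goes here

In a single-process run the helpers are no-ops: init_process_group returns early and add_distributed_sampler hands back the original loader with sampler set to None.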
nextrec/utils/embedding.py CHANGED
@@ -2,6 +2,7 @@
  Embedding utilities for NextRec
 
  Date: create on 13/11/2025
+ Checkpoint: edit on 06/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
nextrec/utils/feature.py CHANGED
@@ -5,6 +5,7 @@ Date: create on 03/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+
  def normalize_to_list(value: str | list[str] | None) -> list[str]:
      if value is None:
          return []
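
For reference, the helper's contract in a tiny sketch; only the None case is visible in this hunk, so the string and list branches below are inferred from the signature and labeled as assumptions:

from nextrec.utils.feature import normalize_to_list

assert normalize_to_list(None) == []
assert normalize_to_list("user_id") == ["user_id"]  # assumed: single string wrapped in a list
assert normalize_to_list(["user_id", "item_id"]) == ["user_id", "item_id"]  # assumed: lists pass through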
nextrec/utils/file.py CHANGED
@@ -2,11 +2,14 @@
  File I/O utilities for NextRec
 
  Date: create on 03/12/2025
+ Checkpoint: edit on 06/12/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+ import yaml
  import pandas as pd
  import pyarrow.parquet as pq
+
  from pathlib import Path
  from typing import Generator
 
@@ -14,7 +17,7 @@ from typing import Generator
  def resolve_file_paths(path: str) -> tuple[list[str], str]:
      """
      Resolve file or directory path into a sorted list of files and file type.
-
+
      Args: path: Path to a file or directory
      Returns: tuple: (list of file paths, file type)
      """
@@ -22,16 +25,23 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
 
      if path_obj.is_file():
          file_type = path_obj.suffix.lower().lstrip(".")
-         assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+         assert file_type in [
+             "csv",
+             "parquet",
+         ], f"Unsupported file extension: {file_type}"
          return [str(path_obj)], file_type
 
      if path_obj.is_dir():
          collected_files = [p for p in path_obj.iterdir() if p.is_file()]
          csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
-         parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+         parquet_files = [
+             str(p) for p in collected_files if p.suffix.lower() == ".parquet"
+         ]
 
          if csv_files and parquet_files:
-             raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+             raise ValueError(
+                 "Directory contains both CSV and Parquet files. Please keep a single format."
+             )
          file_paths = csv_files if csv_files else parquet_files
          if not file_paths:
              raise ValueError(f"No CSV or Parquet files found in directory: {path}")
@@ -42,18 +52,24 @@ def resolve_file_paths(path: str) -> tuple[list[str], str]:
      raise ValueError(f"Invalid path: {path}")
 
 
- def read_table(file_path: str, file_type: str) -> pd.DataFrame:
-     if file_type == "csv":
-         return pd.read_csv(file_path)
-     return pd.read_parquet(file_path)
+ def read_table(path: str | Path, data_format: str | None = None) -> pd.DataFrame:
+     data_path = Path(path)
+     fmt = data_format.lower() if data_format else data_path.suffix.lower().lstrip(".")
+     if data_path.is_dir() and not fmt:
+         fmt = "parquet"
+     if fmt in {"parquet", ""}:
+         return pd.read_parquet(data_path)
+     if fmt in {"csv", "txt"}:
+         return pd.read_csv(data_path)
+     raise ValueError(f"Unsupported data format: {data_path}")
+
 
  def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
      return [read_table(fp, file_type) for fp in file_paths]
 
+
  def iter_file_chunks(
-     file_path: str,
-     file_type: str,
-     chunk_size: int
+     file_path: str, file_type: str, chunk_size: int
  ) -> Generator[pd.DataFrame, None, None]:
      if file_type == "csv":
          yield from pd.read_csv(file_path, chunksize=chunk_size)
@@ -68,3 +84,8 @@ def default_output_dir(path: str) -> Path:
      if path_obj.is_file():
          return path_obj.parent / f"{path_obj.stem}_preprocessed"
      return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
+ def read_yaml(path: str | Path):
+     with open(path, "r", encoding="utf-8") as file:
+         return yaml.safe_load(file) or {}
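
A short sketch of how the reworked I/O helpers fit together; the file and directory names are placeholders, not paths shipped with the package:

from nextrec.utils.file import read_table, read_yaml, resolve_file_paths

# Format is inferred from the suffix when data_format is omitted.
df = read_table("interactions.parquet")
df_csv = read_table("interactions.csv", data_format="csv")

# Directories must hold a single format; mixing CSV and Parquet raises ValueError.
paths, file_type = resolve_file_paths("data/train")
frames = [read_table(p, file_type) for p in paths]

config = read_yaml("config.yaml")  # returns {} for an empty file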
nextrec/utils/initializer.py CHANGED
@@ -5,32 +5,77 @@ Date: create on 13/11/2025
  Author: Yang Zhou, zyaztec@gmail.com
  """
 
+ from typing import Any, Dict, Set, cast
+
  import torch.nn as nn
+ from torch.nn.init import _NonlinearityType
 
+ KNOWN_NONLINEARITIES: Set[str] = {
+     "linear",
+     "conv1d",
+     "conv2d",
+     "conv3d",
+     "conv_transpose1d",
+     "conv_transpose2d",
+     "conv_transpose3d",
+     "sigmoid",
+     "tanh",
+     "relu",
+     "leaky_relu",
+     "selu",
+     "gelu",
+ }
+
+
+ def resolve_nonlinearity(activation: str | _NonlinearityType) -> _NonlinearityType:
+     if isinstance(activation, str):
+         if activation in KNOWN_NONLINEARITIES:
+             return cast(_NonlinearityType, activation)
+         # Fall back to linear for custom activations (gain handled separately).
+         return "linear"
+     return activation
 
- def get_initializer(init_type='normal', activation='linear', param=None):
-     param = param or {}
 
+ def resolve_gain(activation: str | _NonlinearityType, param: Dict[str, Any]) -> float:
+     if "gain" in param:
+         return param["gain"]
+     nonlinearity = resolve_nonlinearity(activation)
      try:
-         gain = param.get('gain', nn.init.calculate_gain(activation, param.get('param', None)))
+         return nn.init.calculate_gain(nonlinearity, param.get("param"))
      except ValueError:
-         gain = 1.0  # for custom activations like 'dice'
-
+         return 1.0  # custom activation with no gain estimate available
+
+
+ def get_initializer(
+     init_type: str = "normal",
+     activation: str | _NonlinearityType = "linear",
+     param: Dict[str, Any] | None = None,
+ ):
+     param = param or {}
+     nonlinearity = resolve_nonlinearity(activation)
+     gain = resolve_gain(activation, param)
+
      def initializer_fn(tensor):
-         if init_type == 'xavier_uniform':
+         if init_type == "xavier_uniform":
              nn.init.xavier_uniform_(tensor, gain=gain)
-         elif init_type == 'xavier_normal':
+         elif init_type == "xavier_normal":
              nn.init.xavier_normal_(tensor, gain=gain)
-         elif init_type == 'kaiming_uniform':
-             nn.init.kaiming_uniform_(tensor, a=param.get('a', 0), nonlinearity=activation)
-         elif init_type == 'kaiming_normal':
-             nn.init.kaiming_normal_(tensor, a=param.get('a', 0), nonlinearity=activation)
-         elif init_type == 'orthogonal':
+         elif init_type == "kaiming_uniform":
+             nn.init.kaiming_uniform_(
+                 tensor, a=param.get("a", 0), nonlinearity=nonlinearity
+             )
+         elif init_type == "kaiming_normal":
+             nn.init.kaiming_normal_(
+                 tensor, a=param.get("a", 0), nonlinearity=nonlinearity
+             )
+         elif init_type == "orthogonal":
              nn.init.orthogonal_(tensor, gain=gain)
-         elif init_type == 'normal':
-             nn.init.normal_(tensor, mean=param.get('mean', 0.0), std=param.get('std', 0.0001))
-         elif init_type == 'uniform':
-             nn.init.uniform_(tensor, a=param.get('a', -0.05), b=param.get('b', 0.05))
+         elif init_type == "normal":
+             nn.init.normal_(
+                 tensor, mean=param.get("mean", 0.0), std=param.get("std", 0.0001)
+             )
+         elif init_type == "uniform":
+             nn.init.uniform_(tensor, a=param.get("a", -0.05), b=param.get("b", 0.05))
          else:
              raise ValueError(f"Unknown init_type: {init_type}")
      return tensor
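
A hedged usage sketch of the refactored initializer helper. It assumes get_initializer still returns the inner initializer_fn (that return statement falls outside this hunk, as in the previous version), and the nn.Linear layer is a stand-in module:

import torch.nn as nn

from nextrec.utils.initializer import get_initializer

layer = nn.Linear(64, 32)

# Custom activations (e.g. "dice") now resolve to a known nonlinearity / gain 1.0
# instead of being passed straight to torch.
init_fn = get_initializer(init_type="kaiming_normal", activation="relu", param={"a": 0})
init_fn(layer.weight)

# Gain can be overridden explicitly via param["gain"].
xavier_fn = get_initializer(init_type="xavier_uniform", activation="tanh", param={"gain": 1.0})
xavier_fn(layer.weight)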
nextrec/utils/optimizer.py CHANGED
@@ -8,16 +8,17 @@ Author: Yang Zhou, zyaztec@gmail.com
  import torch
  from typing import Iterable
 
+
  def get_optimizer(
      optimizer: str | torch.optim.Optimizer = "adam",
      params: Iterable[torch.nn.Parameter] | None = None,
-     **optimizer_params
+     **optimizer_params,
  ):
      if params is None:
          raise ValueError("params cannot be None. Please provide model parameters.")
 
-     if 'lr' not in optimizer_params:
-         optimizer_params['lr'] = 1e-3
+     if "lr" not in optimizer_params:
+         optimizer_params["lr"] = 1e-3
      if isinstance(optimizer, str):
          opt_name = optimizer.lower()
          if opt_name == "adam":
@@ -39,21 +40,36 @@
          raise TypeError(f"Invalid optimizer type: {type(optimizer)}")
      return optimizer_fn
 
+
  def get_scheduler(
-     scheduler: str | torch.optim.lr_scheduler._LRScheduler | torch.optim.lr_scheduler.LRScheduler | type[torch.optim.lr_scheduler._LRScheduler] | type[torch.optim.lr_scheduler.LRScheduler] | None,
+     scheduler: (
+         str
+         | torch.optim.lr_scheduler._LRScheduler
+         | torch.optim.lr_scheduler.LRScheduler
+         | type[torch.optim.lr_scheduler._LRScheduler]
+         | type[torch.optim.lr_scheduler.LRScheduler]
+         | None
+     ),
      optimizer,
-     **scheduler_params
+     **scheduler_params,
  ):
      if isinstance(scheduler, str):
          if scheduler == "step":
-             scheduler_fn = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_params)
+             scheduler_fn = torch.optim.lr_scheduler.StepLR(
+                 optimizer, **scheduler_params
+             )
          elif scheduler == "cosine":
-             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, **scheduler_params)
+             scheduler_fn = torch.optim.lr_scheduler.CosineAnnealingLR(
+                 optimizer, **scheduler_params
+             )
          else:
              raise NotImplementedError(f"Unsupported scheduler: {scheduler}")
-     elif isinstance(scheduler, (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.LRScheduler)):
+     elif isinstance(
+         scheduler,
+         (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.LRScheduler),
+     ):
          scheduler_fn = scheduler
      else:
          raise TypeError(f"Invalid scheduler type: {type(scheduler)}")
-
+
      return scheduler_fn
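
A minimal sketch of the two helpers above; the nn.Linear model is a stand-in and the scheduler keyword arguments are the standard torch ones for the chosen scheduler:

import torch.nn as nn

from nextrec.utils.optimizer import get_optimizer, get_scheduler

model = nn.Linear(16, 1)

# lr defaults to 1e-3 when not supplied; extra kwargs pass through to the optimizer.
optimizer = get_optimizer("adam", params=model.parameters(), weight_decay=1e-5)

# "step" and "cosine" are the string options handled above.
scheduler = get_scheduler("cosine", optimizer, T_max=10)

for _ in range(10):
    optimizer.step()   # backward pass omitted in this sketch
    scheduler.step()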