nextrec 0.1.11-py3-none-any.whl → 0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/activation.py +1 -2
- nextrec/basic/callback.py +1 -2
- nextrec/basic/features.py +39 -8
- nextrec/basic/layers.py +3 -4
- nextrec/basic/loggers.py +15 -10
- nextrec/basic/metrics.py +1 -2
- nextrec/basic/model.py +160 -125
- nextrec/basic/session.py +150 -0
- nextrec/data/__init__.py +13 -2
- nextrec/data/data_utils.py +74 -22
- nextrec/data/dataloader.py +513 -0
- nextrec/data/preprocessor.py +494 -134
- nextrec/loss/__init__.py +31 -24
- nextrec/loss/listwise.py +164 -0
- nextrec/loss/loss_utils.py +133 -106
- nextrec/loss/pairwise.py +105 -0
- nextrec/loss/pointwise.py +198 -0
- nextrec/models/match/dssm.py +26 -17
- nextrec/models/match/dssm_v2.py +20 -2
- nextrec/models/match/mind.py +18 -3
- nextrec/models/match/sdm.py +17 -2
- nextrec/models/match/youtube_dnn.py +23 -10
- nextrec/models/multi_task/esmm.py +8 -8
- nextrec/models/multi_task/mmoe.py +8 -8
- nextrec/models/multi_task/ple.py +8 -8
- nextrec/models/multi_task/share_bottom.py +8 -8
- nextrec/models/ranking/__init__.py +8 -0
- nextrec/models/ranking/afm.py +5 -4
- nextrec/models/ranking/autoint.py +6 -4
- nextrec/models/ranking/dcn.py +6 -4
- nextrec/models/ranking/deepfm.py +5 -4
- nextrec/models/ranking/dien.py +6 -4
- nextrec/models/ranking/din.py +6 -4
- nextrec/models/ranking/fibinet.py +6 -4
- nextrec/models/ranking/fm.py +6 -4
- nextrec/models/ranking/masknet.py +6 -4
- nextrec/models/ranking/pnn.py +6 -4
- nextrec/models/ranking/widedeep.py +6 -4
- nextrec/models/ranking/xdeepfm.py +6 -4
- nextrec/utils/__init__.py +7 -11
- nextrec/utils/embedding.py +2 -4
- nextrec/utils/initializer.py +4 -5
- nextrec/utils/optimizer.py +7 -8
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/METADATA +3 -3
- nextrec-0.2.2.dist-info/RECORD +53 -0
- nextrec/basic/dataloader.py +0 -447
- nextrec/loss/match_losses.py +0 -294
- nextrec/utils/common.py +0 -14
- nextrec-0.1.11.dist-info/RECORD +0 -51
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/WHEEL +0 -0
- {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/licenses/LICENSE +0 -0
nextrec/basic/session.py
ADDED
@@ -0,0 +1,150 @@
+"""Session and experiment utilities.
+
+This module centralizes session/experiment management so the rest of the
+framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
+
+Within that folder we keep model parameters, checkpoints, training metrics,
+evaluation metrics, and consolidated log output. When users do not provide an
+``experiment_id`` a timestamp-based identifier is generated once per process to
+avoid scattering files across multiple directories. Test runs are redirected to
+temporary folders so local trees are not polluted.
+
+Date: create on 23/11/2025
+Author: Yang Zhou, zyaztec@gmail.com
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+
+__all__ = [
+    "Session",
+    "resolve_save_path",
+    "create_session",
+]
+
+@dataclass(frozen=True)
+class Session:
+    """Encapsulate standard folders for a NextRec experiment."""
+
+    experiment_id: str
+    root: Path
+
+    @property
+    def logs_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def checkpoints_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def predictions_dir(self) -> Path:
+        return self._ensure_dir(self.root / "predictions")
+
+    @property
+    def processor_dir(self) -> Path:
+        return self._ensure_dir(self.root / "processor")
+
+    @property
+    def params_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    @property
+    def metrics_dir(self) -> Path:
+        return self._ensure_dir(self.root)
+
+    def save_text(self, name: str, content: str) -> Path:
+        """Convenience helper: write a text file under logs_dir."""
+        path = self.logs_dir / name
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(content, encoding="utf-8")
+        return path
+
+    @staticmethod
+    def _ensure_dir(path: Path) -> Path:
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+def create_session(experiment_id: str | Path | None = None) -> Session:
+    """Create a :class:`Session` instance with prepared directories."""
+
+    if experiment_id is not None and str(experiment_id).strip():
+        exp_id = str(experiment_id).strip()
+    else:
+        exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
+
+    if (
+        os.getenv("PYTEST_CURRENT_TEST")
+        or os.getenv("PYTEST_RUNNING")
+        or os.getenv("NEXTREC_TEST_MODE") == "1"
+    ):
+        session_path = Path(tempfile.gettempdir()) / "nextrec_logs" / exp_id
+    else:
+        # export NEXTREC_LOG_DIR=/data/nextrec/logs
+        base_dir = Path(os.getenv("NEXTREC_LOG_DIR", Path.cwd() / "nextrec_logs"))
+        session_path = base_dir / exp_id
+
+    session_path.mkdir(parents=True, exist_ok=True)
+    root = session_path.resolve()
+
+    return Session(experiment_id=exp_id, root=root)
+
+def resolve_save_path(
+    path: str | Path | None,
+    default_dir: str | Path,
+    default_name: str,
+    suffix: str,
+    add_timestamp: bool = False,
+) -> Path:
+    """
+    Normalize and create a save path.
+
+    - If ``path`` is ``None`` or has no suffix, place the file under
+      ``default_dir``.
+    - If ``path`` has no suffix, its stem is used as the file name; otherwise
+      ``default_name``.
+    - Relative paths with a suffix are also anchored under ``default_dir``.
+    - Enforces ``suffix`` (with leading dot) and optionally appends a
+      timestamp.
+    - Parent directories are created.
+    """
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None
+
+    normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
+
+    if path is not None and Path(path).suffix:
+        target = Path(path)
+        if not target.is_absolute():
+            target = Path(default_dir) / target
+        if target.suffix != normalized_suffix:
+            target = target.with_suffix(normalized_suffix)
+        if timestamp:
+            target = target.with_name(f"{target.stem}_{timestamp}{normalized_suffix}")
+        target.parent.mkdir(parents=True, exist_ok=True)
+        return target.resolve()
+
+    base_dir = Path(default_dir)
+    candidate = Path(path) if path is not None else None
+
+    if candidate is not None:
+        if candidate.exists() and candidate.is_dir():
+            base_dir = candidate
+            file_stem = default_name
+        else:
+            base_dir = candidate.parent if candidate.parent not in (Path("."), Path("")) else base_dir
+            file_stem = candidate.name or default_name
+    else:
+        file_stem = default_name
+
+    base_dir.mkdir(parents=True, exist_ok=True)
+    if timestamp:
+        file_stem = f"{file_stem}_{timestamp}"
+
+    return (base_dir / f"{file_stem}{normalized_suffix}").resolve()
+
+
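The session helpers above are self-contained, so the new layout can be exercised directly. A minimal usage sketch (not part of the diff; it assumes nextrec >= 0.2.2 is installed and that the module is importable as `nextrec.basic.session`, mirroring the file path shown above):

```python
from nextrec.basic.session import create_session, resolve_save_path

# Reuse a single experiment folder for all artifacts of this run.
session = create_session("demo_experiment")
print(session.root)             # <cwd>/nextrec_logs/demo_experiment (or under $NEXTREC_LOG_DIR)
print(session.predictions_dir)  # <root>/predictions, created on first access

# Write a small text artifact under the session folder.
notes = session.save_text("notes.txt", "baseline run\n")

# Normalize a checkpoint path: anchored under the session root, forced to the
# .pt suffix, with an optional timestamp appended to the file stem.
ckpt = resolve_save_path(
    path="best_model",
    default_dir=session.checkpoints_dir,
    default_name="model",
    suffix="pt",
    add_timestamp=True,
)
print(ckpt)  # e.g. <root>/best_model_20251123_101500.pt
```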
nextrec/data/__init__.py
CHANGED
@@ -4,16 +4,21 @@ Data utilities package for NextRec
 This package provides data processing and manipulation utilities.
 
 Date: create on 13/11/2025
-Author:
-Yang Zhou, zyaztec@gmail.com
+Author: Yang Zhou, zyaztec@gmail.com
 """
 
 from nextrec.data.data_utils import (
     collate_fn,
     get_column_data,
+    default_output_dir,
     split_dict_random,
     build_eval_candidates,
+    resolve_file_paths,
+    iter_file_chunks,
+    read_table,
+    load_dataframes,
 )
+from nextrec.basic.features import FeatureConfig
 
 # For backward compatibility, keep utils accessible
 from nextrec.data import data_utils
@@ -21,7 +26,13 @@ from nextrec.data import data_utils
 __all__ = [
     'collate_fn',
     'get_column_data',
+    'default_output_dir',
     'split_dict_random',
     'build_eval_candidates',
+    'resolve_file_paths',
+    'iter_file_chunks',
+    'read_table',
+    'load_dataframes',
+    'FeatureConfig',
     'data_utils',
 ]
nextrec/data/data_utils.py
CHANGED
@@ -1,30 +1,13 @@
-"""
-Data processing utilities for NextRec
-
-Date: create on 13/11/2025
-Author:
-Yang Zhou, zyaztec@gmail.com
-"""
+"""Data processing utilities for NextRec."""
 
 import torch
 import numpy as np
 import pandas as pd
+import pyarrow.parquet as pq
+from pathlib import Path
 
 def collate_fn(batch):
-    """
-    Custom collate function for batching tuples of tensors.
-    Each element in batch is a tuple of tensors from FileDataset.
-
-    Examples:
-        # Single sample in batch
-        (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
-        # Batched output
-        (tensor([[1.0, 2.0], [3.0, 4.0]]),   # dense_features batch
-         tensor([[10, 20], [30, 40]]),       # sparse_features batch
-         tensor([[100, 200], [300, 400]]),   # sequence_features batch
-         tensor([1.0, 0.0])                  # labels batch)
-
-    """
+    """Collate a list of tensor tuples from ``FileDataset`` into batched tensors."""
     if not batch:
         return tuple()
 
@@ -33,7 +16,20 @@ def collate_fn(batch):
 
     for i in range(num_tensors):
         tensor_list = [item[i] for item in batch]
-
+        first = tensor_list[0]
+
+        if isinstance(first, torch.Tensor):
+            stacked = torch.cat(tensor_list, dim=0)
+        elif isinstance(first, np.ndarray):
+            stacked = np.concatenate(tensor_list, axis=0)
+        elif isinstance(first, list):
+            combined = []
+            for entry in tensor_list:
+                combined.extend(entry)
+            stacked = combined
+        else:
+            stacked = tensor_list
+
         result.append(stacked)
 
     return tuple(result)
@@ -53,6 +49,62 @@ def get_column_data(data: dict | pd.DataFrame, name: str):
     raise KeyError(f"Unsupported data type for extracting column {name}")
 
 
+def resolve_file_paths(path: str) -> tuple[list[str], str]:
+    """Resolve file or directory path into a sorted list of files and file type."""
+    path_obj = Path(path)
+
+    if path_obj.is_file():
+        file_type = path_obj.suffix.lower().lstrip(".")
+        assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+        return [str(path_obj)], file_type
+
+    if path_obj.is_dir():
+        collected_files = [p for p in path_obj.iterdir() if p.is_file()]
+        csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
+        parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+
+        if csv_files and parquet_files:
+            raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+        file_paths = csv_files if csv_files else parquet_files
+        if not file_paths:
+            raise ValueError(f"No CSV or Parquet files found in directory: {path}")
+        file_paths.sort()
+        file_type = "csv" if csv_files else "parquet"
+        return file_paths, file_type
+
+    raise ValueError(f"Invalid path: {path}")
+
+
+def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
+    """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
+    if file_type == "csv":
+        yield from pd.read_csv(file_path, chunksize=chunk_size)
+        return
+    parquet_file = pq.ParquetFile(file_path)
+    for batch in parquet_file.iter_batches(batch_size=chunk_size):
+        yield batch.to_pandas()
+
+
+def read_table(file_path: str, file_type: str) -> pd.DataFrame:
+    """Read a single CSV/Parquet file."""
+    if file_type == "csv":
+        return pd.read_csv(file_path)
+    return pd.read_parquet(file_path)
+
+
+def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
+    """Load multiple files of the same type into DataFrames."""
+    return [read_table(fp, file_type) for fp in file_paths]
+
+
+def default_output_dir(path: str) -> Path:
+    """Generate a default output directory path based on the input path."""
+    path_obj = Path(path)
+    if path_obj.is_file():
+        return path_obj.parent / f"{path_obj.stem}_preprocessed"
+    return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
 def split_dict_random(data_dict: dict, test_size: float=0.2, random_state:int|None=None):
     """Randomly split a dictionary of data into training and testing sets."""
     lengths = [len(v) for v in data_dict.values()]