nextrec 0.1.11__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. nextrec/__version__.py +1 -1
  2. nextrec/basic/activation.py +1 -2
  3. nextrec/basic/callback.py +1 -2
  4. nextrec/basic/features.py +39 -8
  5. nextrec/basic/layers.py +3 -4
  6. nextrec/basic/loggers.py +15 -10
  7. nextrec/basic/metrics.py +1 -2
  8. nextrec/basic/model.py +160 -125
  9. nextrec/basic/session.py +150 -0
  10. nextrec/data/__init__.py +13 -2
  11. nextrec/data/data_utils.py +74 -22
  12. nextrec/data/dataloader.py +513 -0
  13. nextrec/data/preprocessor.py +494 -134
  14. nextrec/loss/__init__.py +31 -24
  15. nextrec/loss/listwise.py +164 -0
  16. nextrec/loss/loss_utils.py +133 -106
  17. nextrec/loss/pairwise.py +105 -0
  18. nextrec/loss/pointwise.py +198 -0
  19. nextrec/models/match/dssm.py +26 -17
  20. nextrec/models/match/dssm_v2.py +20 -2
  21. nextrec/models/match/mind.py +18 -3
  22. nextrec/models/match/sdm.py +17 -2
  23. nextrec/models/match/youtube_dnn.py +23 -10
  24. nextrec/models/multi_task/esmm.py +8 -8
  25. nextrec/models/multi_task/mmoe.py +8 -8
  26. nextrec/models/multi_task/ple.py +8 -8
  27. nextrec/models/multi_task/share_bottom.py +8 -8
  28. nextrec/models/ranking/__init__.py +8 -0
  29. nextrec/models/ranking/afm.py +5 -4
  30. nextrec/models/ranking/autoint.py +6 -4
  31. nextrec/models/ranking/dcn.py +6 -4
  32. nextrec/models/ranking/deepfm.py +5 -4
  33. nextrec/models/ranking/dien.py +6 -4
  34. nextrec/models/ranking/din.py +6 -4
  35. nextrec/models/ranking/fibinet.py +6 -4
  36. nextrec/models/ranking/fm.py +6 -4
  37. nextrec/models/ranking/masknet.py +6 -4
  38. nextrec/models/ranking/pnn.py +6 -4
  39. nextrec/models/ranking/widedeep.py +6 -4
  40. nextrec/models/ranking/xdeepfm.py +6 -4
  41. nextrec/utils/__init__.py +7 -11
  42. nextrec/utils/embedding.py +2 -4
  43. nextrec/utils/initializer.py +4 -5
  44. nextrec/utils/optimizer.py +7 -8
  45. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/METADATA +3 -3
  46. nextrec-0.2.2.dist-info/RECORD +53 -0
  47. nextrec/basic/dataloader.py +0 -447
  48. nextrec/loss/match_losses.py +0 -294
  49. nextrec/utils/common.py +0 -14
  50. nextrec-0.1.11.dist-info/RECORD +0 -51
  51. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/WHEEL +0 -0
  52. {nextrec-0.1.11.dist-info → nextrec-0.2.2.dist-info}/licenses/LICENSE +0 -0
nextrec/basic/session.py ADDED
@@ -0,0 +1,150 @@
+ """Session and experiment utilities.
+
+ This module centralizes session/experiment management so the rest of the
+ framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
+
+ Within that folder we keep model parameters, checkpoints, training metrics,
+ evaluation metrics, and consolidated log output. When users do not provide an
+ ``experiment_id`` a timestamp-based identifier is generated once per process to
+ avoid scattering files across multiple directories. Test runs are redirected to
+ temporary folders so local trees are not polluted.
+
+ Date: create on 23/11/2025
+ Author: Yang Zhou,zyaztec@gmail.com
+ """
+
+ from __future__ import annotations
+
+ import os
+ import tempfile
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+
+ __all__ = [
+     "Session",
+     "resolve_save_path",
+     "create_session",
+ ]
+
+ @dataclass(frozen=True)
+ class Session:
+     """Encapsulate standard folders for a NextRec experiment."""
+
+     experiment_id: str
+     root: Path
+
+     @property
+     def logs_dir(self) -> Path:
+         return self._ensure_dir(self.root)
+
+     @property
+     def checkpoints_dir(self) -> Path:
+         return self._ensure_dir(self.root)
+
+     @property
+     def predictions_dir(self) -> Path:
+         return self._ensure_dir(self.root / "predictions")
+
+     @property
+     def processor_dir(self) -> Path:
+         return self._ensure_dir(self.root / "processor")
+
+     @property
+     def params_dir(self) -> Path:
+         return self._ensure_dir(self.root)
+
+     @property
+     def metrics_dir(self) -> Path:
+         return self._ensure_dir(self.root)
+
+     def save_text(self, name: str, content: str) -> Path:
+         """Convenience helper: write a text file under logs_dir."""
+         path = self.logs_dir / name
+         path.parent.mkdir(parents=True, exist_ok=True)
+         path.write_text(content, encoding="utf-8")
+         return path
+
+     @staticmethod
+     def _ensure_dir(path: Path) -> Path:
+         path.mkdir(parents=True, exist_ok=True)
+         return path
+
+ def create_session(experiment_id: str | Path | None = None) -> Session:
+     """Create a :class:`Session` instance with prepared directories."""
+
+     if experiment_id is not None and str(experiment_id).strip():
+         exp_id = str(experiment_id).strip()
+     else:
+         exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")
+
+     if (
+         os.getenv("PYTEST_CURRENT_TEST")
+         or os.getenv("PYTEST_RUNNING")
+         or os.getenv("NEXTREC_TEST_MODE") == "1"
+     ):
+         session_path = Path(tempfile.gettempdir()) / "nextrec_logs" / exp_id
+     else:
+         # export NEXTREC_LOG_DIR=/data/nextrec/logs
+         base_dir = Path(os.getenv("NEXTREC_LOG_DIR", Path.cwd() / "nextrec_logs"))
+         session_path = base_dir / exp_id
+
+     session_path.mkdir(parents=True, exist_ok=True)
+     root = session_path.resolve()
+
+     return Session(experiment_id=exp_id, root=root)
+
+ def resolve_save_path(
+     path: str | Path | None,
+     default_dir: str | Path,
+     default_name: str,
+     suffix: str,
+     add_timestamp: bool = False,
+ ) -> Path:
+     """
+     Normalize and create a save path.
+
+     - If ``path`` is ``None`` or has no suffix, place the file under
+       ``default_dir``.
+     - If ``path`` has no suffix, its stem is used as the file name; otherwise
+       ``default_name``.
+     - Relative paths with a suffix are also anchored under ``default_dir``.
+     - Enforces ``suffix`` (with leading dot) and optionally appends a
+       timestamp.
+     - Parent directories are created.
+     """
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None
+
+     normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
+
+     if path is not None and Path(path).suffix:
+         target = Path(path)
+         if not target.is_absolute():
+             target = Path(default_dir) / target
+         if target.suffix != normalized_suffix:
+             target = target.with_suffix(normalized_suffix)
+         if timestamp:
+             target = target.with_name(f"{target.stem}_{timestamp}{normalized_suffix}")
+         target.parent.mkdir(parents=True, exist_ok=True)
+         return target.resolve()
+
+     base_dir = Path(default_dir)
+     candidate = Path(path) if path is not None else None
+
+     if candidate is not None:
+         if candidate.exists() and candidate.is_dir():
+             base_dir = candidate
+             file_stem = default_name
+         else:
+             base_dir = candidate.parent if candidate.parent not in (Path("."), Path("")) else base_dir
+             file_stem = candidate.name or default_name
+     else:
+         file_stem = default_name
+
+     base_dir.mkdir(parents=True, exist_ok=True)
+     if timestamp:
+         file_stem = f"{file_stem}_{timestamp}"
+
+     return (base_dir / f"{file_stem}{normalized_suffix}").resolve()
+
+
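To make the new module concrete, here is a minimal usage sketch (not part of the diff). It exercises create_session, the Session directory properties, and resolve_save_path as defined above; the experiment name, file names, and the NEXTREC_LOG_DIR value are illustrative placeholders.

from nextrec.basic.session import create_session, resolve_save_path

# Create (or reuse) the experiment folder; with no argument a
# "nextrec_session_<YYYYMMDD>" id is generated once per process.
session = create_session("demo_experiment")

print(session.root)             # <NEXTREC_LOG_DIR or ./nextrec_logs>/demo_experiment
print(session.predictions_dir)  # .../demo_experiment/predictions
print(session.processor_dir)    # .../demo_experiment/processor

# Drop a small text artifact into the experiment folder.
session.save_text("notes.txt", "first training run")

# Normalize a user-supplied save path: a bare stem is anchored under
# default_dir, the suffix is enforced, and a timestamp can be appended.
ckpt_path = resolve_save_path(
    path="best_model",
    default_dir=session.checkpoints_dir,
    default_name="model",
    suffix=".pt",
    add_timestamp=True,
)
print(ckpt_path)  # .../demo_experiment/best_model_<YYYYMMDD_HHMMSS>.pt

Each directory property calls _ensure_dir on access, so the subfolders are created lazily the first time they are used rather than at construction time.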
nextrec/data/__init__.py CHANGED
@@ -4,16 +4,21 @@ Data utilities package for NextRec
  This package provides data processing and manipulation utilities.
 
  Date: create on 13/11/2025
- Author:
-     Yang Zhou, zyaztec@gmail.com
+ Author: Yang Zhou, zyaztec@gmail.com
  """
 
  from nextrec.data.data_utils import (
      collate_fn,
      get_column_data,
+     default_output_dir,
      split_dict_random,
      build_eval_candidates,
+     resolve_file_paths,
+     iter_file_chunks,
+     read_table,
+     load_dataframes,
  )
+ from nextrec.basic.features import FeatureConfig
 
  # For backward compatibility, keep utils accessible
  from nextrec.data import data_utils
@@ -21,7 +26,13 @@ from nextrec.data import data_utils
  __all__ = [
      'collate_fn',
      'get_column_data',
+     'default_output_dir',
      'split_dict_random',
      'build_eval_candidates',
+     'resolve_file_paths',
+     'iter_file_chunks',
+     'read_table',
+     'load_dataframes',
+     'FeatureConfig',
      'data_utils',
  ]
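As a rough sketch of how these re-exports might be used together (not from the package docs; the directory path and chunk size are placeholders, and the helper implementations appear in the data_utils.py diff that follows):

import pandas as pd

from nextrec.data import (
    resolve_file_paths,
    iter_file_chunks,
    load_dataframes,
    default_output_dir,
)

data_dir = "./datasets/movielens"  # hypothetical directory holding CSV or Parquet files

# Discover the files and their shared format ("csv" or "parquet").
file_paths, file_type = resolve_file_paths(data_dir)

# Stream the first file chunk by chunk instead of reading it whole.
for chunk in iter_file_chunks(file_paths[0], file_type, chunk_size=50_000):
    assert isinstance(chunk, pd.DataFrame)
    break  # only peek at the first chunk here

# Or load every file eagerly when the data fits in memory.
frames = load_dataframes(file_paths, file_type)

# Default output location for preprocessed data: "<name>_preprocessed".
print(default_output_dir(data_dir))  # datasets/movielens_preprocessed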
nextrec/data/data_utils.py CHANGED
@@ -1,30 +1,13 @@
- """
- Data processing utilities for NextRec
-
- Date: create on 13/11/2025
- Author:
-     Yang Zhou, zyaztec@gmail.com
- """
+ """Data processing utilities for NextRec."""
 
  import torch
  import numpy as np
  import pandas as pd
+ import pyarrow.parquet as pq
+ from pathlib import Path
 
  def collate_fn(batch):
-     """
-     Custom collate function for batching tuples of tensors.
-     Each element in batch is a tuple of tensors from FileDataset.
-
-     Examples:
-         # Single sample in batch
-         (tensor([1.0, 2.0]), tensor([10, 20]), tensor([100, 200]), tensor(1.0))
-         # Batched output
-         (tensor([[1.0, 2.0], [3.0, 4.0]]), # dense_features batch
-         tensor([[10, 20], [30, 40]]), # sparse_features batch
-         tensor([[100, 200], [300, 400]]), # sequence_features batch
-         tensor([1.0, 0.0]) # labels batch)
-     """
+     """Collate a list of tensor tuples from ``FileDataset`` into batched tensors."""
      if not batch:
          return tuple()
 
@@ -33,7 +16,20 @@ def collate_fn(batch):
 
      for i in range(num_tensors):
          tensor_list = [item[i] for item in batch]
-         stacked = torch.cat(tensor_list, dim=0)
+         first = tensor_list[0]
+
+         if isinstance(first, torch.Tensor):
+             stacked = torch.cat(tensor_list, dim=0)
+         elif isinstance(first, np.ndarray):
+             stacked = np.concatenate(tensor_list, axis=0)
+         elif isinstance(first, list):
+             combined = []
+             for entry in tensor_list:
+                 combined.extend(entry)
+             stacked = combined
+         else:
+             stacked = tensor_list
+
          result.append(stacked)
 
      return tuple(result)
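For illustration only (the batch below is made up; in the library the tuples come from FileDataset), the broadened collate_fn now handles tensors, NumPy arrays, and plain lists within the same item tuple:

import numpy as np
import torch

from nextrec.data import collate_fn

batch = [
    (torch.tensor([[1.0, 2.0]]), np.array([[10, 20]]), ["item_a"]),
    (torch.tensor([[3.0, 4.0]]), np.array([[30, 40]]), ["item_b"]),
]

dense, sparse, ids = collate_fn(batch)
print(dense.shape)   # torch.Size([2, 2])   -- tensors concatenated along dim 0
print(sparse.shape)  # (2, 2)               -- numpy arrays concatenated along axis 0
print(ids)           # ['item_a', 'item_b'] -- plain lists merged with extend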
@@ -53,6 +49,62 @@ def get_column_data(data: dict | pd.DataFrame, name: str):
      raise KeyError(f"Unsupported data type for extracting column {name}")
 
 
+ def resolve_file_paths(path: str) -> tuple[list[str], str]:
+     """Resolve file or directory path into a sorted list of files and file type."""
+     path_obj = Path(path)
+
+     if path_obj.is_file():
+         file_type = path_obj.suffix.lower().lstrip(".")
+         assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+         return [str(path_obj)], file_type
+
+     if path_obj.is_dir():
+         collected_files = [p for p in path_obj.iterdir() if p.is_file()]
+         csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
+         parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+
+         if csv_files and parquet_files:
+             raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+         file_paths = csv_files if csv_files else parquet_files
+         if not file_paths:
+             raise ValueError(f"No CSV or Parquet files found in directory: {path}")
+         file_paths.sort()
+         file_type = "csv" if csv_files else "parquet"
+         return file_paths, file_type
+
+     raise ValueError(f"Invalid path: {path}")
+
+
+ def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
+     """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
+     if file_type == "csv":
+         yield from pd.read_csv(file_path, chunksize=chunk_size)
+         return
+     parquet_file = pq.ParquetFile(file_path)
+     for batch in parquet_file.iter_batches(batch_size=chunk_size):
+         yield batch.to_pandas()
+
+
+ def read_table(file_path: str, file_type: str) -> pd.DataFrame:
+     """Read a single CSV/Parquet file."""
+     if file_type == "csv":
+         return pd.read_csv(file_path)
+     return pd.read_parquet(file_path)
+
+
+ def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
+     """Load multiple files of the same type into DataFrames."""
+     return [read_table(fp, file_type) for fp in file_paths]
+
+
+ def default_output_dir(path: str) -> Path:
+     """Generate a default output directory path based on the input path."""
+     path_obj = Path(path)
+     if path_obj.is_file():
+         return path_obj.parent / f"{path_obj.stem}_preprocessed"
+     return path_obj.with_name(f"{path_obj.name}_preprocessed")
+
+
  def split_dict_random(data_dict: dict, test_size: float=0.2, random_state:int|None=None):
      """Randomly split a dictionary of data into training and testing sets."""
      lengths = [len(v) for v in data_dict.values()]