nextrec 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nextrec/__version__.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.4"
+ __version__ = "0.3.5"
nextrec/basic/features.py CHANGED
@@ -7,7 +7,7 @@ Author: Yang Zhou, zyaztec@gmail.com
  """
  import torch
  from nextrec.utils.embedding import get_auto_embedding_dim
- from nextrec.utils.common import normalize_to_list
+ from nextrec.utils.feature import normalize_to_list

  class BaseFeature(object):
      def __repr__(self):
nextrec/basic/model.py CHANGED
@@ -31,10 +31,12 @@ from nextrec.basic.session import resolve_save_path, create_session
  from nextrec.basic.metrics import configure_metrics, evaluate_metrics, check_user_id

  from nextrec.data.dataloader import build_tensors_from_data
- from nextrec.data.data_utils import get_column_data, collate_fn, batch_to_dict, get_user_ids
+ from nextrec.data.data_processing import get_column_data, get_user_ids
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict

  from nextrec.loss import get_loss_fn, get_loss_kwargs
- from nextrec.utils import get_optimizer, get_scheduler, to_tensor
+ from nextrec.utils import get_optimizer, get_scheduler
+ from nextrec.utils.tensor import to_tensor

  from nextrec import __version__

nextrec/basic/session.py CHANGED
@@ -1,14 +1,5 @@
  """Session and experiment utilities.

- This module centralizes session/experiment management so the rest of the
- framework writes all artifacts to a consistent location:: <pwd>/log/<experiment_id>/
-
- Within that folder we keep model parameters, checkpoints, training metrics,
- evaluation metrics, and consolidated log output. When users do not provide an
- ``experiment_id`` a timestamp-based identifier is generated once per process to
- avoid scattering files across multiple directories. Test runs are redirected to
- temporary folders so local trees are not polluted.
-
  Date: create on 23/11/2025
  Author: Yang Zhou,zyaztec@gmail.com
  """
@@ -16,7 +7,7 @@ Author: Yang Zhou,zyaztec@gmail.com
  import os
  import tempfile
  from dataclasses import dataclass
- from datetime import datetime
+ from datetime import datetime, timezone
  from pathlib import Path

  __all__ = [
@@ -74,6 +65,7 @@ def create_session(experiment_id: str | Path | None = None) -> Session:
      if experiment_id is not None and str(experiment_id).strip():
          exp_id = str(experiment_id).strip()
      else:
+         # Use local time for session naming
          exp_id = "nextrec_session_" + datetime.now().strftime("%Y%m%d")

      if (
@@ -111,6 +103,7 @@ def resolve_save_path(
        timestamp.
      - Parent directories are created.
      """
+     # Use local time for file timestamps
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if add_timestamp else None

      normalized_suffix = suffix if suffix.startswith(".") else f".{suffix}"
nextrec/data/__init__.py CHANGED
@@ -1,48 +1,86 @@
  """
  Data utilities package for NextRec

- This package provides data processing and manipulation utilities.
+ This package provides data processing and manipulation utilities organized by category:
+ - batch_utils: Batch collation and processing
+ - data_processing: Data manipulation and user ID extraction
+ - data_utils: Legacy module (re-exports from specialized modules)
+ - dataloader: Dataset and DataLoader implementations
+ - preprocessor: Data preprocessing pipeline

  Date: create on 13/11/2025
+ Last update: 03/12/2025 (refactored)
  Author: Yang Zhou, zyaztec@gmail.com
  """

- from nextrec.data.data_utils import (
-     collate_fn,
+ # Batch utilities
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
+
+ # Data processing utilities
+ from nextrec.data.data_processing import (
      get_column_data,
-     default_output_dir,
      split_dict_random,
      build_eval_candidates,
+     get_user_ids,
+ )
+
+ # File utilities (from utils package)
+ from nextrec.utils.file import (
      resolve_file_paths,
      iter_file_chunks,
      read_table,
      load_dataframes,
+     default_output_dir,
  )
- from nextrec.basic.features import FeatureSet
- from nextrec.data import data_utils
+
+ # DataLoader components
  from nextrec.data.dataloader import (
      TensorDictDataset,
      FileDataset,
      RecDataLoader,
      build_tensors_from_data,
  )
+
+ # Preprocessor
  from nextrec.data.preprocessor import DataProcessor

+ # Feature definitions
+ from nextrec.basic.features import FeatureSet
+
+ # Legacy module (for backward compatibility)
+ from nextrec.data import data_utils
+
  __all__ = [
+     # Batch utilities
      'collate_fn',
+     'batch_to_dict',
+     'stack_section',
+
+     # Data processing
      'get_column_data',
-     'default_output_dir',
      'split_dict_random',
      'build_eval_candidates',
+     'get_user_ids',
+
+     # File utilities
      'resolve_file_paths',
      'iter_file_chunks',
      'read_table',
      'load_dataframes',
-     'FeatureSet',
-     'data_utils',
+     'default_output_dir',
+
+     # DataLoader
      'TensorDictDataset',
      'FileDataset',
      'RecDataLoader',
      'build_tensors_from_data',
+
+     # Preprocessor
      'DataProcessor',
+
+     # Features
      'FeatureSet',
+
+     # Legacy module
      'data_utils',
  ]
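For orientation, a minimal sketch of how the reorganized exports above can be imported (paths taken from this diff; the package root and the data_utils shim keep working for existing code):

    # category-specific modules introduced in 0.3.5
    from nextrec.data.batch_utils import collate_fn, batch_to_dict
    from nextrec.data.data_processing import split_dict_random, get_user_ids
    from nextrec.utils.file import resolve_file_paths, read_table

    # still re-exported from the package root and the legacy shim
    from nextrec.data import collate_fn, split_dict_random
    from nextrec.data.data_utils import get_column_data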
nextrec/data/batch_utils.py ADDED
@@ -0,0 +1,80 @@
+ """
+ Batch collation utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ import torch
+ import numpy as np
+ from typing import Any, Mapping
+
+ def stack_section(batch: list[dict], section: str):
+     entries = [item.get(section) for item in batch if item.get(section) is not None]
+     if not entries:
+         return None
+     merged: dict = {}
+     for name in entries[0]:  # type: ignore
+         tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
+         merged[name] = torch.stack(tensors, dim=0)
+     return merged
+
+ def collate_fn(batch):
+     """
+     Collate a list of sample dicts into the unified batch format:
+     {
+         "features": {name: Tensor(B, ...)},
+         "labels": {target: Tensor(B, ...)} or None,
+         "ids": {id_name: Tensor(B, ...)} or None,
+     }
+     Args: batch: List of samples from DataLoader
+
+     Returns: dict: Batched data in unified format
+     """
+     if not batch:
+         return {"features": {}, "labels": None, "ids": None}
+
+     first = batch[0]
+     if isinstance(first, dict) and "features" in first:
+         # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
+         if first.get("_already_batched") and len(batch) == 1:
+             return {
+                 "features": first.get("features", {}),
+                 "labels": first.get("labels"),
+                 "ids": first.get("ids"),
+             }
+         return {
+             "features": stack_section(batch, "features") or {},
+             "labels": stack_section(batch, "labels"),
+             "ids": stack_section(batch, "ids"),
+         }
+
+     # Fallback: stack tuples/lists of tensors
+     num_tensors = len(first)
+     result = []
+     for i in range(num_tensors):
+         tensor_list = [item[i] for item in batch]
+         first_item = tensor_list[0]
+         if isinstance(first_item, torch.Tensor):
+             stacked = torch.cat(tensor_list, dim=0)
+         elif isinstance(first_item, np.ndarray):
+             stacked = np.concatenate(tensor_list, axis=0)
+         elif isinstance(first_item, list):
+             combined = []
+             for entry in tensor_list:
+                 combined.extend(entry)
+             stacked = combined
+         else:
+             stacked = tensor_list
+         result.append(stacked)
+     return tuple(result)
+
+
+ def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
+     if not (isinstance(batch_data, Mapping) and "features" in batch_data):
+         raise TypeError("[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader.")
+     return {
+         "features": batch_data.get("features", {}),
+         "labels": batch_data.get("labels"),
+         "ids": batch_data.get("ids") if include_ids else None,
+     }
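A small usage sketch of collate_fn with the unified batch format it documents (toy tensors; the feature, label, and id names are illustrative):

    import torch
    from nextrec.data.batch_utils import collate_fn

    samples = [
        {"features": {"age": torch.tensor(25.0)}, "labels": {"click": torch.tensor(1.0)}, "ids": {"user_id": torch.tensor(7)}},
        {"features": {"age": torch.tensor(31.0)}, "labels": {"click": torch.tensor(0.0)}, "ids": {"user_id": torch.tensor(8)}},
    ]
    batch = collate_fn(samples)
    # batch["features"]["age"], batch["labels"]["click"], batch["ids"]["user_id"] are each stacked to shape (2,)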
nextrec/data/data_processing.py ADDED
@@ -0,0 +1,152 @@
+ """
+ Data processing utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ import torch
+ import numpy as np
+ import pandas as pd
+ from typing import Any, Mapping
+
+
+ def get_column_data(data: dict | pd.DataFrame, name: str):
+     if isinstance(data, dict):
+         return data[name] if name in data else None
+     elif isinstance(data, pd.DataFrame):
+         if name not in data.columns:
+             return None
+         return data[name].values
+     else:
+         if hasattr(data, name):
+             return getattr(data, name)
+         raise KeyError(f"Unsupported data type for extracting column {name}")
+
+ def split_dict_random(
+     data_dict: dict,
+     test_size: float = 0.2,
+     random_state: int | None = None
+ ):
+     lengths = [len(v) for v in data_dict.values()]
+     if len(set(lengths)) != 1:
+         raise ValueError(f"Length mismatch: {lengths}")
+
+     n = lengths[0]
+     rng = np.random.default_rng(random_state)
+     perm = rng.permutation(n)
+     cut = int(round(n * (1 - test_size)))
+     train_idx, test_idx = perm[:cut], perm[cut:]
+
+     def take(v, idx):
+         if isinstance(v, np.ndarray):
+             return v[idx]
+         elif isinstance(v, pd.Series):
+             return v.iloc[idx].to_numpy()
+         else:
+             v_arr = np.asarray(v, dtype=object)
+             return v_arr[idx]
+
+     train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
+     test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
+     return train_dict, test_dict
+
+
+ def build_eval_candidates(
+     df_all: pd.DataFrame,
+     user_col: str,
+     item_col: str,
+     label_col: str,
+     user_features: pd.DataFrame,
+     item_features: pd.DataFrame,
+     num_pos_per_user: int = 5,
+     num_neg_per_pos: int = 50,
+     random_seed: int = 2025,
+ ) -> pd.DataFrame:
+     """
+     Build evaluation candidates with positive and negative samples for each user.
+
+     Args:
+         df_all: Full interaction DataFrame
+         user_col: Name of the user ID column
+         item_col: Name of the item ID column
+         label_col: Name of the label column
+         user_features: DataFrame containing user features
+         item_features: DataFrame containing item features
+         num_pos_per_user: Number of positive samples per user (default: 5)
+         num_neg_per_pos: Number of negative samples per positive (default: 50)
+         random_seed: Random seed for reproducibility (default: 2025)
+
+     Returns:
+         pd.DataFrame: Evaluation candidates with features
+     """
+     rng = np.random.default_rng(random_seed)
+
+     users = df_all[user_col].unique()
+     all_items = item_features[item_col].unique()
+     rows = []
+     user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}
+
+     for u in users:
+         df_user = df_all[df_all[user_col] == u]
+         pos_items = df_user[df_user[label_col] == 1][item_col].unique()
+         if len(pos_items) == 0:
+             continue
+         pos_items = pos_items[:num_pos_per_user]
+         seen_items = set(user_hist_items[u])
+         neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
+         if len(neg_pool) == 0:
+             continue
+         for pos in pos_items:
+             if len(neg_pool) <= num_neg_per_pos:
+                 neg_items = neg_pool
+             else:
+                 neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
+             rows.append((u, pos, 1))
+             for ni in neg_items:
+                 rows.append((u, ni, 0))
+
+     eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
+     eval_df = eval_df.merge(user_features, on=user_col, how='left')
+     eval_df = eval_df.merge(item_features, on=item_col, how='left')
+     return eval_df
+
+
+ def get_user_ids(
+     data: Any,
+     id_columns: list[str] | str | None = None
+ ) -> np.ndarray | None:
+     """
+     Extract user IDs from various data structures.
+
+     Args:
+         data: Data source (DataFrame, dict, or batch dict)
+         id_columns: List or single ID column name(s) (default: None)
+
+     Returns:
+         np.ndarray | None: User IDs as numpy array, or None if not found
+     """
+     id_columns = (
+         id_columns if isinstance(id_columns, list)
+         else [id_columns] if isinstance(id_columns, str)
+         else []
+     )
+     if not id_columns:
+         return None
+
+     main_id = id_columns[0]
+     if isinstance(data, pd.DataFrame) and main_id in data.columns:
+         arr = np.asarray(data[main_id].values)
+         return arr.reshape(arr.shape[0])
+
+     if isinstance(data, dict):
+         ids_container = data.get("ids")
+         if isinstance(ids_container, dict) and main_id in ids_container:
+             val = ids_container[main_id]
+             val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
+             return val.reshape(val.shape[0])
+         if main_id in data:
+             arr = np.asarray(data[main_id])
+             return arr.reshape(arr.shape[0])
+
+     return None
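A quick sketch of split_dict_random and get_user_ids as defined above (toy arrays; the column names are illustrative):

    import numpy as np
    from nextrec.data.data_processing import split_dict_random, get_user_ids

    data = {"user_id": np.arange(10), "label": np.ones(10)}
    train, test = split_dict_random(data, test_size=0.2, random_state=42)  # 8/2 split, same permutation per column
    ids = get_user_ids(data, id_columns="user_id")                         # -> ndarray of shape (10,)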
nextrec/data/data_utils.py CHANGED
@@ -1,268 +1,35 @@
- """Data processing utilities for NextRec."""
-
- import torch
- import numpy as np
- import pandas as pd
- import pyarrow.parquet as pq
- from pathlib import Path
- from typing import Any, Mapping, Sequence
-
- def stack_section(batch: list[dict], section: str):
-     """Stack one section of the batch (features/labels/ids)."""
-     entries = [item.get(section) for item in batch if item.get(section) is not None]
-     if not entries:
-         return None
-     merged: dict = {}
-     for name in entries[0]:  # type: ignore
-         tensors = [item[section][name] for item in batch if item.get(section) is not None and name in item[section]]
-         merged[name] = torch.stack(tensors, dim=0)
-     return merged
-
- def collate_fn(batch):
-     """
-     Collate a list of sample dicts into the unified batch format:
-     {
-         "features": {name: Tensor(B, ...)},
-         "labels": {target: Tensor(B, ...)} or None,
-         "ids": {id_name: Tensor(B, ...)} or None,
-     }
-     """
-     if not batch:
-         return {"features": {}, "labels": None, "ids": None}
-
-     first = batch[0]
-     if isinstance(first, dict) and "features" in first:
-         # Streaming dataset yields already-batched chunks; avoid adding an extra dim.
-         if first.get("_already_batched") and len(batch) == 1:
-             return {
-                 "features": first.get("features", {}),
-                 "labels": first.get("labels"),
-                 "ids": first.get("ids"),
-             }
-         return {
-             "features": stack_section(batch, "features") or {},
-             "labels": stack_section(batch, "labels"),
-             "ids": stack_section(batch, "ids"),
-         }
-
-     # Fallback: stack tuples/lists of tensors
-     num_tensors = len(first)
-     result = []
-     for i in range(num_tensors):
-         tensor_list = [item[i] for item in batch]
-         first_item = tensor_list[0]
-         if isinstance(first_item, torch.Tensor):
-             stacked = torch.cat(tensor_list, dim=0)
-         elif isinstance(first_item, np.ndarray):
-             stacked = np.concatenate(tensor_list, axis=0)
-         elif isinstance(first_item, list):
-             combined = []
-             for entry in tensor_list:
-                 combined.extend(entry)
-             stacked = combined
-         else:
-             stacked = tensor_list
-         result.append(stacked)
-     return tuple(result)
-
- def get_column_data(data: dict | pd.DataFrame, name: str):
-     """Extract column data from various data structures."""
-     if isinstance(data, dict):
-         return data[name] if name in data else None
-     elif isinstance(data, pd.DataFrame):
-         if name not in data.columns:
-             return None
-         return data[name].values
-     else:
-         if hasattr(data, name):
-             return getattr(data, name)
-         raise KeyError(f"Unsupported data type for extracting column {name}")
-
- def resolve_file_paths(path: str) -> tuple[list[str], str]:
-     """Resolve file or directory path into a sorted list of files and file type."""
-     path_obj = Path(path)
-
-     if path_obj.is_file():
-         file_type = path_obj.suffix.lower().lstrip(".")
-         assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
-         return [str(path_obj)], file_type
-
-     if path_obj.is_dir():
-         collected_files = [p for p in path_obj.iterdir() if p.is_file()]
-         csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
-         parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
-
-         if csv_files and parquet_files:
-             raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
-         file_paths = csv_files if csv_files else parquet_files
-         if not file_paths:
-             raise ValueError(f"No CSV or Parquet files found in directory: {path}")
-         file_paths.sort()
-         file_type = "csv" if csv_files else "parquet"
-         return file_paths, file_type
-
-     raise ValueError(f"Invalid path: {path}")
-
- def iter_file_chunks(file_path: str, file_type: str, chunk_size: int):
-     """Yield DataFrame chunks for CSV/Parquet without loading the whole file."""
-     if file_type == "csv":
-         yield from pd.read_csv(file_path, chunksize=chunk_size)
-         return
-     parquet_file = pq.ParquetFile(file_path)
-     for batch in parquet_file.iter_batches(batch_size=chunk_size):
-         yield batch.to_pandas()
-
- def read_table(file_path: str, file_type: str) -> pd.DataFrame:
-     """Read a single CSV/Parquet file."""
-     if file_type == "csv":
-         return pd.read_csv(file_path)
-     return pd.read_parquet(file_path)
-
- def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
-     """Load multiple files of the same type into DataFrames."""
-     return [read_table(fp, file_type) for fp in file_paths]
-
- def default_output_dir(path: str) -> Path:
-     """Generate a default output directory path based on the input path."""
-     path_obj = Path(path)
-     if path_obj.is_file():
-         return path_obj.parent / f"{path_obj.stem}_preprocessed"
-     return path_obj.with_name(f"{path_obj.name}_preprocessed")
-
- def split_dict_random(data_dict: dict, test_size: float = 0.2, random_state: int | None = None):
-     """Randomly split a dictionary of data into training and testing sets."""
-     lengths = [len(v) for v in data_dict.values()]
-     if len(set(lengths)) != 1:
-         raise ValueError(f"Length mismatch: {lengths}")
-     n = lengths[0]
-     rng = np.random.default_rng(random_state)
-     perm = rng.permutation(n)
-     cut = int(round(n * (1 - test_size)))
-     train_idx, test_idx = perm[:cut], perm[cut:]
-     def take(v, idx):
-         if isinstance(v, np.ndarray):
-             return v[idx]
-         elif isinstance(v, pd.Series):
-             return v.iloc[idx].to_numpy()
-         else:
-             v_arr = np.asarray(v, dtype=object)
-             return v_arr[idx]
-     train_dict = {k: take(v, train_idx) for k, v in data_dict.items()}
-     test_dict = {k: take(v, test_idx) for k, v in data_dict.items()}
-     return train_dict, test_dict
-
- def build_eval_candidates(
-     df_all: pd.DataFrame,
-     user_col: str,
-     item_col: str,
-     label_col: str,
-     user_features: pd.DataFrame,
-     item_features: pd.DataFrame,
-     num_pos_per_user: int = 5,
-     num_neg_per_pos: int = 50,
-     random_seed: int = 2025,
- ) -> pd.DataFrame:
-     """Build evaluation candidates with positive and negative samples for each user. """
-     rng = np.random.default_rng(random_seed)
-
-     users = df_all[user_col].unique()
-     all_items = item_features[item_col].unique()
-     rows = []
-     user_hist_items = {u: df_all[df_all[user_col] == u][item_col].unique() for u in users}
-     for u in users:
-         df_user = df_all[df_all[user_col] == u]
-         pos_items = df_user[df_user[label_col] == 1][item_col].unique()
-         if len(pos_items) == 0:
-             continue
-         pos_items = pos_items[:num_pos_per_user]
-         seen_items = set(user_hist_items[u])
-         neg_pool = np.setdiff1d(all_items, np.fromiter(seen_items, dtype=all_items.dtype))
-         if len(neg_pool) == 0:
-             continue
-         for pos in pos_items:
-             if len(neg_pool) <= num_neg_per_pos:
-                 neg_items = neg_pool
-             else:
-                 neg_items = rng.choice(neg_pool, size=num_neg_per_pos, replace=False)
-             rows.append((u, pos, 1))
-             for ni in neg_items:
-                 rows.append((u, ni, 0))
-     eval_df = pd.DataFrame(rows, columns=[user_col, item_col, label_col])
-     eval_df = eval_df.merge(user_features, on=user_col, how='left')
-     eval_df = eval_df.merge(item_features, on=item_col, how='left')
-     return eval_df
-
- def batch_to_dict(batch_data: Any, include_ids: bool = True) -> dict:
-     """Standardize a dataloader batch into a dict of features, labels, and ids."""
-     if not (isinstance(batch_data, Mapping) and "features" in batch_data):
-         raise TypeError(
-             "[BaseModel-batch_to_dict Error] Batch data must be a dict with 'features' produced by the current DataLoader."
-         )
-     return {
-         "features": batch_data.get("features", {}),
-         "labels": batch_data.get("labels"),
-         "ids": batch_data.get("ids") if include_ids else None,
-     }
-
-
- # def get_user_ids(
- #     data: dict | pd.DataFrame | None, user_id_column: str = "user_id"
- # ) -> np.ndarray | None:
- #     """Extract user IDs from a dataset dict or DataFrame."""
- #     if data is None:
- #         return None
- #     if isinstance(data, pd.DataFrame) and user_id_column in data.columns:
- #         return np.asarray(data[user_id_column].values)
- #     if isinstance(data, dict) and user_id_column in data:
- #         return np.asarray(data[user_id_column])
- #     return None
-
-
- # def get_user_ids_from_batch(
- #     batch_dict: Mapping[str, Any], id_columns: Sequence[str] | None = None
- # ) -> np.ndarray | None:
- #     """Extract the prioritized user id column from a batch dict."""
- #     ids_container = batch_dict.get("ids") if isinstance(batch_dict, Mapping) else None
- #     if not ids_container:
- #         return None
-
- #     batch_user_id = None
- #     if id_columns:
- #         for id_name in id_columns:
- #             if id_name in ids_container:
- #                 batch_user_id = ids_container[id_name]
- #                 break
- #     if batch_user_id is None:
- #         batch_user_id = next(iter(ids_container.values()), None)
- #     if batch_user_id is None:
- #         return None
-
- #     if isinstance(batch_user_id, torch.Tensor):
- #         ids_np = batch_user_id.detach().cpu().numpy()
- #     else:
- #         ids_np = np.asarray(batch_user_id)
- #     if ids_np.ndim == 0:
- #         ids_np = ids_np.reshape(1)
- #     return ids_np.reshape(ids_np.shape[0])
-
-
- def get_user_ids(data, id_columns: list[str] | str | None = None) -> np.ndarray | None:
-     id_columns = id_columns if isinstance(id_columns, list) else [id_columns] if isinstance(id_columns, str) else []
-     if not id_columns:
-         return None
-
-     main_id = id_columns[0]
-     if isinstance(data, pd.DataFrame) and main_id in data.columns:
-         arr = np.asarray(data[main_id].values)
-         return arr.reshape(arr.shape[0])
-     if isinstance(data, dict):
-         ids_container = data.get("ids")
-         if isinstance(ids_container, dict) and main_id in ids_container:
-             val = ids_container[main_id]
-             val = val.detach().cpu().numpy() if isinstance(val, torch.Tensor) else np.asarray(val)
-             return val.reshape(val.shape[0])
-         if main_id in data:
-             arr = np.asarray(data[main_id])
-             return arr.reshape(arr.shape[0])
-
-     return None
+ """
+ Data processing utilities for NextRec (Refactored)
+
+ This module now re-exports functions from specialized submodules:
+ - batch_utils: collate_fn, batch_to_dict
+ - data_processing: get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+ - nextrec.utils.file_utils: resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+ Date: create on 27/10/2025
+ Last update: 03/12/2025 (refactored)
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ # Import from new organized modules
+ from nextrec.data.batch_utils import collate_fn, batch_to_dict, stack_section
+ from nextrec.data.data_processing import get_column_data, split_dict_random, build_eval_candidates, get_user_ids
+ from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
+ __all__ = [
+     # Batch utilities
+     'collate_fn',
+     'batch_to_dict',
+     'stack_section',
+     # Data processing
+     'get_column_data',
+     'split_dict_random',
+     'build_eval_candidates',
+     'get_user_ids',
+     # File utilities
+     'resolve_file_paths',
+     'iter_file_chunks',
+     'read_table',
+     'load_dataframes',
+     'default_output_dir',
+ ]
nextrec/data/dataloader.py CHANGED
@@ -20,8 +20,10 @@ from nextrec.data.preprocessor import DataProcessor
  from nextrec.basic.features import DenseFeature, SparseFeature, SequenceFeature, FeatureSet

  from nextrec.basic.loggers import colorize
- from nextrec.data import get_column_data, collate_fn, resolve_file_paths, read_table
- from nextrec.utils import to_tensor
+ from nextrec.data.data_processing import get_column_data
+ from nextrec.data.batch_utils import collate_fn
+ from nextrec.utils.file import resolve_file_paths, read_table
+ from nextrec.utils.tensor import to_tensor

  class TensorDictDataset(Dataset):
      """Dataset returning sample-level dicts matching the unified batch schema."""
nextrec/data/preprocessor.py CHANGED
@@ -16,24 +16,14 @@ import pandas as pd
  import tqdm
  from pathlib import Path
  from typing import Dict, Union, Optional, Literal, Any
- from sklearn.preprocessing import (
-     StandardScaler,
-     MinMaxScaler,
-     RobustScaler,
-     MaxAbsScaler,
-     LabelEncoder
- )
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, LabelEncoder
+

- from nextrec.basic.loggers import setup_logger, colorize
- from nextrec.data.data_utils import (
-     resolve_file_paths,
-     iter_file_chunks,
-     read_table,
-     load_dataframes,
-     default_output_dir,
- )
- from nextrec.basic.session import resolve_save_path
  from nextrec.basic.features import FeatureSet
+ from nextrec.basic.loggers import colorize
+ from nextrec.basic.session import resolve_save_path
+ from nextrec.utils.file import resolve_file_paths, iter_file_chunks, read_table, load_dataframes, default_output_dir
+
  from nextrec.__version__ import __version__


nextrec/models/multi_task/poso.py CHANGED
@@ -46,7 +46,7 @@ from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
  from nextrec.basic.layers import EmbeddingLayer, MLP, PredictionLayer
  from nextrec.basic.activation import activation_layer
  from nextrec.basic.model import BaseModel
- from nextrec.utils.common import merge_features
+ from nextrec.utils.model import merge_features


  class POSOGate(nn.Module):
nextrec/utils/__init__.py CHANGED
@@ -1,18 +1,68 @@
+ """
+ Utilities package for NextRec
+
+ This package provides various utility functions organized by category:
+ - optimizer: Optimizer and scheduler utilities
+ - initializer: Weight initialization utilities
+ - embedding: Embedding dimension calculation
+ - device_utils: Device management and selection
+ - tensor_utils: Tensor operations and conversions
+ - file_utils: File I/O operations
+ - model_utils: Model-related utilities
+ - feature_utils: Feature processing utilities
+
+ Date: create on 13/11/2025
+ Last update: 03/12/2025 (refactored)
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
  from .optimizer import get_optimizer, get_scheduler
  from .initializer import get_initializer
  from .embedding import get_auto_embedding_dim
- from .common import resolve_device, to_tensor
- from . import optimizer, initializer, embedding, common
+ from .device import resolve_device, get_device_info
+ from .tensor import to_tensor, stack_tensors, concat_tensors, pad_sequence_tensors
+ from .file import resolve_file_paths, read_table, load_dataframes, iter_file_chunks, default_output_dir
+ from .model import merge_features, get_mlp_output_dim
+ from .feature import normalize_to_list
+ from . import optimizer, initializer, embedding

  __all__ = [
+     # Optimizer & Scheduler
      'get_optimizer',
      'get_scheduler',
+
+     # Initializer
      'get_initializer',
+
+     # Embedding
      'get_auto_embedding_dim',
+
+     # Device utilities
      'resolve_device',
+     'get_device_info',
+
+     # Tensor utilities
      'to_tensor',
+     'stack_tensors',
+     'concat_tensors',
+     'pad_sequence_tensors',
+
+     # File utilities
+     'resolve_file_paths',
+     'read_table',
+     'load_dataframes',
+     'iter_file_chunks',
+     'default_output_dir',
+
+     # Model utilities
+     'merge_features',
+     'get_mlp_output_dim',
+
+     # Feature utilities
+     'normalize_to_list',
+
+     # Module exports
      'optimizer',
      'initializer',
      'embedding',
-     'common',
  ]
nextrec/utils/device.py ADDED
@@ -0,0 +1,37 @@
+ """
+ Device management utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ import torch
+ import platform
+
+
+ def resolve_device() -> str:
+     if torch.cuda.is_available():
+         return "cuda"
+     if torch.backends.mps.is_available():
+         mac_ver = platform.mac_ver()[0]
+         try:
+             major, minor = (int(x) for x in mac_ver.split(".")[:2])
+         except Exception:
+             major, minor = 0, 0
+         if major >= 14:
+             return "mps"
+     return "cpu"
+
+ def get_device_info() -> dict:
+     info = {
+         'cuda_available': torch.cuda.is_available(),
+         'cuda_device_count': torch.cuda.device_count() if torch.cuda.is_available() else 0,
+         'mps_available': torch.backends.mps.is_available(),
+         'current_device': resolve_device(),
+     }
+
+     if torch.cuda.is_available():
+         info['cuda_device_name'] = torch.cuda.get_device_name(0)
+         info['cuda_capability'] = torch.cuda.get_device_capability(0)
+
+     return info
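A brief sketch of the two device helpers added above (resolve_device falls back to CPU unless CUDA, or MPS on macOS 14+, is available):

    from nextrec.utils.device import resolve_device, get_device_info

    device = resolve_device()   # "cuda", "mps", or "cpu"
    print(get_device_info())    # dict with cuda/mps availability and the resolved device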
nextrec/utils/feature.py ADDED
@@ -0,0 +1,13 @@
+ """
+ Feature processing utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ def normalize_to_list(value: str | list[str] | None) -> list[str]:
+     if value is None:
+         return []
+     if isinstance(value, str):
+         return [value]
+     return list(value)
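For reference, the behaviour of normalize_to_list on the three accepted input shapes:

    from nextrec.utils.feature import normalize_to_list

    normalize_to_list(None)             # -> []
    normalize_to_list("user_id")        # -> ["user_id"]
    normalize_to_list(["uid", "iid"])   # -> ["uid", "iid"]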
nextrec/utils/file.py ADDED
@@ -0,0 +1,70 @@
+ """
+ File I/O utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ import pandas as pd
+ import pyarrow.parquet as pq
+ from pathlib import Path
+ from typing import Generator
+
+
+ def resolve_file_paths(path: str) -> tuple[list[str], str]:
+     """
+     Resolve file or directory path into a sorted list of files and file type.
+
+     Args: path: Path to a file or directory
+     Returns: tuple: (list of file paths, file type)
+     """
+     path_obj = Path(path)
+
+     if path_obj.is_file():
+         file_type = path_obj.suffix.lower().lstrip(".")
+         assert file_type in ["csv", "parquet"], f"Unsupported file extension: {file_type}"
+         return [str(path_obj)], file_type
+
+     if path_obj.is_dir():
+         collected_files = [p for p in path_obj.iterdir() if p.is_file()]
+         csv_files = [str(p) for p in collected_files if p.suffix.lower() == ".csv"]
+         parquet_files = [str(p) for p in collected_files if p.suffix.lower() == ".parquet"]
+
+         if csv_files and parquet_files:
+             raise ValueError("Directory contains both CSV and Parquet files. Please keep a single format.")
+         file_paths = csv_files if csv_files else parquet_files
+         if not file_paths:
+             raise ValueError(f"No CSV or Parquet files found in directory: {path}")
+         file_paths.sort()
+         file_type = "csv" if csv_files else "parquet"
+         return file_paths, file_type
+
+     raise ValueError(f"Invalid path: {path}")
+
+
+ def read_table(file_path: str, file_type: str) -> pd.DataFrame:
+     if file_type == "csv":
+         return pd.read_csv(file_path)
+     return pd.read_parquet(file_path)
+
+ def load_dataframes(file_paths: list[str], file_type: str) -> list[pd.DataFrame]:
+     return [read_table(fp, file_type) for fp in file_paths]
+
+ def iter_file_chunks(
+     file_path: str,
+     file_type: str,
+     chunk_size: int
+ ) -> Generator[pd.DataFrame, None, None]:
+     if file_type == "csv":
+         yield from pd.read_csv(file_path, chunksize=chunk_size)
+         return
+     parquet_file = pq.ParquetFile(file_path)
+     for batch in parquet_file.iter_batches(batch_size=chunk_size):
+         yield batch.to_pandas()
+
+
+ def default_output_dir(path: str) -> Path:
+     path_obj = Path(path)
+     if path_obj.is_file():
+         return path_obj.parent / f"{path_obj.stem}_preprocessed"
+     return path_obj.with_name(f"{path_obj.name}_preprocessed")
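A short sketch of the file helpers above (assuming a local directory of CSV files; the path is illustrative):

    from nextrec.utils.file import resolve_file_paths, iter_file_chunks, default_output_dir

    paths, ftype = resolve_file_paths("./data/interactions")        # sorted CSV or Parquet paths plus the detected type
    for chunk in iter_file_chunks(paths[0], ftype, chunk_size=10_000):
        print(len(chunk))                                           # pandas DataFrame chunks
    out_dir = default_output_dir("./data/interactions")             # -> data/interactions_preprocessed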
nextrec/utils/initializer.py CHANGED
@@ -9,14 +9,6 @@ import torch.nn as nn


  def get_initializer(init_type='normal', activation='linear', param=None):
-     """
-     Get parameter initialization function.
-
-     Examples:
-         >>> init_fn = get_initializer('xavier_uniform', 'relu')
-         >>> init_fn(tensor)
-         >>> init_fn = get_initializer('normal', param={'mean': 0.0, 'std': 0.01})
-     """
      param = param or {}

      try:
nextrec/utils/model.py ADDED
@@ -0,0 +1,22 @@
+ """
+ Model-related utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ from collections import OrderedDict
+
+
+ def merge_features(primary, secondary) -> list:
+     merged: OrderedDict[str, object] = OrderedDict()
+     for feat in list(primary or []) + list(secondary or []):
+         merged.setdefault(feat.name, feat)
+     return list(merged.values())
+
+
+ def get_mlp_output_dim(params: dict, fallback: int) -> int:
+     dims = params.get("dims")
+     if dims:
+         return dims[-1]
+     return fallback
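A minimal sketch of the two helpers moved here from utils.common (feature objects only need a .name attribute for merge_features; the variable names are illustrative):

    from nextrec.utils.model import merge_features, get_mlp_output_dim

    merged = merge_features(user_features, item_features)                 # order-preserving, de-duplicated by feature name
    out_dim = get_mlp_output_dim({"dims": [256, 128, 64]}, fallback=32)   # -> 64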
nextrec/utils/optimizer.py CHANGED
@@ -8,25 +8,16 @@ Author: Yang Zhou, zyaztec@gmail.com
  import torch
  from typing import Iterable

-
  def get_optimizer(
      optimizer: str | torch.optim.Optimizer = "adam",
      params: Iterable[torch.nn.Parameter] | None = None,
      **optimizer_params
  ):
-     """
-     Get optimizer function based on optimizer name or instance.
-
-     Examples:
-         >>> optimizer = get_optimizer("adam", model.parameters(), lr=1e-3)
-         >>> optimizer = get_optimizer("sgd", model.parameters(), lr=0.01, momentum=0.9)
-     """
      if params is None:
          raise ValueError("params cannot be None. Please provide model parameters.")

      if 'lr' not in optimizer_params:
          optimizer_params['lr'] = 1e-3
-
      if isinstance(optimizer, str):
          opt_name = optimizer.lower()
          if opt_name == "adam":
@@ -42,27 +33,17 @@ def get_optimizer(
          else:
              raise NotImplementedError(f"Unsupported optimizer: {optimizer}")
          optimizer_fn = opt_class(params=params, **optimizer_params)
-
      elif isinstance(optimizer, torch.optim.Optimizer):
          optimizer_fn = optimizer
      else:
          raise TypeError(f"Invalid optimizer type: {type(optimizer)}")
-
      return optimizer_fn

-
  def get_scheduler(
      scheduler: str | torch.optim.lr_scheduler._LRScheduler | torch.optim.lr_scheduler.LRScheduler | type[torch.optim.lr_scheduler._LRScheduler] | type[torch.optim.lr_scheduler.LRScheduler] | None,
      optimizer,
      **scheduler_params
  ):
-     """
-     Get learning rate scheduler function.
-
-     Examples:
-         >>> scheduler = get_scheduler("step", optimizer, step_size=10, gamma=0.1)
-         >>> scheduler = get_scheduler("cosine", optimizer, T_max=100)
-     """
      if isinstance(scheduler, str):
          if scheduler == "step":
              scheduler_fn = torch.optim.lr_scheduler.StepLR(optimizer, **scheduler_params)
nextrec/utils/tensor.py ADDED
@@ -0,0 +1,61 @@
+ """
+ Tensor manipulation utilities for NextRec
+
+ Date: create on 03/12/2025
+ Author: Yang Zhou, zyaztec@gmail.com
+ """
+
+ import torch
+ import numpy as np
+ from typing import Any
+
+
+ def to_tensor(
+     value: Any,
+     dtype: torch.dtype,
+     device: torch.device | str | None = None
+ ) -> torch.Tensor:
+     if value is None:
+         raise ValueError("[Tensor Utils Error] Cannot convert None to tensor.")
+     tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
+     if tensor.dtype != dtype:
+         tensor = tensor.to(dtype=dtype)
+
+     if device is not None:
+         target_device = device if isinstance(device, torch.device) else torch.device(device)
+         if tensor.device != target_device:
+             tensor = tensor.to(target_device)
+     return tensor
+
+ def stack_tensors(tensors: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
+     if not tensors:
+         raise ValueError("[Tensor Utils Error] Cannot stack empty list of tensors.")
+     return torch.stack(tensors, dim=dim)
+
+ def concat_tensors(tensors: list[torch.Tensor], dim: int = 0) -> torch.Tensor:
+     if not tensors:
+         raise ValueError("[Tensor Utils Error] Cannot concatenate empty list of tensors.")
+     return torch.cat(tensors, dim=dim)
+
+ def pad_sequence_tensors(
+     tensors: list[torch.Tensor],
+     max_len: int | None = None,
+     padding_value: float = 0.0,
+     padding_side: str = 'right'
+ ) -> torch.Tensor:
+     if not tensors:
+         raise ValueError("[Tensor Utils Error] Cannot pad empty list of tensors.")
+     if max_len is None:
+         max_len = max(t.size(0) for t in tensors)
+     batch_size = len(tensors)
+     padded = torch.full((batch_size, max_len), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)
+
+     for i, tensor in enumerate(tensors):
+         length = min(tensor.size(0), max_len)
+         if padding_side == 'right':
+             padded[i, :length] = tensor[:length]
+         elif padding_side == 'left':
+             padded[i, -length:] = tensor[:length]
+         else:
+             raise ValueError(f"[Tensor Utils Error] padding_side must be 'right' or 'left', got {padding_side}")
+     return padded
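A small sketch of to_tensor and pad_sequence_tensors as defined above (toy sequences of different lengths):

    import torch
    from nextrec.utils.tensor import to_tensor, pad_sequence_tensors

    x = to_tensor([1, 2, 3], dtype=torch.float32, device="cpu")   # tensor([1., 2., 3.])
    padded = pad_sequence_tensors([torch.tensor([1, 2, 3]), torch.tensor([4])], padding_value=0)
    # -> tensor([[1, 2, 3], [4, 0, 0]])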
{nextrec-0.3.4.dist-info → nextrec-0.3.5.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: nextrec
- Version: 0.3.4
+ Version: 0.3.5
  Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
  Project-URL: Homepage, https://github.com/zerolovesea/NextRec
  Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -63,7 +63,7 @@ Description-Content-Type: text/markdown
  ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
  ![PyTorch](https://img.shields.io/badge/PyTorch-1.10+-ee4c2c.svg)
  ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)
- ![Version](https://img.shields.io/badge/Version-0.3.4-orange.svg)
+ ![Version](https://img.shields.io/badge/Version-0.3.5-orange.svg)

  English | [中文文档](README_zh.md)

@@ -110,7 +110,7 @@ To dive deeper, Jupyter notebooks are available:
  - [Hands on the NextRec framework](/tutorials/notebooks/en/Hands%20on%20nextrec.ipynb)
  - [Using the data processor for preprocessing](/tutorials/notebooks/en/Hands%20on%20dataprocessor.ipynb)

- > Current version [0.3.4]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.
+ > Current version [0.3.5]: the matching module is not fully polished yet and may have compatibility issues or unexpected errors. Please raise an issue if you run into problems.

  ## 5-Minute Quick Start

{nextrec-0.3.4.dist-info → nextrec-0.3.5.dist-info}/RECORD RENAMED
@@ -1,18 +1,20 @@
  nextrec/__init__.py,sha256=CvocnY2uBp0cjNkhrT6ogw0q2bN9s1GNp754FLO-7lo,1117
- nextrec/__version__.py,sha256=oYLGMpySamd16KLiaBTfRyrAS7_oyp-TOEHmzmeumwg,22
+ nextrec/__version__.py,sha256=ThnCuF3X7rsQSd5PAea_jfYA70ZmhLvkFcLBxBPwZnY,22
  nextrec/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  nextrec/basic/activation.py,sha256=1qs9pq4hT3BUxIiYdYs57axMCm4-JyOBFQ6x7xkHTwM,2849
  nextrec/basic/callback.py,sha256=wwh0I2kKYyywCB-sG9eQXShlpXFJIo75qApJmnI5p6c,1036
- nextrec/basic/features.py,sha256=-RRRbEPU-SFI-GtppflW6O0bKShUsV-Hg_lTGpo3AIE,4262
+ nextrec/basic/features.py,sha256=DFwYjG13GYHOujS_CMKa7Qrux9faF7MQNoaoRDF_Eks,4263
  nextrec/basic/layers.py,sha256=zzEseKYVnMVs1Tg5EGrFimugId15jI6HumgzjFyRqgw,23127
  nextrec/basic/loggers.py,sha256=hh9tRMmaCTaJ_sfRHIlbcqd6BcpK63vpZ_21TFCiKLI,6148
  nextrec/basic/metrics.py,sha256=8-hMZJXU5L4F8GnToxMZey5dlBrtFyRtTuI_zoQCtIo,21579
- nextrec/basic/model.py,sha256=afnvicyxXMgWdvhrIUaoNnZ7S-QYRYr7fTY5bdM1u_s,68829
- nextrec/basic/session.py,sha256=oaATn-nzbJ9A6SGbMut9xLV_NSh9_1KmVDeNauS06Ps,4767
- nextrec/data/__init__.py,sha256=6WgXZafzzXcv5kuxKNi67O8BJZVl_P_HM2IZCDIIhPA,1052
- nextrec/data/data_utils.py,sha256=aOyja3Yu7O2c8eIeL3P8MyUlUR5EerOUT9UeF4ATq8o,10574
- nextrec/data/dataloader.py,sha256=2MLe69y0E1cTZyzMNgyLUCxa6lllGd1ntvwpXzxdX10,14199
- nextrec/data/preprocessor.py,sha256=lhigpjvkEqsjTRfbBBOjgGOxoPyOifwq2LoswgyIVqc,40488
+ nextrec/basic/model.py,sha256=THzpEb6uIRp4xNjAQz0Xdwsqbh3jewN97L5_Ps6qyeo,68902
+ nextrec/basic/session.py,sha256=o-O7QMDAGjPiRBZaiYDy629xppfpiGqCWXpPrC4Y-_c,4337
+ nextrec/data/__init__.py,sha256=XBEOUH4EbVgGjBgxPSw15nSR7vtB_1qCxge5Lt7uJ7o,1924
+ nextrec/data/batch_utils.py,sha256=6G-E85H-PqYJ20EYVLnC3MqC8xYrXzZ1XYe82MhRPck,2816
+ nextrec/data/data_processing.py,sha256=N3Uk4NsUCyLeoMDV1zeLmH-dP02I-cRWDo-vvQgLqjo,5006
+ nextrec/data/data_utils.py,sha256=-3xLPW3csOiGNmj0kzzpOkCxZyu09RNBgfPkwX7nDAc,1172
+ nextrec/data/dataloader.py,sha256=sXyUv8rRE7P2bsoTZebLBTLErPWBJw5OacZ106m9Unk,14288
+ nextrec/data/preprocessor.py,sha256=_A3eEc1MpUGDEpno1TToA-dyJ_k707Mr3GilTi_9j5I,40419
  nextrec/loss/__init__.py,sha256=mO5t417BneZ8Ysa51GyjDaffjWyjzFgPXIQrrggasaQ,827
  nextrec/loss/listwise.py,sha256=gxDbO1td5IeS28jKzdE35o1KAYBRdCYoMzyZzfNLhc0,5689
  nextrec/loss/loss_utils.py,sha256=uZ4m9ChLr-UgIc5Yxm1LjwXDDepApQ-Fas8njweZ9qg,2641
@@ -30,7 +32,7 @@ nextrec/models/match/youtube_dnn.py,sha256=Wa5JWrlIpMuBoyXpnBrdnm1nQ8ZO_XcR517zf
  nextrec/models/multi_task/esmm.py,sha256=Ho5UN2H9H9-ZYML6eqpBYTVdTO4Ja9AoYP5SSgsgQaw,6442
  nextrec/models/multi_task/mmoe.py,sha256=zfBAUoQijHCuat962dZI0MCAy8C6PZqZ-zOd16JznF8,7803
  nextrec/models/multi_task/ple.py,sha256=zNBea0sfJska36RVH1N9O92m7rPmbaWYqoPbnGoy1RE,11949
- nextrec/models/multi_task/poso.py,sha256=_yLiCkD3NhOZEOWx-jP4MJxSEdNCu3mqeo_XRt8CWts,16652
+ nextrec/models/multi_task/poso.py,sha256=_Pq-cl7HB1uQVO8HXreNeVpQso250ouxBNTsdTjyFos,16651
  nextrec/models/multi_task/share_bottom.py,sha256=kvrkXQSTDPEwwmBvXw3xryBm3gT8Uq4_Hb3TenwRj9w,5920
  nextrec/models/ranking/__init__.py,sha256=AY806x-2BtltQdlR4wu23-keL9YUe3An92OJshS4t9Y,472
  nextrec/models/ranking/afm.py,sha256=uFSUIv9d6NQkCiM2epmSdMy4kxjFuCRVbrZOv3nebGE,4539
@@ -46,12 +48,16 @@ nextrec/models/ranking/masknet.py,sha256=9K6XKcr8f8PcVhLfgFd8l4tq78lcclAQAXZKlVE
  nextrec/models/ranking/pnn.py,sha256=eEyBnALuzaNx27iGJ0ZqNcf0u7dKN8SiO03lkcv1hiE,4956
  nextrec/models/ranking/widedeep.py,sha256=AJPkoThUTSBGPNBjD-aiWsMH2hSiSnGLjIPy_2neNhc,5034
  nextrec/models/ranking/xdeepfm.py,sha256=wn6YnX78EyBzil7IRBcqyDqsnysERVJ5-lWGuRMCpxE,5681
- nextrec/utils/__init__.py,sha256=ciw6B9SXffjSb4cwco-WXpKSE7M9D6ILpLZ2oftwj6A,457
- nextrec/utils/common.py,sha256=NYXnBVtUCtm8epT2ZxJHn_m1SIBBI_PEjZ5VpL465ls,2009
+ nextrec/utils/__init__.py,sha256=lAVpHsGe_WgGf7R-K1wr0DeVLvskG0Bj1L12N6kEPwM,1810
+ nextrec/utils/device.py,sha256=nos-J5VTe2hyaqiZ7D8q1k8l1KwORQ0bISI485Jdqnw,1012
  nextrec/utils/embedding.py,sha256=yxYSdFx0cJITh3Gf-K4SdhwRtKGcI0jOsyBgZ0NLa_c,465
- nextrec/utils/initializer.py,sha256=ffYOs5QuIns_d_-5e40iNtg6s1ftgREJN-ueq_NbDQE,1647
- nextrec/utils/optimizer.py,sha256=EUjAGFPeyou_Cv-_2HRvjzut8y_qpAQudc8L2T0k8zw,2706
- nextrec-0.3.4.dist-info/METADATA,sha256=X5fo5gymQdPXLgM1N03E58uFSQyuQOmdbUp8vXvKl0g,16319
- nextrec-0.3.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- nextrec-0.3.4.dist-info/licenses/LICENSE,sha256=2fQfVKeafywkni7MYHyClC6RGGC3laLTXCNBx-ubtp0,1064
- nextrec-0.3.4.dist-info/RECORD,,
+ nextrec/utils/feature.py,sha256=s0eMEuvbOsotjll7eSYjb0b-1cXnvVy1mSI1Syg_7n4,299
+ nextrec/utils/file.py,sha256=wxKvd1_U9ugFDP7EzLNG6-3PBInA0QhxoHzBWKfe_B8,2384
+ nextrec/utils/initializer.py,sha256=BkP6-vJdsc0A-8ya-AVEs7W24dPXyxIilNnckwXgPEc,1391
+ nextrec/utils/model.py,sha256=FB7QbatO0uEvghBEfByJtRS0waaBEB1UI0YzfA_2k04,535
+ nextrec/utils/optimizer.py,sha256=cVkDrEkxwig17UAEhL8p9v3iVNiXI8B067Yf_6LqUp8,2198
+ nextrec/utils/tensor.py,sha256=_RibR6BMPizhzRLVdnJqwUgzA0zpzkZuKfTrdSjbL60,2136
+ nextrec-0.3.5.dist-info/METADATA,sha256=uZAs7fg2m4UtVWWoxlqecC8a7KzfqSdQbVExo88L1kM,16319
+ nextrec-0.3.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ nextrec-0.3.5.dist-info/licenses/LICENSE,sha256=2fQfVKeafywkni7MYHyClC6RGGC3laLTXCNBx-ubtp0,1064
+ nextrec-0.3.5.dist-info/RECORD,,
nextrec/utils/common.py DELETED
@@ -1,60 +0,0 @@
- import torch
- import platform
- from collections import OrderedDict
-
-
- def resolve_device() -> str:
-     """Select a usable device with graceful fallback."""
-     if torch.cuda.is_available():
-         return "cuda"
-     if torch.backends.mps.is_available():
-         mac_ver = platform.mac_ver()[0]
-         try:
-             major, minor = (int(x) for x in mac_ver.split(".")[:2])
-         except Exception:
-             major, minor = 0, 0
-         if major >= 14:
-             return "mps"
-     return "cpu"
-
-
- def normalize_to_list(value: str | list[str] | None) -> list[str]:
-     if value is None:
-         return []
-     if isinstance(value, str):
-         return [value]
-     return list(value)
-
-
- def merge_features(primary, secondary) -> list:
-     """
-     Merge two feature lists while preserving order and deduplicating by feature name.
-     Later duplicates are skipped.
-     """
-     merged: OrderedDict[str, object] = OrderedDict()
-     for feat in list(primary or []) + list(secondary or []):
-         merged.setdefault(feat.name, feat)
-     return list(merged.values())
-
- def get_mlp_output_dim(params: dict, fallback: int) -> int:
-     """
-     Get the output dimension of an MLP-like config.
-     If dims are provided, use the last dim; otherwise fall back to input dim.
-     """
-     dims = params.get("dims")
-     if dims:
-         return dims[-1]
-     return fallback
-
- def to_tensor(value, dtype: torch.dtype, device: torch.device | str | None = None) -> torch.Tensor:
-     """Convert any value to a tensor with the desired dtype/device."""
-     if value is None:
-         raise ValueError("[Tensor Utils Error] Cannot convert None to tensor.")
-     tensor = value if isinstance(value, torch.Tensor) else torch.as_tensor(value)
-     if tensor.dtype != dtype:
-         tensor = tensor.to(dtype=dtype)
-     if device is not None:
-         target_device = device if isinstance(device, torch.device) else torch.device(device)
-         if tensor.device != target_device:
-             tensor = tensor.to(target_device)
-     return tensor
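Note on the removal above: code that previously imported from nextrec.utils.common needs to switch to the split modules in 0.3.5, roughly as follows (all target paths appear elsewhere in this diff):

    # before (0.3.4)
    # from nextrec.utils.common import resolve_device, to_tensor, merge_features, normalize_to_list
    # after (0.3.5)
    from nextrec.utils.device import resolve_device
    from nextrec.utils.tensor import to_tensor
    from nextrec.utils.model import merge_features
    from nextrec.utils.feature import normalize_to_list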