nextrec 0.4.9__py3-none-any.whl → 0.4.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nextrec/__version__.py +1 -1
- nextrec/basic/model.py +4 -3
- nextrec/cli.py +181 -34
- nextrec/data/dataloader.py +19 -20
- nextrec/models/ranking/deepfm.py +4 -5
- nextrec/models/ranking/eulernet.py +365 -0
- nextrec/models/ranking/lr.py +120 -0
- {nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/METADATA +5 -6
- {nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/RECORD +12 -12
- {nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/WHEEL +0 -0
- {nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/entry_points.txt +0 -0
- {nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/licenses/LICENSE +0 -0
nextrec/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.9"
+__version__ = "0.4.10"
nextrec/basic/model.py
CHANGED
@@ -1376,7 +1376,7 @@ class BaseModel(FeatureSet, nn.Module):
                 data=data,
                 batch_size=batch_size,
                 shuffle=False,
-
+                streaming=True,
                 chunk_size=streaming_chunk_size,
             )
         else:
@@ -1510,7 +1510,7 @@ class BaseModel(FeatureSet, nn.Module):
                 data=data,
                 batch_size=batch_size,
                 shuffle=False,
-
+                streaming=True,
                 chunk_size=streaming_chunk_size,
             )
         elif not isinstance(data, DataLoader):
@@ -1605,7 +1605,8 @@ class BaseModel(FeatureSet, nn.Module):
                 if collected_frames
                 else pd.DataFrame(columns=pred_columns or [])
             )
-
+            # Return the actual save path when not returning dataframe
+            return target_path

     def save_model(
         self,
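The change above means BaseModel's prediction path passes streaming=True to its dataloader and, when return_dataframe=False, returns the resolved save path instead of only the empty dataframe. A minimal, hypothetical sketch of how a caller might use that return value — the helper name and file names below are illustrative only, and `model` is assumed to be an already trained NextRec model:

    from pathlib import Path

    def run_batch_prediction(model, data_path: str) -> Path | None:
        # Hypothetical helper; keyword arguments follow the predict() call shown in the cli.py diff below.
        result = model.predict(
            data=data_path,              # illustrative input file
            batch_size=512,
            return_dataframe=False,      # with this flag, 0.4.10 returns the actual save path
            save_path="pred.csv",
            save_format="csv",
        )
        return result if isinstance(result, Path) else None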
nextrec/cli.py
CHANGED
@@ -29,7 +29,7 @@ from typing import Any, Dict, List
 import pandas as pd
 
 from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
-from nextrec.basic.loggers import setup_logger
+from nextrec.basic.loggers import colorize, format_kv, setup_logger
 from nextrec.data.data_utils import split_dict_random
 from nextrec.data.dataloader import RecDataLoader
 from nextrec.data.preprocessor import DataProcessor
@@ -52,6 +52,17 @@ from nextrec.utils.feature import normalize_to_list
 logger = logging.getLogger(__name__)
 
 
+def log_cli_section(title: str) -> None:
+    logger.info("")
+    logger.info(colorize(f"[{title}]", color="bright_blue", bold=True))
+    logger.info(colorize("-" * 80, color="bright_blue"))
+
+
+def log_kv_lines(items: list[tuple[str, Any]]) -> None:
+    for label, value in items:
+        logger.info(format_kv(label, value))
+
+
 def train_model(train_config_path: str) -> None:
     """
     Train a NextRec model using the provided configuration file.
@@ -74,8 +85,17 @@ def train_model(train_config_path: str) -> None:
     artifact_root = Path(session_cfg.get("artifact_root", "nextrec_logs"))
     session_dir = artifact_root / session_id
     setup_logger(session_id=session_id)
-
-
+
+    log_cli_section("CLI")
+    log_kv_lines(
+        [
+            ("Mode", "train"),
+            ("Version", get_nextrec_version()),
+            ("Session ID", session_id),
+            ("Artifacts", session_dir.resolve()),
+            ("Config", config_file.resolve()),
+            ("Command", " ".join(sys.argv)),
+        ]
     )
 
     processor_path = session_dir / "processor.pkl"
@@ -102,11 +122,53 @@ def train_model(train_config_path: str) -> None:
         cfg.get("model_config", "model_config.yaml"), config_dir
     )
 
+    log_cli_section("Config")
+    log_kv_lines(
+        [
+            ("Train config", config_file.resolve()),
+            ("Feature config", feature_cfg_path),
+            ("Model config", model_cfg_path),
+        ]
+    )
+
     feature_cfg = read_yaml(feature_cfg_path)
     model_cfg = read_yaml(model_cfg_path)
 
+    # Extract id_column from data config for GAUC metrics
+    id_column = data_cfg.get("id_column") or data_cfg.get("user_id_column")
+    id_columns = [id_column] if id_column else []
+
+    log_cli_section("Data")
+    log_kv_lines(
+        [
+            ("Data path", data_path),
+            ("Format", data_cfg.get("format", "auto")),
+            ("Streaming", streaming),
+            ("Target", target),
+            ("ID column", id_column or "(not set)"),
+        ]
+    )
+    if data_cfg.get("valid_ratio") is not None:
+        logger.info(format_kv("Valid ratio", data_cfg.get("valid_ratio")))
+    if data_cfg.get("val_path") or data_cfg.get("valid_path"):
+        logger.info(
+            format_kv(
+                "Validation path",
+                resolve_path(
+                    data_cfg.get("val_path") or data_cfg.get("valid_path"), config_dir
+                ),
+            )
+        )
+
     if streaming:
         file_paths, file_type = resolve_file_paths(str(data_path))
+        log_kv_lines(
+            [
+                ("File type", file_type),
+                ("Files", len(file_paths)),
+                ("Chunk size", dataloader_chunk_size),
+            ]
+        )
         first_file = file_paths[0]
         first_chunk_size = max(1, min(dataloader_chunk_size, 1000))
         chunk_iter = iter_file_chunks(first_file, file_type, first_chunk_size)
@@ -118,14 +180,12 @@ def train_model(train_config_path: str) -> None:
 
     else:
         df = read_table(data_path, data_cfg.get("format"))
+        logger.info(format_kv("Rows", len(df)))
+        logger.info(format_kv("Columns", len(df.columns)))
         df_columns = list(df.columns)
 
     dense_names, sparse_names, sequence_names = select_features(feature_cfg, df_columns)
 
-    # Extract id_column from data config for GAUC metrics
-    id_column = data_cfg.get("id_column") or data_cfg.get("user_id_column")
-    id_columns = [id_column] if id_column else []
-
     used_columns = dense_names + sparse_names + sequence_names + target + id_columns
 
     # keep order but drop duplicates
@@ -141,6 +201,17 @@ def train_model(train_config_path: str) -> None:
         processor, feature_cfg, dense_names, sparse_names, sequence_names
     )
 
+    log_cli_section("Features")
+    log_kv_lines(
+        [
+            ("Dense features", len(dense_names)),
+            ("Sparse features", len(sparse_names)),
+            ("Sequence features", len(sequence_names)),
+            ("Targets", len(target)),
+            ("Used columns", len(unique_used_columns)),
+        ]
+    )
+
     if streaming:
         processor.fit(str(data_path), chunk_size=dataloader_chunk_size)
         processed = None
@@ -244,7 +315,7 @@ def train_model(train_config_path: str) -> None:
             data=train_stream_source,
             batch_size=dataloader_cfg.get("train_batch_size", 512),
             shuffle=dataloader_cfg.get("train_shuffle", True),
-
+            streaming=True,
             chunk_size=dataloader_chunk_size,
             num_workers=dataloader_cfg.get("num_workers", 0),
         )
@@ -255,7 +326,7 @@ def train_model(train_config_path: str) -> None:
             data=str(val_data_resolved),
             batch_size=dataloader_cfg.get("valid_batch_size", 512),
             shuffle=dataloader_cfg.get("valid_shuffle", False),
-
+            streaming=True,
             chunk_size=dataloader_chunk_size,
             num_workers=dataloader_cfg.get("num_workers", 0),
         )
@@ -264,7 +335,7 @@ def train_model(train_config_path: str) -> None:
             data=streaming_valid_files,
             batch_size=dataloader_cfg.get("valid_batch_size", 512),
             shuffle=dataloader_cfg.get("valid_shuffle", False),
-
+            streaming=True,
             chunk_size=dataloader_chunk_size,
             num_workers=dataloader_cfg.get("num_workers", 0),
         )
@@ -295,6 +366,15 @@ def train_model(train_config_path: str) -> None:
         device,
     )
 
+    log_cli_section("Model")
+    log_kv_lines(
+        [
+            ("Model", model.__class__.__name__),
+            ("Device", device),
+            ("Session ID", session_id),
+        ]
+    )
+
     model.compile(
         optimizer=train_cfg.get("optimizer", "adam"),
         optimizer_params=train_cfg.get("optimizer_params", {}),
@@ -325,13 +405,30 @@ def predict_model(predict_config_path: str) -> None:
     config_dir = config_file.resolve().parent
     cfg = read_yaml(config_file)
 
-
-
-
-
+    # Checkpoint path is the primary configuration
+    if "checkpoint_path" not in cfg:
+        session_cfg = cfg.get("session", {}) or {}
+        session_id = session_cfg.get("id", "nextrec_session")
+        artifact_root = Path(session_cfg.get("artifact_root", "nextrec_logs"))
+        session_dir = artifact_root / session_id
+    else:
+        session_dir = Path(cfg["checkpoint_path"])
+        # Auto-infer session_id from checkpoint directory name
+        session_cfg = cfg.get("session", {}) or {}
+        session_id = session_cfg.get("id") or session_dir.name
+
     setup_logger(session_id=session_id)
-
-
+
+    log_cli_section("CLI")
+    log_kv_lines(
+        [
+            ("Mode", "predict"),
+            ("Version", get_nextrec_version()),
+            ("Session ID", session_id),
+            ("Checkpoint", session_dir.resolve()),
+            ("Config", config_file.resolve()),
+            ("Command", " ".join(sys.argv)),
+        ]
     )
 
     processor_path = Path(session_dir / "processor.pkl")
@@ -339,24 +436,38 @@ def predict_model(predict_config_path: str) -> None:
         processor_path = session_dir / "processor" / "processor.pkl"
 
     predict_cfg = cfg.get("predict", {}) or {}
-
-
-
-
-
-
+
+    # Auto-find model_config in checkpoint directory if not specified
+    if "model_config" in cfg:
+        model_cfg_path = resolve_path(cfg["model_config"], config_dir)
+    else:
+        # Try to find model_config.yaml in checkpoint directory
+        auto_model_cfg = session_dir / "model_config.yaml"
+        if auto_model_cfg.exists():
+            model_cfg_path = auto_model_cfg
+        else:
+            # Fallback to config directory
+            model_cfg_path = resolve_path("model_config.yaml", config_dir)
 
     model_cfg = read_yaml(model_cfg_path)
-    # feature_cfg = read_yaml(feature_cfg_path)
     model_cfg.setdefault("session_id", session_id)
     model_cfg.setdefault("params", {})
 
+    log_cli_section("Config")
+    log_kv_lines(
+        [
+            ("Predict config", config_file.resolve()),
+            ("Model config", model_cfg_path),
+            ("Processor", processor_path),
+        ]
+    )
+
     processor = DataProcessor.load(processor_path)
 
     # Load checkpoint and ensure required parameters are passed
     checkpoint_base = Path(session_dir)
     if checkpoint_base.is_dir():
-        candidates = sorted(checkpoint_base.glob("*.
+        candidates = sorted(checkpoint_base.glob("*.pt"))
         if not candidates:
             raise FileNotFoundError(
                 f"[NextRec CLI Error]: Unable to find model checkpoint: {checkpoint_base}"
@@ -365,7 +476,7 @@ def predict_model(predict_config_path: str) -> None:
         config_dir_for_features = checkpoint_base
     else:
         model_file = (
-            checkpoint_base.with_suffix(".
+            checkpoint_base.with_suffix(".pt")
             if checkpoint_base.suffix == ""
             else checkpoint_base
         )
@@ -415,40 +526,78 @@ def predict_model(predict_config_path: str) -> None:
         id_columns = [predict_cfg["id_column"]]
         model.id_columns = id_columns
 
+    effective_id_columns = id_columns or model.id_columns
+    log_cli_section("Features")
+    log_kv_lines(
+        [
+            ("Dense features", len(dense_features)),
+            ("Sparse features", len(sparse_features)),
+            ("Sequence features", len(sequence_features)),
+            ("Targets", len(target_cols)),
+            ("ID columns", len(effective_id_columns)),
+        ]
+    )
+
+    log_cli_section("Model")
+    log_kv_lines(
+        [
+            ("Model", model.__class__.__name__),
+            ("Checkpoint", model_file),
+            ("Device", predict_cfg.get("device", "cpu")),
+        ]
+    )
+
     rec_dataloader = RecDataLoader(
         dense_features=model.dense_features,
         sparse_features=model.sparse_features,
         sequence_features=model.sequence_features,
         target=None,
-        id_columns=
+        id_columns=effective_id_columns,
         processor=processor,
     )
 
     data_path = resolve_path(predict_cfg["data_path"], config_dir)
     batch_size = predict_cfg.get("batch_size", 512)
 
+    log_cli_section("Data")
+    log_kv_lines(
+        [
+            ("Data path", data_path),
+            ("Format", predict_cfg.get("source_data_format", predict_cfg.get("data_format", "auto"))),
+            ("Batch size", batch_size),
+            ("Chunk size", predict_cfg.get("chunk_size", 20000)),
+            ("Streaming", predict_cfg.get("streaming", True)),
+        ]
+    )
+    logger.info("")
     pred_loader = rec_dataloader.create_dataloader(
         data=str(data_path),
         batch_size=batch_size,
         shuffle=False,
-
+        streaming=predict_cfg.get("streaming", True),
         chunk_size=predict_cfg.get("chunk_size", 20000),
     )
 
-
-
+    # Build output path: {checkpoint_path}/predictions/{name}.{save_data_format}
+    save_format = predict_cfg.get("save_data_format", predict_cfg.get("save_format", "csv"))
+    pred_name = predict_cfg.get("name", "pred")
+    # Pass filename with extension to let model.predict handle path resolution
+    save_path = f"{pred_name}.{save_format}"
 
     start = time.time()
-
+    logger.info("")
+    result = model.predict(
         data=pred_loader,
         batch_size=batch_size,
         include_ids=bool(id_columns),
         return_dataframe=False,
-        save_path=
-        save_format=
+        save_path=save_path,
+        save_format=save_format,
         num_workers=predict_cfg.get("num_workers", 0),
     )
     duration = time.time() - start
+    # When return_dataframe=False, result is the actual file path
+    output_path = result if isinstance(result, Path) else checkpoint_base / "predictions" / save_path
     logger.info(f"Prediction completed, results saved to: {output_path}")
     logger.info(f"Total time: {duration:.2f} seconds")
 
@@ -492,8 +641,6 @@ Examples:
     parser.add_argument("--predict_config", help="Prediction configuration file path")
     args = parser.parse_args()
 
-    logger.info(get_nextrec_version())
-
     if not args.mode:
         parser.error("[NextRec CLI Error] --mode is required (train|predict)")
 
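In the predict flow above, checkpoint_path is now the primary setting: the session id is inferred from the checkpoint directory name, model_config.yaml is looked up inside that directory when not given explicitly, and results land in {checkpoint_path}/predictions/{name}.{save_data_format}. A hedged sketch of a predict configuration that exercises this flow — the key names mirror the cfg.get(...) calls in the diff, while the concrete paths and values are made up for illustration:

    # Illustrative only: these keys would normally live in the YAML file passed via
    # `nextrec --mode=predict --predict_config=...`.
    predict_config = {
        "checkpoint_path": "nextrec_logs/my_session",  # session_id inferred from the directory name
        # "model_config" may be omitted: the CLI first looks for model_config.yaml in the checkpoint dir
        "predict": {
            "data_path": "data/test.csv",              # hypothetical input file
            "batch_size": 512,
            "streaming": True,
            "chunk_size": 20000,
            "save_data_format": "csv",
            "name": "pred",                            # output: {checkpoint_path}/predictions/pred.csv
        },
    }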
nextrec/data/dataloader.py
CHANGED
@@ -102,9 +102,8 @@ class FileDataset(FeatureSet, IterableDataset):
         self.current_file_index = 0
         for file_path in self.file_paths:
             self.current_file_index += 1
-
-
-            logging.info(f"Processing file: {file_name}")
+            # Don't log file processing here to avoid interrupting progress bars
+            # File information is already displayed in the CLI data section
             if self.file_type == "csv":
                 yield from self.read_csv_chunks(file_path)
             elif self.file_type == "parquet":
@@ -190,7 +189,7 @@ class RecDataLoader(FeatureSet):
         ),
         batch_size: int = 32,
         shuffle: bool = True,
-
+        streaming: bool = False,
         chunk_size: int = 10000,
         num_workers: int = 0,
         sampler=None,
@@ -202,7 +201,7 @@
             data: Data source, can be a dict, pd.DataFrame, file path (str), or existing DataLoader.
             batch_size: Batch size for DataLoader.
             shuffle: Whether to shuffle the data (ignored in streaming mode).
-
+            streaming: If True, use streaming mode for large files; if False, load full data into memory.
             chunk_size: Chunk size for streaming mode (number of rows per chunk).
             num_workers: Number of worker processes for data loading.
             sampler: Optional sampler for DataLoader, only used for distributed training.
@@ -217,7 +216,7 @@
                 path=data,
                 batch_size=batch_size,
                 shuffle=shuffle,
-
+                streaming=streaming,
                 chunk_size=chunk_size,
                 num_workers=num_workers,
             )
@@ -230,7 +229,7 @@
                 path=data,
                 batch_size=batch_size,
                 shuffle=shuffle,
-
+                streaming=streaming,
                 chunk_size=chunk_size,
                 num_workers=num_workers,
             )
@@ -290,7 +289,7 @@
         path: str | os.PathLike | list[str] | list[os.PathLike],
         batch_size: int,
         shuffle: bool,
-
+        streaming: bool,
         chunk_size: int = 10000,
         num_workers: int = 0,
     ) -> DataLoader:
@@ -311,8 +310,17 @@
                     f"[RecDataLoader Error] Unsupported file extension in list: {suffix}"
                 )
             file_type = "csv" if suffix == ".csv" else "parquet"
+        if streaming:
+            return self.load_files_streaming(
+                file_paths,
+                file_type,
+                batch_size,
+                chunk_size,
+                shuffle,
+                num_workers=num_workers,
+            )
         # Load full data into memory
-
+        else:
             dfs = []
             total_bytes = 0
             for file_path in file_paths:
@@ -325,26 +333,17 @@
                     dfs.append(df)
                 except MemoryError as exc:
                     raise MemoryError(
-                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using
+                        f"[RecDataLoader Error] Out of memory while reading {file_path}. Consider using streaming=True."
                     ) from exc
             try:
                 combined_df = pd.concat(dfs, ignore_index=True)
             except MemoryError as exc:
                 raise MemoryError(
-                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use
+                    f"[RecDataLoader Error] Out of memory while concatenating loaded data (approx {total_bytes / (1024**3):.2f} GB). Use streaming=True or reduce chunk_size."
                 ) from exc
             return self.create_from_memory(
                 combined_df, batch_size, shuffle, num_workers=num_workers
             )
-        else:
-            return self.load_files_streaming(
-                file_paths,
-                file_type,
-                batch_size,
-                chunk_size,
-                shuffle,
-                num_workers=num_workers,
-            )
 
     def load_files_streaming(
         self,
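With the explicit streaming flag above, RecDataLoader.create_dataloader either iterates file chunks or concatenates everything in memory. A minimal sketch, assuming a RecDataLoader has already been constructed with the appropriate features and processor; the argument names follow the signature in the diff, while the paths and sizes are arbitrary:

    def build_loaders(rec_dataloader, train_path: str, test_path: str):
        # `rec_dataloader` is assumed to be a configured nextrec RecDataLoader instance.
        train_loader = rec_dataloader.create_dataloader(
            data=train_path,
            batch_size=512,
            shuffle=True,
            streaming=True,    # stream file chunks instead of loading everything at once
            chunk_size=20000,
        )
        test_loader = rec_dataloader.create_dataloader(
            data=test_path,
            batch_size=512,
            shuffle=False,
            streaming=False,   # load the full file into memory
        )
        return train_loader, test_loader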
nextrec/models/ranking/deepfm.py
CHANGED
@@ -1,12 +1,11 @@
 """
 Date: create on 27/10/2025
 Checkpoint: edit on 24/11/2025
-Author:
-    Yang Zhou,zyaztec@gmail.com
+Author: Yang Zhou,zyaztec@gmail.com
 Reference:
-
-
-
+    [1] Guo H, Tang R, Ye Y, et al. DeepFM: A factorization-machine based neural network
+    for CTR prediction[J]. arXiv preprint arXiv:1703.04247, 2017.
+    (https://arxiv.org/abs/1703.04247)
 
 DeepFM combines a Factorization Machine (FM) for explicit second-order feature
 interactions with a deep MLP for high-order nonlinear patterns. Both parts share
nextrec/models/ranking/eulernet.py
ADDED
@@ -0,0 +1,365 @@
+"""
+Date: create on 09/11/2025
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Zhao Z, Zhang H, Tang H, et al. EulerNet: Efficient and Effective Feature
+    Interaction Modeling with Euler's Formula. (SIGIR 2021)
+
+EulerNet models feature interactions in the complex domain using Euler's
+formula. Each field embedding is transformed into amplitude and phase,
+then mapped to a complex vector. Feature interactions are captured by
+multiplying complex vectors across fields, which corresponds to multiplying
+amplitudes and summing phases. The resulting complex representation is
+converted back to real-valued features for a linear readout, optionally
+paired with a linear term for first-order signals.
+
+Pipeline:
+    (1) Embed sparse/sequence features with a shared embedding dimension
+    (2) Map embeddings to complex vectors via amplitude/phase transforms
+    (3) Multiply complex vectors across fields (Euler interaction)
+    (4) Concatenate real & imaginary parts and apply a linear regression head
+    (5) Optionally add a linear term and apply the prediction layer
+
+Key Advantages:
+    - Efficient higher-order interaction modeling via complex multiplication
+    - Compact representation without explicit cross-feature enumeration
+    - Works well on sparse high-dimensional feature spaces
+
+EulerNet 使用欧拉公式将特征嵌入映射到复数域,通过复数相乘实现高效的
+特征交互建模,再将复数表示转回实数向量做线性回归,并可选叠加线性项
+以保留一阶信号。
+"""
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
+from nextrec.basic.layers import LR, EmbeddingLayer, PredictionLayer
+from nextrec.basic.model import BaseModel
+
+class EulerInteractionLayerPaper(nn.Module):
+    """
+    Paper-aligned Euler Interaction Layer.
+
+    Input: r, p (rectangular form) as tensors with shape [B, m, d]
+    where each field j is complex feature: r_j + i p_j.
+
+    Output: r_out, p_out as tensors with shape [B, n, d]
+    representing {o_k}_{k=1..n} (Eq.15) which can be stacked.
+    """
+
+    def __init__(
+        self,
+        *,
+        embedding_dim: int,
+        num_fields: int,
+        num_orders: int,
+        use_implicit: bool = True,
+        norm: str | None = "ln",  # None | "bn" | "ln"
+        eps: float = 1e-9,
+    ):
+        super().__init__()
+        self.d = embedding_dim
+        self.m = num_fields
+        self.n = num_orders
+        self.use_implicit = use_implicit
+        self.eps = eps
+
+        # Explicit part parameters
+        # alpha_{k,j} : shape [n, m, d] (vector-wise coefficients)
+        self.alpha = nn.Parameter(torch.empty(self.n, self.m, self.d))
+        # delta_k, delta'_k : shape [n, d]
+        self.delta_phase = nn.Parameter(torch.zeros(self.n, self.d))
+        self.delta_logmod = nn.Parameter(torch.zeros(self.n, self.d))
+        nn.init.xavier_uniform_(self.alpha)
+
+        # Implicit part parameters
+        if self.use_implicit:
+            # W_k in R^{d x (m*d)} and bias b_k in R^d
+            self.W_r = nn.Parameter(torch.empty(self.n, self.d, self.m * self.d))
+            self.b_r = nn.Parameter(torch.zeros(self.n, self.d))
+            self.W_p = nn.Parameter(torch.empty(self.n, self.d, self.m * self.d))
+            self.b_p = nn.Parameter(torch.zeros(self.n, self.d))
+            nn.init.xavier_uniform_(self.W_r)
+            nn.init.xavier_uniform_(self.W_p)
+        else:
+            self.W, self.b = None, None
+
+        # Normalization
+        # Apply on concatenated [r_k, p_k] per k.
+        self.norm = norm
+        if norm == "bn":
+            self.bn = nn.BatchNorm1d(self.n * self.d * 2)
+            self.ln = None
+        elif norm == "ln":
+            self.ln = nn.LayerNorm(self.d * 2)
+            self.bn = None
+        else:
+            self.bn = None
+            self.ln = None
+
+    def forward(self, r: torch.Tensor, p: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        r, p: [B, m, d]
+        return r_out, p_out: [B, n, d]
+        """
+        B, m, d = r.shape
+        assert m == self.m and d == self.d, f"Expected [B,{self.m},{self.d}] got {r.shape}"
+
+        # Euler Transformation: rectangular -> polar
+        lam = torch.sqrt(r * r + p * p + self.eps)  # [B,m,d]
+        theta = torch.atan2(p, r)  # [B,m,d]
+        log_lam = torch.log(lam + self.eps)  # [B,m,d]
+
+        # Generalized Multi-order Transformation
+        # psi_k = sum_j alpha_{k,j} * theta_j + delta_k
+        # l_k = exp(sum_j alpha_{k,j} * log(lam_j) + delta'_k)
+        psi = torch.einsum("bmd,nmd->bnd", theta, self.alpha) + self.delta_phase  # [B,n,d]
+        log_l = torch.einsum("bmd,nmd->bnd", log_lam, self.alpha) + self.delta_logmod  # [B,n,d]
+        l = torch.exp(log_l)  # [B,n,d]
+
+        # Inverse Euler Transformation
+        r_hat = l * torch.cos(psi)  # [B,n,d]
+        p_hat = l * torch.sin(psi)  # [B,n,d]
+
+        # Implicit interactions + fusion
+        if self.use_implicit:
+            r_cat = r.reshape(B, self.m * self.d)  # [B, m*d]
+            p_cat = p.reshape(B, self.m * self.d)  # [B, m*d]
+            # For each k: W_k @ r_cat + b_k -> [B,d]
+            r_imp = torch.einsum("bq,ndq->bnd", r_cat, self.W_r) + self.b_r
+            p_imp = torch.einsum("bq,ndq->bnd", p_cat, self.W_p) + self.b_p
+            r_imp = F.relu(r_imp)
+            p_imp = F.relu(p_imp)
+            r_out = r_hat + r_imp
+            p_out = p_hat + p_imp
+        else:
+            r_out, p_out = r_hat, p_hat
+
+        # Optional normalization (paper says BN/LN can be used between layers)
+        if self.bn is not None:
+            x = torch.cat([r_out, p_out], dim=-1).reshape(B, self.n * self.d * 2)
+            x = self.bn(x).reshape(B, self.n, self.d * 2)
+            r_out, p_out = x[..., : self.d], x[..., self.d :]
+        elif self.ln is not None:
+            x = torch.cat([r_out, p_out], dim=-1)  # [B,n,2d]
+            x = self.ln(x)
+            r_out, p_out = x[..., : self.d], x[..., self.d :]
+
+        return r_out, p_out
+
+
+class ComplexSpaceMappingPaper(nn.Module):
+    """
+    Map real embeddings e_j to complex features via Euler's formula (Eq.6-7).
+    For each field j:
+        r_j = mu_j * cos(e_j)
+        p_j = mu_j * sin(e_j)
+    mu_j is field-specific learnable vector (positive via exp).
+    """
+
+    def __init__(self, embedding_dim: int, num_fields: int):
+        super().__init__()
+        self.d = embedding_dim
+        self.m = num_fields
+        self.log_mu = nn.Parameter(torch.zeros(self.m, self.d))  # mu = exp(log_mu)
+
+    def forward(self, e: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        # e: [B, m, d]
+        mu = torch.exp(self.log_mu).unsqueeze(0)  # [1,m,d]
+        r = mu * torch.cos(e)
+        p = mu * torch.sin(e)
+        return r, p
+
+class EulerNetPaper(nn.Module):
+    """
+    Paper-aligned EulerNet core (embedding -> mapping -> L Euler layers -> linear regression).
+    """
+
+    def __init__(
+        self,
+        *,
+        embedding_dim: int,
+        num_fields: int,
+        num_layers: int = 2,
+        num_orders: int = 8,  # n in paper
+        use_implicit: bool = True,
+        norm: str | None = "ln",  # None | "bn" | "ln"
+    ):
+        super().__init__()
+        self.d = embedding_dim
+        self.m = num_fields
+        self.L = num_layers
+        self.n = num_orders
+
+        self.mapping = ComplexSpaceMappingPaper(embedding_dim, num_fields)
+
+        self.layers = nn.ModuleList([
+            EulerInteractionLayerPaper(
+                embedding_dim=embedding_dim,
+                num_fields=(num_fields if i == 0 else num_orders),  # stack: m -> n -> n ...
+                num_orders=num_orders,
+                use_implicit=use_implicit,
+                norm=norm,
+            )
+            for i in range(num_layers)
+        ])
+
+        # Output regression (Eq.16-17)
+        # After last layer: r,p are [B,n,d]. Concatenate to [B, n*d] each, then regress.
+        self.w = nn.Linear(self.n * self.d, 1, bias=False)  # for real
+        self.w_im = nn.Linear(self.n * self.d, 1, bias=False)  # for imag
+
+    def forward(self, field_emb: torch.Tensor) -> torch.Tensor:
+        """
+        field_emb: [B, m, d] real embeddings e_j
+        return: logits, shape [B,1]
+        """
+        r, p = self.mapping(field_emb)  # [B,m,d]
+
+        # stack Euler interaction layers
+        for layer in self.layers:
+            r, p = layer(r, p)  # -> [B,n,d]
+
+        r_flat = r.reshape(r.size(0), self.n * self.d)
+        p_flat = p.reshape(p.size(0), self.n * self.d)
+
+        z_re = self.w(r_flat)
+        z_im = self.w_im(p_flat)
+        return z_re + z_im  # Eq.17 logits
+
+
+class EulerNet(BaseModel):
+    @property
+    def model_name(self):
+        return "EulerNet"
+
+    @property
+    def default_task(self):
+        return "binary"
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        num_layers: int = 2,
+        num_orders: int = 8,
+        use_implicit: bool = True,
+        norm: str | None = "ln",
+        use_linear: bool = False,
+        target: list[str] | str | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
+        super(EulerNet, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task or self.default_task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            **kwargs,
+        )
+
+        self.loss = loss
+        self.use_linear = use_linear
+
+        self.linear_features = dense_features + sparse_features + sequence_features
+        self.interaction_features = (
+            [f for f in dense_features if getattr(f, "use_embedding", False)]
+            + sparse_features
+            + sequence_features
+        )
+
+        if len(self.interaction_features) < 2:
+            raise ValueError(
+                "EulerNet requires at least two embedded features for interactions."
+            )
+
+        self.embedding = EmbeddingLayer(features=self.all_features)
+
+        self.num_fields = len(self.interaction_features)
+        self.embedding_dim = self.interaction_features[0].embedding_dim
+        if any(
+            f.embedding_dim != self.embedding_dim for f in self.interaction_features
+        ):
+            raise ValueError(
+                "All interaction features must share the same embedding_dim in EulerNet."
+            )
+
+        self.euler = EulerNetPaper(
+            embedding_dim=self.embedding_dim,
+            num_fields=self.num_fields,
+            num_layers=num_layers,
+            num_orders=num_orders,
+            use_implicit=use_implicit,
+            norm=norm,
+        )
+
+        if self.use_linear:
+            if len(self.linear_features) == 0:
+                raise ValueError(
+                    "EulerNet linear term requires at least one input feature."
+                )
+            linear_dim = self.embedding.get_input_dim(self.linear_features)
+            if linear_dim <= 0:
+                raise ValueError("EulerNet linear input_dim must be positive.")
+            self.linear = LR(linear_dim)
+        else:
+            self.linear = None
+
+        self.prediction_layer = PredictionLayer(task_type=self.task)
+
+        modules = ["euler"]
+        if self.use_linear:
+            modules.append("linear")
+        self.register_regularization_weights(
+            embedding_attr="embedding", include_modules=modules
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss,
+            loss_params=loss_params,
+        )
+
+    def forward(self, x):
+        field_emb = self.embedding(
+            x=x, features=self.interaction_features, squeeze_dim=False
+        )
+        y_euler = self.euler(field_emb)
+
+        if self.use_linear and self.linear is not None:
+            linear_input = self.embedding(
+                x=x, features=self.linear_features, squeeze_dim=True
+            )
+            y_euler = y_euler + self.linear(linear_input)
+
+        return self.prediction_layer(y_euler)
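A small sketch of driving the paper-aligned core module with random field embeddings, following the shapes documented in the file above ([B, m, d] embeddings in, [B, 1] logits out); the batch size, field count, and dimensions below are arbitrary:

    import torch

    from nextrec.models.ranking.eulernet import EulerNetPaper

    core = EulerNetPaper(
        embedding_dim=16,
        num_fields=6,
        num_layers=2,
        num_orders=8,
        use_implicit=True,
        norm="ln",
    )
    field_emb = torch.randn(32, 6, 16)  # batch of 32, 6 fields, 16-dim embeddings
    logits = core(field_emb)            # shape [32, 1]; EulerNet feeds this to its prediction layer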
nextrec/models/ranking/lr.py
ADDED
@@ -0,0 +1,120 @@
+"""
+Date: create on 09/11/2025
+Checkpoint: edit on 09/12/2025
+Author: Yang Zhou, zyaztec@gmail.com
+Reference:
+    [1] Hosmer D W, Lemeshow S, Sturdivant R X. Applied Logistic Regression.
+
+Logistic Regression (LR) is a classic linear baseline for CTR/ranking tasks.
+It maps each feature (dense, sparse, or sequence) into a numeric vector and
+learns a single linear logit. Despite its simplicity, LR is strong for
+high-dimensional sparse data and is commonly used as a baseline or a "wide"
+component in hybrid models.
+
+Pipeline:
+    (1) Embed sparse/sequence fields; project dense fields if configured
+    (2) Concatenate all feature vectors into a single linear input
+    (3) Apply a linear layer to produce logits
+    (4) Use the prediction layer to output task-specific probabilities
+
+Key Advantages:
+    - Fast and easy to train
+    - Strong baseline for sparse, high-dimensional features
+    - Interpretable linear weights
+
+LR 是 CTR/排序任务中最经典的线性基线模型。它将稠密、稀疏以及序列特征
+映射为数值向量后做线性组合,输出 logit。虽然结构简单,但在稀疏高维场景
+依然具有很强的基线效果,并常作为 Wide 端与深模型组合。
+
+处理流程:
+    (1) 稀疏/序列特征做 embedding,稠密特征按需投影
+    (2) 拼接所有特征向量形成线性输入
+    (3) 线性层输出 logit
+    (4) 通过预测层输出任务概率
+
+主要优点:
+    - 训练与推理速度快
+    - 稀疏高维特征下表现稳定
+    - 权重可解释性强
+"""
+
+import torch.nn as nn
+
+from nextrec.basic.features import DenseFeature, SequenceFeature, SparseFeature
+from nextrec.basic.layers import EmbeddingLayer, LR as LinearLayer, PredictionLayer
+from nextrec.basic.model import BaseModel
+
+
+class LR(BaseModel):
+    @property
+    def model_name(self):
+        return "LR"
+
+    @property
+    def default_task(self):
+        return "binary"
+
+    def __init__(
+        self,
+        dense_features: list[DenseFeature] | None = None,
+        sparse_features: list[SparseFeature] | None = None,
+        sequence_features: list[SequenceFeature] | None = None,
+        target: list[str] | str | None = None,
+        task: str | list[str] | None = None,
+        optimizer: str = "adam",
+        optimizer_params: dict | None = None,
+        loss: str | nn.Module | None = "bce",
+        loss_params: dict | list[dict] | None = None,
+        device: str = "cpu",
+        embedding_l1_reg=1e-6,
+        dense_l1_reg=1e-5,
+        embedding_l2_reg=1e-5,
+        dense_l2_reg=1e-4,
+        **kwargs,
+    ):
+
+        dense_features = dense_features or []
+        sparse_features = sparse_features or []
+        sequence_features = sequence_features or []
+        optimizer_params = optimizer_params or {}
+        if loss is None:
+            loss = "bce"
+
+        super(LR, self).__init__(
+            dense_features=dense_features,
+            sparse_features=sparse_features,
+            sequence_features=sequence_features,
+            target=target,
+            task=task or self.default_task,
+            device=device,
+            embedding_l1_reg=embedding_l1_reg,
+            dense_l1_reg=dense_l1_reg,
+            embedding_l2_reg=embedding_l2_reg,
+            dense_l2_reg=dense_l2_reg,
+            **kwargs,
+        )
+
+        self.loss = loss
+
+        self.embedding = EmbeddingLayer(features=self.all_features)
+        linear_input_dim = self.embedding.input_dim
+        self.linear = LinearLayer(linear_input_dim)
+        self.prediction_layer = PredictionLayer(task_type=self.task)
+
+        self.register_regularization_weights(
+            embedding_attr="embedding", include_modules=["linear"]
+        )
+
+        self.compile(
+            optimizer=optimizer,
+            optimizer_params=optimizer_params,
+            loss=loss,
+            loss_params=loss_params,
+        )
+
+    def forward(self, x):
+        input_linear = self.embedding(
+            x=x, features=self.all_features, squeeze_dim=True
+        )
+        y = self.linear(input_linear)
+        return self.prediction_layer(y)
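The new LR model boils down to one linear layer over the concatenated feature representation, followed by the task's output activation. A standalone toy sketch of that idea in plain PyTorch (not the NextRec API); the dimensions are arbitrary:

    import torch
    import torch.nn as nn

    concat_dim = 24                    # hypothetical total width after embedding and concatenation
    linear = nn.Linear(concat_dim, 1)  # single linear logit, the "wide" part
    x = torch.randn(8, concat_dim)     # batch of 8 concatenated feature vectors
    prob = torch.sigmoid(linear(x))    # [8, 1] predicted probabilities for a binary task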
{nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nextrec
-Version: 0.4.9
+Version: 0.4.10
 Summary: A comprehensive recommendation library with match, ranking, and multi-task learning models
 Project-URL: Homepage, https://github.com/zerolovesea/NextRec
 Project-URL: Repository, https://github.com/zerolovesea/NextRec
@@ -66,7 +66,7 @@ Description-Content-Type: text/markdown
 
 
 
-
+
 
 中文文档 | [English Version](README_en.md)
 
@@ -99,11 +99,10 @@ NextRec是一个基于PyTorch的现代推荐系统框架,旨在为研究工程
 
 ## NextRec近期进展
 
-- **12/12/2025** 在v0.4.
+- **12/12/2025** 在v0.4.10中加入了[RQ-VAE](/nextrec/models/representation/rqvae.py)模块。配套的[数据集](/dataset/ecommerce_task.csv)和[代码](tutorials/notebooks/zh/使用RQ-VAE构建语义ID.ipynb)已经同步在仓库中
 - **07/12/2025** 发布了NextRec CLI命令行工具,它允许用户根据配置文件进行一键训练和推理,我们提供了相关的[教程](/nextrec_cli_preset/NextRec-CLI_zh.md)和[教学代码](/nextrec_cli_preset)
 - **03/12/2025** NextRec获得了100颗🌟!感谢大家的支持
 - **06/12/2025** 在v0.4.1中支持了单机多卡的分布式DDP训练,并且提供了配套的[代码](tutorials/distributed)
-- **23/11/2025** 在v0.2.2中对basemodel进行了逻辑上的大幅重构和流程统一,并且对listwise/pairwise/pointwise损失进行了统一
 - **11/11/2025** NextRec v0.1.0发布,我们提供了10余种Ranking模型,4种多任务模型和4种召回模型,以及统一的训练/日志/指标管理系统
 
 ## 架构
@@ -241,11 +240,11 @@ nextrec --mode=train --train_config=path/to/train_config.yaml
 nextrec --mode=predict --predict_config=path/to/predict_config.yaml
 ```
 
-> 截止当前版本0.4.
+> 截止当前版本0.4.10,NextRec CLI支持单机训练,分布式训练相关功能尚在开发中。
 
 ## 兼容平台
 
-当前最新版本为0.4.
+当前最新版本为0.4.10,所有模型和测试代码均已在以下平台通过验证,如果开发者在使用中遇到兼容问题,请在issue区提出错误报告及系统版本:
 
 | 平台 | 配置 |
 |------|------|
{nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 nextrec/__init__.py,sha256=_M3oUqyuvQ5k8Th_3wId6hQ_caclh7M5ad51XN09m98,235
-nextrec/__version__.py,sha256=
-nextrec/cli.py,sha256=
+nextrec/__version__.py,sha256=N_k8mdXQaZTz0YYxAgWi2g6nf_GP6B5r8Q49Om9EynA,23
+nextrec/cli.py,sha256=PXRNXMRm_a_1u6StnjsHefq0rKqsc6Mzx3mZmc9553g,23803
 nextrec/basic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nextrec/basic/activation.py,sha256=uzTWfCOtBSkbu_Gk9XBNTj8__s241CaYLJk6l8nGX9I,2885
 nextrec/basic/callback.py,sha256=a6gg7r3x1v0xaOSya9PteLql7I14nepY7gX8tDYtins,14679
@@ -8,13 +8,13 @@ nextrec/basic/features.py,sha256=Wnbzr7UMotgv1Vzeg0o9Po-KKIvYUSYIghoVDfMPx_g,434
 nextrec/basic/layers.py,sha256=GJH2Tx3IkZrYGb7-ET976iHCC28Ubck_NO9-iyY4mDI,28911
 nextrec/basic/loggers.py,sha256=JnQiFvmsVgZ63gqBLR2ZFWrVPzkxRbzWhTdeoiJKcos,6526
 nextrec/basic/metrics.py,sha256=8RswR_3MGvIBkT_n6fnmON2eYH-hfD7kIKVnyJJjL3o,23131
-nextrec/basic/model.py,sha256=
+nextrec/basic/model.py,sha256=OCcV9nTAZukurRISzPGCQM5yJ0Fpph3vOMKb2CPkI68,98685
 nextrec/basic/session.py,sha256=UOG_-EgCOxvqZwCkiEd8sgNV2G1sm_HbzKYVQw8yYDI,4483
 nextrec/data/__init__.py,sha256=YZQjpty1pDCM7q_YNmiA2sa5kbujUw26ObLHWjMPjKY,1194
 nextrec/data/batch_utils.py,sha256=0bYGVX7RlhnHv_ZBaUngjDIpBNw-igCk98DgOsF7T6o,2879
 nextrec/data/data_processing.py,sha256=lKXDBszrO5fJMAQetgSPr2mSQuzOluuz1eHV4jp0TDU,5538
 nextrec/data/data_utils.py,sha256=0Ls1cnG9lBz0ovtyedw5vwp7WegGK_iF-F8e_3DEddo,880
-nextrec/data/dataloader.py,sha256=
+nextrec/data/dataloader.py,sha256=xTORNbaQVa20sk2S3kyV0SSngscvq8bNqHr0AmYjFqM,18768
 nextrec/data/preprocessor.py,sha256=wNjivq2N-iDzBropkp3YfSkN0jSA4l4h81C-ECa6k4c,44643
 nextrec/loss/__init__.py,sha256=-sibZK8QXLblVNWqdqjrPPzMCDyIXSq7yd2eZ57p9Nw,810
 nextrec/loss/listwise.py,sha256=UT9vJCOTOQLogVwaeTV7Z5uxIYnngGdxk-p9e97MGkU,5744
@@ -35,14 +35,14 @@ nextrec/models/ranking/afm.py,sha256=96jGUPL4yTWobMIVBjHpOxl9AtAzCAGR8yw7Sy2JmdQ
 nextrec/models/ranking/autoint.py,sha256=S6Cxnp1q2OErSYqmIix5P-b4qLWR-0dY6TMStuU6WLg,8109
 nextrec/models/ranking/dcn.py,sha256=whkjiKEuadl6oSP-NJdSOCOqvWZGX4EsId9oqlfVpa8,7299
 nextrec/models/ranking/dcn_v2.py,sha256=QnqQbJsrtQp4mtvnBXFUVefKyr4dw-gHNWrCbO26oHw,11163
-nextrec/models/ranking/deepfm.py,sha256=
+nextrec/models/ranking/deepfm.py,sha256=aXoK59e2KaaPe5vfyFW4YiHbX4E2iG3gxFCxmWo8RHk,5200
 nextrec/models/ranking/dien.py,sha256=c7Zs85vxhOgKHg5s0QcSLCn1xXCCSD177TMERgM_v8g,18958
 nextrec/models/ranking/din.py,sha256=gdUhuKiKXBNOALbK8fGhlbSeuDT8agcEdNSrC_wveHc,9422
-nextrec/models/ranking/eulernet.py,sha256=
+nextrec/models/ranking/eulernet.py,sha256=mZTrD8rKbGbWMEeWpTl8mVimytLFJTLM5-LS_I3U6cw,13115
 nextrec/models/ranking/ffm.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nextrec/models/ranking/fibinet.py,sha256=_eroddVHooJcaGT8MqS4mUrtv5j4pnTmfI3FoAKOZhs,7919
 nextrec/models/ranking/fm.py,sha256=SsrSKK3y4xg5Lv-t3JLnZan55Hzze2AxAiVPuscy0bk,4536
-nextrec/models/ranking/lr.py,sha256=
+nextrec/models/ranking/lr.py,sha256=Qf8RozgWlsKjHGVbo-94d2Z_4kMfCXHmvwkYu3WVZjQ,4030
 nextrec/models/ranking/masknet.py,sha256=tY1y2lO0iq82oylPN0SBnL5Bikc8weinFXpURyVT1hE,12373
 nextrec/models/ranking/pnn.py,sha256=FcNIFAw5J0ORGSR6L8ZK7NeXlJPpojwe_SpsxMQqCFw,8174
 nextrec/models/ranking/widedeep.py,sha256=-ghKfe_0puvlI9fBQr8lK3gXkfVvslGwP40AJTGqc7w,5077
@@ -63,8 +63,8 @@ nextrec/utils/embedding.py,sha256=akAEc062MG2cD7VIOllHaqtwzAirQR2gq5iW7oKpGAU,14
 nextrec/utils/feature.py,sha256=rsUAv3ELyDpehVw8nPEEsLCCIjuKGTJJZuFaWB_wrPk,633
 nextrec/utils/model.py,sha256=dYl1XfIZt6aVjNyV2AAhcArwFRMcEAKrjG_pr8AVHs0,1163
 nextrec/utils/torch_utils.py,sha256=AKfYbSOJjEw874xsDB5IO3Ote4X7vnqzt_E0jJny0o8,13468
-nextrec-0.4.
-nextrec-0.4.
-nextrec-0.4.
-nextrec-0.4.
-nextrec-0.4.
+nextrec-0.4.10.dist-info/METADATA,sha256=b7ILFNk7WRZCg_2ZCx7_SWdU_d3mzN2b5IWTCnB0mbg,19318
+nextrec-0.4.10.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+nextrec-0.4.10.dist-info/entry_points.txt,sha256=NN-dNSdfMRTv86bNXM7d3ZEPW2BQC6bRi7QP7i9cIps,45
+nextrec-0.4.10.dist-info/licenses/LICENSE,sha256=2fQfVKeafywkni7MYHyClC6RGGC3laLTXCNBx-ubtp0,1064
+nextrec-0.4.10.dist-info/RECORD,,
{nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/WHEEL
File without changes

{nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/entry_points.txt
File without changes

{nextrec-0.4.9.dist-info → nextrec-0.4.10.dist-info}/licenses/LICENSE
File without changes