replay-rec 0.18.0__tar.gz → 0.18.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {replay_rec-0.18.0 → replay_rec-0.18.1}/PKG-INFO +73 -60
- {replay_rec-0.18.0 → replay_rec-0.18.1}/README.md +66 -56
- {replay_rec-0.18.0 → replay_rec-0.18.1}/pyproject.toml +8 -4
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/__init__.py +1 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/dataset.py +27 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/dataset_utils/dataset_label_encoder.py +6 -3
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/schema.py +37 -16
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/sequence_tokenizer.py +313 -165
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/torch_sequential_dataset.py +17 -8
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/utils.py +14 -7
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/schema.py +10 -6
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/offline_metrics.py +2 -2
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/__init__.py +1 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/base_rec.py +18 -21
- replay_rec-0.18.1/replay/models/lin_ucb.py +407 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/bert4rec/dataset.py +17 -4
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/bert4rec/lightning.py +121 -54
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/bert4rec/model.py +21 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/callbacks/prediction_callbacks.py +5 -1
- replay_rec-0.18.1/replay/models/nn/sequential/compiled/__init__.py +5 -0
- replay_rec-0.18.1/replay/models/nn/sequential/compiled/base_compiled_model.py +261 -0
- replay_rec-0.18.1/replay/models/nn/sequential/compiled/bert4rec_compiled.py +152 -0
- replay_rec-0.18.1/replay/models/nn/sequential/compiled/sasrec_compiled.py +145 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/postprocessors/postprocessors.py +27 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/sasrec/dataset.py +17 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/sasrec/lightning.py +126 -50
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/sasrec/model.py +3 -4
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/__init__.py +7 -1
- replay_rec-0.18.1/replay/preprocessing/discretizer.py +719 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/label_encoder.py +384 -52
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/cold_user_random_splitter.py +1 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/__init__.py +1 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/common.py +7 -8
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/session_handler.py +3 -4
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/spark_utils.py +15 -1
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/types.py +8 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/LICENSE +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/dataset_utils/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/nn/sequential_dataset.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/data/spark_schema.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/base_metric.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/categorical_diversity.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/coverage.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/descriptors.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/experiment.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/hitrate.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/map.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/mrr.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/ndcg.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/novelty.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/precision.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/recall.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/rocauc.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/surprisal.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/torch_metrics_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/metrics/unexpectedness.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/als.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/association_rules.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/base_neighbour_rec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/cat_pop_rec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/cluster.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/ann_mixin.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/entities/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/entities/base_hnsw_param.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/entities/hnswlib_param.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/entities/nmslib_hnsw_param.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/base_index_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/driver_hnswlib_index_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/driver_nmslib_index_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/executor_hnswlib_index_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/executor_nmslib_index_builder.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_builders/nmslib_index_builder_mixin.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/base_inferer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/hnswlib_filter_index_inferer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/hnswlib_index_inferer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/nmslib_filter_index_inferer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/nmslib_index_inferer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_inferers/utils.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/base_index_store.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/hdfs_index_store.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/shared_disk_index_store.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/spark_files_index_store.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/index_stores/utils.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/extensions/ann/utils.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/kl_ucb.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/knn.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/optimizer_utils/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/optimizer_utils/optimizer_factory.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/bert4rec/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/callbacks/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/callbacks/validation_callback.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/postprocessors/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/postprocessors/_base.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/nn/sequential/sasrec/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/pop_rec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/query_pop_rec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/random_rec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/slim.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/thompson_sampling.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/ucb.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/wilson.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/models/word2vec.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/optimization/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/optimization/optuna_objective.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/converter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/filters.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/history_based_fp.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/preprocessing/sessionizer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/scenarios/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/scenarios/fallback.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/__init__.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/base_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/k_folds.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/last_n_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/new_users_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/random_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/ratio_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/time_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/splitters/two_stage_splitter.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/dataframe_bucketizer.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/distributions.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/model_handler.py +0 -0
- {replay_rec-0.18.0 → replay_rec-0.18.1}/replay/utils/time.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: replay-rec
|
|
3
|
-
Version: 0.18.
|
|
3
|
+
Version: 0.18.1
|
|
4
4
|
Summary: RecSys Library
|
|
5
5
|
Home-page: https://sb-ai-lab.github.io/RePlay/
|
|
6
6
|
License: Apache-2.0
|
|
@@ -21,10 +21,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
21
21
|
Provides-Extra: all
|
|
22
22
|
Provides-Extra: spark
|
|
23
23
|
Provides-Extra: torch
|
|
24
|
+
Provides-Extra: torch-openvino
|
|
24
25
|
Requires-Dist: fixed-install-nmslib (==2.1.2)
|
|
25
26
|
Requires-Dist: hnswlib (>=0.7.0,<0.8.0)
|
|
26
|
-
Requires-Dist: lightning (>=2.0.2,<=2.4.0) ; extra == "torch" or extra == "all"
|
|
27
|
+
Requires-Dist: lightning (>=2.0.2,<=2.4.0) ; extra == "torch" or extra == "torch-openvino" or extra == "all"
|
|
27
28
|
Requires-Dist: numpy (>=1.20.0)
|
|
29
|
+
Requires-Dist: onnx (>=1.16.2,<1.17.0) ; extra == "torch-openvino" or extra == "all"
|
|
30
|
+
Requires-Dist: openvino (>=2024.3.0,<2024.4.0) ; extra == "torch-openvino" or extra == "all"
|
|
28
31
|
Requires-Dist: optuna (>=3.2.0,<3.3.0)
|
|
29
32
|
Requires-Dist: pandas (>=1.3.5,<=2.2.2)
|
|
30
33
|
Requires-Dist: polars (>=1.0.0,<1.1.0)
|
|
@@ -32,10 +35,10 @@ Requires-Dist: psutil (>=6.0.0,<6.1.0)
|
|
|
32
35
|
Requires-Dist: pyarrow (>=12.0.1)
|
|
33
36
|
Requires-Dist: pyspark (>=3.0,<3.6) ; (python_full_version >= "3.8.1" and python_version < "3.11") and (extra == "spark" or extra == "all")
|
|
34
37
|
Requires-Dist: pyspark (>=3.4,<3.6) ; (python_version >= "3.11" and python_version < "3.12") and (extra == "spark" or extra == "all")
|
|
35
|
-
Requires-Dist: pytorch-ranger (>=0.1.1,<0.2.0) ; extra == "torch" or extra == "all"
|
|
38
|
+
Requires-Dist: pytorch-ranger (>=0.1.1,<0.2.0) ; extra == "torch" or extra == "torch-openvino" or extra == "all"
|
|
36
39
|
Requires-Dist: scikit-learn (>=1.0.2,<2.0.0)
|
|
37
40
|
Requires-Dist: scipy (>=1.8.1,<2.0.0)
|
|
38
|
-
Requires-Dist: torch (>=1.8,<=2.
|
|
41
|
+
Requires-Dist: torch (>=1.8,<=2.5.0) ; extra == "torch" or extra == "torch-openvino" or extra == "all"
|
|
39
42
|
Project-URL: Repository, https://github.com/sb-ai-lab/RePlay
|
|
40
43
|
Description-Content-Type: text/markdown
|
|
41
44
|
|
|
@@ -44,11 +47,15 @@ Description-Content-Type: text/markdown
|
|
|
44
47
|
|
|
45
48
|
[](https://github.com/sb-ai-lab/RePlay/blob/main/LICENSE)
|
|
46
49
|
[](https://pypi.org/project/replay-rec)
|
|
50
|
+
[](https://sb-ai-lab.github.io/RePlay/)
|
|
47
51
|
[](https://pypistats.org/packages/replay-rec)
|
|
48
52
|
<br>
|
|
49
53
|
[](https://github.com/sb-ai-lab/RePlay/actions/workflows/main.yml?query=branch%3Amain)
|
|
54
|
+
[](https://github.com/astral-sh/ruff)
|
|
55
|
+
[](https://pypi.org/project/replay-rec)
|
|
50
56
|
[](https://github.com/sb-ai-lab/RePlay/discussions)
|
|
51
57
|
|
|
58
|
+
|
|
52
59
|
RePlay is an advanced framework designed to facilitate the development and evaluation of recommendation systems. It provides a robust set of tools covering the entire lifecycle of a recommendation system pipeline:
|
|
53
60
|
|
|
54
61
|
## 🚀 Features:
|
|
@@ -63,61 +70,25 @@ RePlay is an advanced framework designed to facilitate the development and evalu
|
|
|
63
70
|
1. **Diverse Hardware Support:** Compatible with various hardware configurations including CPU, GPU, Multi-GPU.
|
|
64
71
|
2. **Cluster Computing Integration:** Integrating with PySpark for distributed computing, enabling scalability for large-scale recommendation systems.
|
|
65
72
|
|
|
66
|
-
## 📖 Documentation is available [here](https://sb-ai-lab.github.io/RePlay/).
|
|
67
|
-
|
|
68
73
|
<a name="toc"></a>
|
|
69
74
|
# Table of Contents
|
|
70
75
|
|
|
71
|
-
* [Installation](#installation)
|
|
72
76
|
* [Quickstart](#quickstart)
|
|
77
|
+
* [Installation](#installation)
|
|
73
78
|
* [Resources](#examples)
|
|
74
79
|
* [Contributing to RePlay](#contributing)
|
|
75
80
|
|
|
76
81
|
|
|
77
|
-
<a name="
|
|
78
|
-
##
|
|
79
|
-
|
|
80
|
-
Installation via `pip` package manager is recommended by default:
|
|
81
|
-
|
|
82
|
-
```bash
|
|
83
|
-
pip install replay-rec
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
In this case it will be installed the `core` package without `PySpark` and `PyTorch` dependencies.
|
|
87
|
-
Also `experimental` submodule will not be installed.
|
|
88
|
-
|
|
89
|
-
To install `experimental` submodule please specify the version with `rc0` suffix.
|
|
90
|
-
For example:
|
|
91
|
-
|
|
92
|
-
```bash
|
|
93
|
-
pip install replay-rec==XX.YY.ZZrc0
|
|
94
|
-
```
|
|
95
|
-
|
|
96
|
-
### Extras
|
|
97
|
-
|
|
98
|
-
In addition to the core package, several extras are also provided, including:
|
|
99
|
-
- `[spark]`: Install PySpark functionality
|
|
100
|
-
- `[torch]`: Install PyTorch and Lightning functionality
|
|
101
|
-
- `[all]`: `[spark]` `[torch]`
|
|
82
|
+
<a name="quickstart"></a>
|
|
83
|
+
## 📈 Quickstart
|
|
102
84
|
|
|
103
|
-
Example:
|
|
104
85
|
```bash
|
|
105
|
-
|
|
106
|
-
pip install replay-rec[spark]
|
|
107
|
-
|
|
108
|
-
# Install package with experimental submodule and PySpark dependency
|
|
109
|
-
pip install replay-rec[spark]==XX.YY.ZZrc0
|
|
86
|
+
pip install replay-rec[all]
|
|
110
87
|
```
|
|
111
88
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
If you encounter an error during RePlay installation, check the [troubleshooting](https://sb-ai-lab.github.io/RePlay/pages/installation.html#troubleshooting) guide.
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
<a name="quickstart"></a>
|
|
118
|
-
## 📈 Quickstart (PySpark-based)
|
|
119
|
-
|
|
89
|
+
PySpark-based model and [fast](https://github.com/sb-ai-lab/RePlay/blob/main/examples/11_sasrec_dataframes_comparison.ipynb) polars-based data preprocessing:
|
|
120
90
|
```python
|
|
91
|
+
from polars import from_pandas
|
|
121
92
|
from rs_datasets import MovieLens
|
|
122
93
|
|
|
123
94
|
from replay.data import Dataset, FeatureHint, FeatureInfo, FeatureSchema, FeatureType
|
|
@@ -131,10 +102,10 @@ from replay.splitters import RatioSplitter
|
|
|
131
102
|
spark = State().session
|
|
132
103
|
|
|
133
104
|
ml_1m = MovieLens("1m")
|
|
134
|
-
K=10
|
|
105
|
+
K = 10
|
|
135
106
|
|
|
136
|
-
# data
|
|
137
|
-
interactions =
|
|
107
|
+
# convert data to polars
|
|
108
|
+
interactions = from_pandas(ml_1m.ratings)
|
|
138
109
|
|
|
139
110
|
# data splitting
|
|
140
111
|
splitter = RatioSplitter(
|
|
@@ -148,7 +119,7 @@ splitter = RatioSplitter(
|
|
|
148
119
|
)
|
|
149
120
|
train, test = splitter.split(interactions)
|
|
150
121
|
|
|
151
|
-
#
|
|
122
|
+
# datasets creation
|
|
152
123
|
feature_schema = FeatureSchema(
|
|
153
124
|
[
|
|
154
125
|
FeatureInfo(
|
|
@@ -174,20 +145,18 @@ feature_schema = FeatureSchema(
|
|
|
174
145
|
]
|
|
175
146
|
)
|
|
176
147
|
|
|
177
|
-
train_dataset = Dataset(
|
|
178
|
-
|
|
179
|
-
interactions=train,
|
|
180
|
-
)
|
|
181
|
-
test_dataset = Dataset(
|
|
182
|
-
feature_schema=feature_schema,
|
|
183
|
-
interactions=test,
|
|
184
|
-
)
|
|
148
|
+
train_dataset = Dataset(feature_schema=feature_schema, interactions=train)
|
|
149
|
+
test_dataset = Dataset(feature_schema=feature_schema, interactions=test)
|
|
185
150
|
|
|
186
151
|
# data encoding
|
|
187
152
|
encoder = DatasetLabelEncoder()
|
|
188
153
|
train_dataset = encoder.fit_transform(train_dataset)
|
|
189
154
|
test_dataset = encoder.transform(test_dataset)
|
|
190
155
|
|
|
156
|
+
# convert datasets to spark
|
|
157
|
+
train_dataset.to_spark()
|
|
158
|
+
test_dataset.to_spark()
|
|
159
|
+
|
|
191
160
|
# model training
|
|
192
161
|
model = ItemKNN()
|
|
193
162
|
model.fit(train_dataset)
|
|
@@ -214,6 +183,44 @@ metrics.add_result("ItemKNN", recs)
|
|
|
214
183
|
print(metrics.results)
|
|
215
184
|
```
|
|
216
185
|
|
|
186
|
+
<a name="installation"></a>
|
|
187
|
+
## 🔧 Installation
|
|
188
|
+
|
|
189
|
+
Installation via `pip` package manager is recommended by default:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
pip install replay-rec
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
In this case the `core` package will be installed without `PySpark` and `PyTorch` dependencies.
|
|
196
|
+
Also `experimental` submodule will not be installed.
|
|
197
|
+
|
|
198
|
+
To install `experimental` submodule please specify the version with `rc0` suffix.
|
|
199
|
+
For example:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
pip install replay-rec==XX.YY.ZZrc0
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Extras
|
|
206
|
+
|
|
207
|
+
In addition to the core package, several extras are also provided, including:
|
|
208
|
+
- `[spark]`: Install PySpark functionality
|
|
209
|
+
- `[torch]`: Install PyTorch and Lightning functionality
|
|
210
|
+
- `[all]`: `[spark]` `[torch]`
|
|
211
|
+
|
|
212
|
+
Example:
|
|
213
|
+
```bash
|
|
214
|
+
# Install core package with PySpark dependency
|
|
215
|
+
pip install replay-rec[spark]
|
|
216
|
+
|
|
217
|
+
# Install package with experimental submodule and PySpark dependency
|
|
218
|
+
pip install replay-rec[spark]==XX.YY.ZZrc0
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
To build RePlay from sources please use the [instruction](CONTRIBUTING.md#installing-from-the-source).
|
|
222
|
+
|
|
223
|
+
|
|
217
224
|
<a name="examples"></a>
|
|
218
225
|
## 📑 Resources
|
|
219
226
|
|
|
@@ -226,14 +233,19 @@ print(metrics.results)
|
|
|
226
233
|
6. [06_item2item_recommendations.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/06_item2item_recommendations.ipynb) - Item to Item recommendations example.
|
|
227
234
|
7. [07_filters.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/07_filters.ipynb) - An example of using filters.
|
|
228
235
|
8. [08_recommending_for_categories.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/08_recommending_for_categories.ipynb) - An example of recommendation for product categories.
|
|
229
|
-
9. [09_sasrec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/09_sasrec_example.ipynb) - An example of using
|
|
230
|
-
|
|
236
|
+
9. [09_sasrec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/09_sasrec_example.ipynb) - An example of using transformer-based SASRec model to generate recommendations.
|
|
237
|
+
10. [10_bert4rec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/10_bert4rec_example.ipynb) - An example of using transformer-based BERT4Rec model to generate recommendations.
|
|
238
|
+
11. [11_sasrec_dataframes_comparison.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/11_sasrec_dataframes_comparison.ipynb) - A speed comparison of using different frameworks (pandas, polars, pyspark) for data processing during SASRec training.
|
|
239
|
+
12. [12_neural_ts_exp.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/12_neural_ts_exp.ipynb) - An example of using Neural Thompson Sampling bandit model (based on Wide&Deep architecture).
|
|
240
|
+
13. [13_personalized_bandit_comparison.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/13_personalized_bandit_comparison.ipynb) - A comparison of context-free and contextual bandit models.
|
|
241
|
+
14. [14_hierarchical_recommender.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/14_hierarchical_recommender.ipynb) - An example of using HierarchicalRecommender with user-disjoint LinUCB.
|
|
231
242
|
|
|
232
243
|
### Videos and papers
|
|
233
244
|
* **Video guides**:
|
|
234
245
|
- [Replay for offline recommendations, AI Journey 2021](https://www.youtube.com/watch?v=ejQZKGAG0xs)
|
|
235
246
|
|
|
236
247
|
* **Research papers**:
|
|
248
|
+
- [RePlay: a Recommendation Framework for Experimentation and Production Use](https://arxiv.org/abs/2409.07272) Alexey Vasilev, Anna Volodkevich, Denis Kulandin, Tatiana Bysheva, Anton Klenitskiy. In The 18th ACM Conference on Recommender Systems (RecSys '24)
|
|
237
249
|
- [Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?](https://doi.org/10.1145/3604915.3610644) Anton Klenitskiy, Alexey Vasilev. In The 17th ACM Conference on Recommender Systems (RecSys '23)
|
|
238
250
|
- [The Long Tail of Context: Does it Exist and Matter?](https://arxiv.org/abs/2210.01023). Konstantin Bauman, Alexey Vasilev, Alexander Tuzhilin. In Workshop on Context-Aware Recommender Systems (CARS) (RecSys '22)
|
|
239
251
|
- [Multiobjective Evaluation of Reinforcement Learning Based Recommender Systems](https://doi.org/10.1145/3523227.3551485). Alexey Grishanov, Anastasia Ianina, Konstantin Vorontsov. In The 16th ACM Conference on Recommender Systems (RecSys '22)
|
|
@@ -244,3 +256,4 @@ print(metrics.results)
|
|
|
244
256
|
|
|
245
257
|
We welcome community contributions. For details please check our [contributing guidelines](CONTRIBUTING.md).
|
|
246
258
|
|
|
259
|
+
|
|
@@ -3,11 +3,15 @@
|
|
|
3
3
|
|
|
4
4
|
[](https://github.com/sb-ai-lab/RePlay/blob/main/LICENSE)
|
|
5
5
|
[](https://pypi.org/project/replay-rec)
|
|
6
|
+
[](https://sb-ai-lab.github.io/RePlay/)
|
|
6
7
|
[](https://pypistats.org/packages/replay-rec)
|
|
7
8
|
<br>
|
|
8
9
|
[](https://github.com/sb-ai-lab/RePlay/actions/workflows/main.yml?query=branch%3Amain)
|
|
10
|
+
[](https://github.com/astral-sh/ruff)
|
|
11
|
+
[](https://pypi.org/project/replay-rec)
|
|
9
12
|
[](https://github.com/sb-ai-lab/RePlay/discussions)
|
|
10
13
|
|
|
14
|
+
|
|
11
15
|
RePlay is an advanced framework designed to facilitate the development and evaluation of recommendation systems. It provides a robust set of tools covering the entire lifecycle of a recommendation system pipeline:
|
|
12
16
|
|
|
13
17
|
## 🚀 Features:
|
|
@@ -22,61 +26,25 @@ RePlay is an advanced framework designed to facilitate the development and evalu
|
|
|
22
26
|
1. **Diverse Hardware Support:** Compatible with various hardware configurations including CPU, GPU, Multi-GPU.
|
|
23
27
|
2. **Cluster Computing Integration:** Integrating with PySpark for distributed computing, enabling scalability for large-scale recommendation systems.
|
|
24
28
|
|
|
25
|
-
## 📖 Documentation is available [here](https://sb-ai-lab.github.io/RePlay/).
|
|
26
|
-
|
|
27
29
|
<a name="toc"></a>
|
|
28
30
|
# Table of Contents
|
|
29
31
|
|
|
30
|
-
* [Installation](#installation)
|
|
31
32
|
* [Quickstart](#quickstart)
|
|
33
|
+
* [Installation](#installation)
|
|
32
34
|
* [Resources](#examples)
|
|
33
35
|
* [Contributing to RePlay](#contributing)
|
|
34
36
|
|
|
35
37
|
|
|
36
|
-
<a name="
|
|
37
|
-
##
|
|
38
|
-
|
|
39
|
-
Installation via `pip` package manager is recommended by default:
|
|
40
|
-
|
|
41
|
-
```bash
|
|
42
|
-
pip install replay-rec
|
|
43
|
-
```
|
|
44
|
-
|
|
45
|
-
In this case it will be installed the `core` package without `PySpark` and `PyTorch` dependencies.
|
|
46
|
-
Also `experimental` submodule will not be installed.
|
|
47
|
-
|
|
48
|
-
To install `experimental` submodule please specify the version with `rc0` suffix.
|
|
49
|
-
For example:
|
|
50
|
-
|
|
51
|
-
```bash
|
|
52
|
-
pip install replay-rec==XX.YY.ZZrc0
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
### Extras
|
|
56
|
-
|
|
57
|
-
In addition to the core package, several extras are also provided, including:
|
|
58
|
-
- `[spark]`: Install PySpark functionality
|
|
59
|
-
- `[torch]`: Install PyTorch and Lightning functionality
|
|
60
|
-
- `[all]`: `[spark]` `[torch]`
|
|
38
|
+
<a name="quickstart"></a>
|
|
39
|
+
## 📈 Quickstart
|
|
61
40
|
|
|
62
|
-
Example:
|
|
63
41
|
```bash
|
|
64
|
-
|
|
65
|
-
pip install replay-rec[spark]
|
|
66
|
-
|
|
67
|
-
# Install package with experimental submodule and PySpark dependency
|
|
68
|
-
pip install replay-rec[spark]==XX.YY.ZZrc0
|
|
42
|
+
pip install replay-rec[all]
|
|
69
43
|
```
|
|
70
44
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
If you encounter an error during RePlay installation, check the [troubleshooting](https://sb-ai-lab.github.io/RePlay/pages/installation.html#troubleshooting) guide.
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
<a name="quickstart"></a>
|
|
77
|
-
## 📈 Quickstart (PySpark-based)
|
|
78
|
-
|
|
45
|
+
PySpark-based model and [fast](https://github.com/sb-ai-lab/RePlay/blob/main/examples/11_sasrec_dataframes_comparison.ipynb) polars-based data preprocessing:
|
|
79
46
|
```python
|
|
47
|
+
from polars import from_pandas
|
|
80
48
|
from rs_datasets import MovieLens
|
|
81
49
|
|
|
82
50
|
from replay.data import Dataset, FeatureHint, FeatureInfo, FeatureSchema, FeatureType
|
|
@@ -90,10 +58,10 @@ from replay.splitters import RatioSplitter
|
|
|
90
58
|
spark = State().session
|
|
91
59
|
|
|
92
60
|
ml_1m = MovieLens("1m")
|
|
93
|
-
K=10
|
|
61
|
+
K = 10
|
|
94
62
|
|
|
95
|
-
# data
|
|
96
|
-
interactions =
|
|
63
|
+
# convert data to polars
|
|
64
|
+
interactions = from_pandas(ml_1m.ratings)
|
|
97
65
|
|
|
98
66
|
# data splitting
|
|
99
67
|
splitter = RatioSplitter(
|
|
@@ -107,7 +75,7 @@ splitter = RatioSplitter(
|
|
|
107
75
|
)
|
|
108
76
|
train, test = splitter.split(interactions)
|
|
109
77
|
|
|
110
|
-
#
|
|
78
|
+
# datasets creation
|
|
111
79
|
feature_schema = FeatureSchema(
|
|
112
80
|
[
|
|
113
81
|
FeatureInfo(
|
|
@@ -133,20 +101,18 @@ feature_schema = FeatureSchema(
|
|
|
133
101
|
]
|
|
134
102
|
)
|
|
135
103
|
|
|
136
|
-
train_dataset = Dataset(
|
|
137
|
-
|
|
138
|
-
interactions=train,
|
|
139
|
-
)
|
|
140
|
-
test_dataset = Dataset(
|
|
141
|
-
feature_schema=feature_schema,
|
|
142
|
-
interactions=test,
|
|
143
|
-
)
|
|
104
|
+
train_dataset = Dataset(feature_schema=feature_schema, interactions=train)
|
|
105
|
+
test_dataset = Dataset(feature_schema=feature_schema, interactions=test)
|
|
144
106
|
|
|
145
107
|
# data encoding
|
|
146
108
|
encoder = DatasetLabelEncoder()
|
|
147
109
|
train_dataset = encoder.fit_transform(train_dataset)
|
|
148
110
|
test_dataset = encoder.transform(test_dataset)
|
|
149
111
|
|
|
112
|
+
# convert datasets to spark
|
|
113
|
+
train_dataset.to_spark()
|
|
114
|
+
test_dataset.to_spark()
|
|
115
|
+
|
|
150
116
|
# model training
|
|
151
117
|
model = ItemKNN()
|
|
152
118
|
model.fit(train_dataset)
|
|
@@ -173,6 +139,44 @@ metrics.add_result("ItemKNN", recs)
|
|
|
173
139
|
print(metrics.results)
|
|
174
140
|
```
|
|
175
141
|
|
|
142
|
+
<a name="installation"></a>
|
|
143
|
+
## 🔧 Installation
|
|
144
|
+
|
|
145
|
+
Installation via `pip` package manager is recommended by default:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
pip install replay-rec
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
In this case the `core` package will be installed without `PySpark` and `PyTorch` dependencies.
|
|
152
|
+
Also `experimental` submodule will not be installed.
|
|
153
|
+
|
|
154
|
+
To install `experimental` submodule please specify the version with `rc0` suffix.
|
|
155
|
+
For example:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
pip install replay-rec==XX.YY.ZZrc0
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Extras
|
|
162
|
+
|
|
163
|
+
In addition to the core package, several extras are also provided, including:
|
|
164
|
+
- `[spark]`: Install PySpark functionality
|
|
165
|
+
- `[torch]`: Install PyTorch and Lightning functionality
|
|
166
|
+
- `[all]`: `[spark]` `[torch]`
|
|
167
|
+
|
|
168
|
+
Example:
|
|
169
|
+
```bash
|
|
170
|
+
# Install core package with PySpark dependency
|
|
171
|
+
pip install replay-rec[spark]
|
|
172
|
+
|
|
173
|
+
# Install package with experimental submodule and PySpark dependency
|
|
174
|
+
pip install replay-rec[spark]==XX.YY.ZZrc0
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
To build RePlay from sources please use the [instruction](CONTRIBUTING.md#installing-from-the-source).
|
|
178
|
+
|
|
179
|
+
|
|
176
180
|
<a name="examples"></a>
|
|
177
181
|
## 📑 Resources
|
|
178
182
|
|
|
@@ -185,14 +189,19 @@ print(metrics.results)
|
|
|
185
189
|
6. [06_item2item_recommendations.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/06_item2item_recommendations.ipynb) - Item to Item recommendations example.
|
|
186
190
|
7. [07_filters.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/07_filters.ipynb) - An example of using filters.
|
|
187
191
|
8. [08_recommending_for_categories.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/08_recommending_for_categories.ipynb) - An example of recommendation for product categories.
|
|
188
|
-
9. [09_sasrec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/09_sasrec_example.ipynb) - An example of using
|
|
189
|
-
|
|
192
|
+
9. [09_sasrec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/09_sasrec_example.ipynb) - An example of using transformer-based SASRec model to generate recommendations.
|
|
193
|
+
10. [10_bert4rec_example.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/10_bert4rec_example.ipynb) - An example of using transformer-based BERT4Rec model to generate recommendations.
|
|
194
|
+
11. [11_sasrec_dataframes_comparison.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/11_sasrec_dataframes_comparison.ipynb) - speed comparison of using different frameworks (pandas, polars, pyspark) for data processing during SASRec training.
|
|
195
|
+
12. [12_neural_ts_exp.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/12_neural_ts_exp.ipynb) - An example of using Neural Thompson Sampling bandit model (based on Wide&Deep architecture).
|
|
196
|
+
13. [13_personalized_bandit_comparison.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/13_personalized_bandit_comparison.ipynb) - A comparison of context-free and contextual bandit models.
|
|
197
|
+
14. [14_hierarchical_recommender.ipynb](https://github.com/sb-ai-lab/RePlay/blob/main/examples/14_hierarchical_recommender.ipynb) - An example of using HierarchicalRecommender with user-disjoint LinUCB.
|
|
190
198
|
|
|
191
199
|
### Videos and papers
|
|
192
200
|
* **Video guides**:
|
|
193
201
|
- [Replay for offline recommendations, AI Journey 2021](https://www.youtube.com/watch?v=ejQZKGAG0xs)
|
|
194
202
|
|
|
195
203
|
* **Research papers**:
|
|
204
|
+
- [RePlay: a Recommendation Framework for Experimentation and Production Use](https://arxiv.org/abs/2409.07272) Alexey Vasilev, Anna Volodkevich, Denis Kulandin, Tatiana Bysheva, Anton Klenitskiy. In The 18th ACM Conference on Recommender Systems (RecSys '24)
|
|
196
205
|
- [Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?](https://doi.org/10.1145/3604915.3610644) Anton Klenitskiy, Alexey Vasilev. In The 17th ACM Conference on Recommender Systems (RecSys '23)
|
|
197
206
|
- [The Long Tail of Context: Does it Exist and Matter?](https://arxiv.org/abs/2210.01023). Konstantin Bauman, Alexey Vasilev, Alexander Tuzhilin. In Workshop on Context-Aware Recommender Systems (CARS) (RecSys '22)
|
|
198
207
|
- [Multiobjective Evaluation of Reinforcement Learning Based Recommender Systems](https://doi.org/10.1145/3523227.3551485). Alexey Grishanov, Anastasia Ianina, Konstantin Vorontsov. In The 16th ACM Conference on Recommender Systems (RecSys '22)
|
|
@@ -202,3 +211,4 @@ print(metrics.results)
|
|
|
202
211
|
## 💡 Contributing to RePlay
|
|
203
212
|
|
|
204
213
|
We welcome community contributions. For details please check our [contributing guidelines](CONTRIBUTING.md).
|
|
214
|
+
|
|
@@ -41,7 +41,7 @@ exclude = [
|
|
|
41
41
|
"replay/conftest.py",
|
|
42
42
|
"replay/experimental",
|
|
43
43
|
]
|
|
44
|
-
version = "0.18.
|
|
44
|
+
version = "0.18.1"
|
|
45
45
|
|
|
46
46
|
[tool.poetry.dependencies]
|
|
47
47
|
python = ">=3.8.1, <3.12"
|
|
@@ -53,11 +53,13 @@ scipy = "^1.8.1"
|
|
|
53
53
|
psutil = "~6.0.0"
|
|
54
54
|
scikit-learn = "^1.0.2"
|
|
55
55
|
pyarrow = ">=12.0.1"
|
|
56
|
+
openvino = {version = "~2024.3.0", optional = true}
|
|
57
|
+
onnx = {version = "~1.16.2", optional = true}
|
|
56
58
|
pyspark = [
|
|
57
59
|
{version = ">=3.4,<3.6", python = ">=3.11,<3.12", optional = true},
|
|
58
60
|
{version = ">=3.0,<3.6", python = ">=3.8.1,<3.11", optional = true},
|
|
59
61
|
]
|
|
60
|
-
torch = {version = ">=1.8, <=2.
|
|
62
|
+
torch = {version = ">=1.8, <=2.5.0", optional = true}
|
|
61
63
|
lightning = {version = ">=2.0.2, <=2.4.0", optional = true}
|
|
62
64
|
pytorch-ranger = {version = "^0.1.1", optional = true}
|
|
63
65
|
fixed-install-nmslib = "2.1.2"
|
|
@@ -66,7 +68,8 @@ hnswlib = "^0.7.0"
|
|
|
66
68
|
[tool.poetry.extras]
|
|
67
69
|
spark = ["pyspark"]
|
|
68
70
|
torch = ["torch", "pytorch-ranger", "lightning"]
|
|
69
|
-
|
|
71
|
+
torch-openvino = ["torch", "pytorch-ranger", "lightning", "openvino", "onnx"]
|
|
72
|
+
all = ["pyspark", "torch", "pytorch-ranger", "lightning", "openvino", "onnx"]
|
|
70
73
|
|
|
71
74
|
[tool.poetry.group.dev.dependencies]
|
|
72
75
|
jupyter = "~1.0.0"
|
|
@@ -85,10 +88,11 @@ myst-parser = "1.0.0"
|
|
|
85
88
|
ghp-import = "2.1.0"
|
|
86
89
|
docutils = "0.16"
|
|
87
90
|
data-science-types = "0.2.23"
|
|
91
|
+
filelock = "~3.14.0"
|
|
88
92
|
|
|
89
93
|
[tool.poetry-dynamic-versioning]
|
|
90
94
|
enable = false
|
|
91
|
-
format-jinja = """0.18.
|
|
95
|
+
format-jinja = """0.18.1{{ env['PACKAGE_SUFFIX'] }}"""
|
|
92
96
|
vcs = "git"
|
|
93
97
|
|
|
94
98
|
[tool.ruff]
|
|
@@ -458,13 +458,23 @@ class Dataset:
|
|
|
458
458
|
if feature.feature_hint in [FeatureHint.ITEM_ID, FeatureHint.QUERY_ID]:
|
|
459
459
|
return nunique(self._ids_feature_map[feature.feature_hint], column)
|
|
460
460
|
assert feature.feature_source
|
|
461
|
+
if feature.feature_type == FeatureType.CATEGORICAL_LIST:
|
|
462
|
+
if self.is_spark:
|
|
463
|
+
data = (
|
|
464
|
+
self._feature_source_map[feature.feature_source]
|
|
465
|
+
.select(column)
|
|
466
|
+
.withColumn(column, sf.explode(column))
|
|
467
|
+
)
|
|
468
|
+
else:
|
|
469
|
+
data = self._feature_source_map[feature.feature_source][[column]].explode(column)
|
|
470
|
+
return nunique(data, column)
|
|
461
471
|
return nunique(self._feature_source_map[feature.feature_source], column)
|
|
462
472
|
|
|
463
473
|
return callback
|
|
464
474
|
|
|
465
475
|
def _set_cardinality(self, features_list: Sequence[FeatureInfo]) -> None:
|
|
466
476
|
for feature in features_list:
|
|
467
|
-
if feature.feature_type
|
|
477
|
+
if feature.feature_type in [FeatureType.CATEGORICAL, FeatureType.CATEGORICAL_LIST]:
|
|
468
478
|
feature._set_cardinality_callback(self._get_cardinality(feature))
|
|
469
479
|
|
|
470
480
|
def _fill_feature_schema(self, feature_schema: FeatureSchema) -> FeatureSchema:
|
|
@@ -581,6 +591,7 @@ class Dataset:
|
|
|
581
591
|
data: DataFrameLike,
|
|
582
592
|
column: str,
|
|
583
593
|
source: FeatureSource,
|
|
594
|
+
feature_type: FeatureType,
|
|
584
595
|
cardinality: Optional[int],
|
|
585
596
|
) -> None:
|
|
586
597
|
"""
|
|
@@ -593,6 +604,16 @@ class Dataset:
|
|
|
593
604
|
Option: Keep this criterion, but suggest the user to disable the check if he understands
|
|
594
605
|
that the criterion will not pass.
|
|
595
606
|
"""
|
|
607
|
+
if feature_type == FeatureType.CATEGORICAL_LIST: # explode column if list
|
|
608
|
+
data = data.withColumn(column, sf.explode(column)) if self.is_spark else data[[column]].explode(column)
|
|
609
|
+
|
|
610
|
+
if self.is_pandas:
|
|
611
|
+
try:
|
|
612
|
+
data[column] = data[column].astype(int)
|
|
613
|
+
except Exception:
|
|
614
|
+
msg = f"IDs in {source.name}.{column} are not encoded. They are not int."
|
|
615
|
+
raise ValueError(msg)
|
|
616
|
+
|
|
596
617
|
if self.is_pandas:
|
|
597
618
|
is_int = np.issubdtype(dict(data.dtypes)[column], int)
|
|
598
619
|
elif self.is_spark:
|
|
@@ -632,6 +653,7 @@ class Dataset:
|
|
|
632
653
|
self.interactions,
|
|
633
654
|
feature.column,
|
|
634
655
|
FeatureSource.INTERACTIONS,
|
|
656
|
+
feature.feature_type,
|
|
635
657
|
feature.cardinality,
|
|
636
658
|
)
|
|
637
659
|
if self.item_features is not None:
|
|
@@ -639,6 +661,7 @@ class Dataset:
|
|
|
639
661
|
self.item_features,
|
|
640
662
|
feature.column,
|
|
641
663
|
FeatureSource.ITEM_FEATURES,
|
|
664
|
+
feature.feature_type,
|
|
642
665
|
feature.cardinality,
|
|
643
666
|
)
|
|
644
667
|
elif feature.feature_hint == FeatureHint.QUERY_ID:
|
|
@@ -646,6 +669,7 @@ class Dataset:
|
|
|
646
669
|
self.interactions,
|
|
647
670
|
feature.column,
|
|
648
671
|
FeatureSource.INTERACTIONS,
|
|
672
|
+
feature.feature_type,
|
|
649
673
|
feature.cardinality,
|
|
650
674
|
)
|
|
651
675
|
if self.query_features is not None:
|
|
@@ -653,6 +677,7 @@ class Dataset:
|
|
|
653
677
|
self.query_features,
|
|
654
678
|
feature.column,
|
|
655
679
|
FeatureSource.QUERY_FEATURES,
|
|
680
|
+
feature.feature_type,
|
|
656
681
|
feature.cardinality,
|
|
657
682
|
)
|
|
658
683
|
else:
|
|
@@ -661,6 +686,7 @@ class Dataset:
|
|
|
661
686
|
data,
|
|
662
687
|
feature.column,
|
|
663
688
|
feature.feature_source,
|
|
689
|
+
feature.feature_type,
|
|
664
690
|
feature.cardinality,
|
|
665
691
|
)
|
|
666
692
|
|
|
@@ -8,8 +8,8 @@ Contains classes for encoding categorical data
|
|
|
8
8
|
import warnings
|
|
9
9
|
from typing import Dict, Iterable, Iterator, Optional, Sequence, Set, Union
|
|
10
10
|
|
|
11
|
-
from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource
|
|
12
|
-
from replay.preprocessing import LabelEncoder, LabelEncodingRule
|
|
11
|
+
from replay.data import Dataset, FeatureHint, FeatureSchema, FeatureSource, FeatureType
|
|
12
|
+
from replay.preprocessing import LabelEncoder, LabelEncodingRule, SequenceEncodingRule
|
|
13
13
|
from replay.preprocessing.label_encoder import HandleUnknownStrategies
|
|
14
14
|
|
|
15
15
|
|
|
@@ -62,7 +62,10 @@ class DatasetLabelEncoder:
|
|
|
62
62
|
|
|
63
63
|
self._fill_features_columns(dataset.feature_schema)
|
|
64
64
|
for column, feature_info in dataset.feature_schema.categorical_features.items():
|
|
65
|
-
|
|
65
|
+
encoding_rule_class = (
|
|
66
|
+
SequenceEncodingRule if feature_info.feature_type == FeatureType.CATEGORICAL_LIST else LabelEncodingRule
|
|
67
|
+
)
|
|
68
|
+
encoding_rule = encoding_rule_class(
|
|
66
69
|
column, handle_unknown=self._handle_unknown_rule, default_value=self._default_value_rule
|
|
67
70
|
)
|
|
68
71
|
if feature_info.feature_hint == FeatureHint.QUERY_ID:
|