recnexteval-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
recnexteval/__init__.py
ADDED
@@ -0,0 +1,20 @@
```python
"""
RecNextEval
-----------

RecNextEval is a Python toolkit for evaluating recommender systems in
different settings. It is primarily designed for evaluation in a
sliding-window setting.
"""

import logging

from recnexteval.utils import prepare_logger


LOGGING_CONFIG_FILENAME = "logging_config.yaml"

prepare_logger(LOGGING_CONFIG_FILENAME)

logger = logging.getLogger(__name__)
logger.info("recnexteval package loaded.")
```
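Importing the package therefore configures logging as a side effect of the `prepare_logger` call above. A minimal sketch of what this means for downstream code, assuming the packaged `logging_config.yaml` attaches handlers to the `recnexteval` logger namespace (an assumption; the YAML file is not shown in this diff):

```python
import logging

import recnexteval  # import runs prepare_logger(...) and logs "recnexteval package loaded."

# Loggers under the "recnexteval" namespace are now configured; downstream
# code can raise or lower verbosity without editing the YAML config file.
logging.getLogger("recnexteval").setLevel(logging.DEBUG)
```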
recnexteval/algorithms/__init__.py
ADDED
@@ -0,0 +1,99 @@
```python
"""
.. currentmodule:: recnexteval.algorithms

The algorithms module in recnexteval contains a collection of baseline algorithms
and several variants of the item-based KNN collaborative filtering algorithm. A
total of three variations of the item-based KNN algorithm are implemented in the
module; they are listed below.

Algorithm
---------
Base class for all algorithms. Programmers should inherit from this class when
implementing a new algorithm. It provides a common interface for all algorithms
such that the expected methods and properties are defined, avoiding runtime
errors.

.. autosummary::
    :toctree: generated/

    Algorithm

Baseline Algorithms
-------------------

The baseline algorithms are simple algorithms that can be used as a reference
point against which to compare the performance of more complex algorithms. The
following baseline algorithms are implemented in the module.

.. autosummary::
    :toctree: generated/

    Random
    RecentPopularity
    DecayPopularity
    MostPopular

Item Similarity Algorithms
--------------------------

Item similarity algorithms exploit relationships between items to make recommendations.
At prediction time, the user is represented by the items they have interacted
with. Each variation below showcases a difference in how the algorithm learns
and predicts. We note that no single variation is better than the others; which
one yields the best performance depends greatly on the dataset and on the
parameters used in the algorithm.

.. autosummary::
    :toctree: generated/

    ItemKNN
    ItemKNNIncremental
    ItemKNNIncrementalMovieLens100K
    ItemKNNRolling
    ItemKNNStatic

Time-Aware Item Similarity Algorithms
-------------------------------------

This is an extension of the item-based KNN algorithm.
The time-aware item-based KNN algorithms apply a decay factor to the timestamps
of interactions, giving heavier weight to more recent interactions.

.. autosummary::
    :toctree: generated/

    TARSItemKNN
    TARSItemKNNLiu
    TARSItemKNNVaz
    TARSItemKNNDing
"""

from .base import Algorithm
from .baseline import MostPopular, Random, RecentPopularity
from .baseline.decay_popularity import DecayPopularity
from .itemknn import ItemKNN, ItemKNNIncremental, ItemKNNIncrementalMovieLens100K, ItemKNNRolling, ItemKNNStatic
from .time_aware_item_knn import (
    TARSItemKNN,
    TARSItemKNNDing,
    TARSItemKNNLiu,
    TARSItemKNNVaz,
)


__all__ = [
    "Algorithm",
    "DecayPopularity",
    "ItemKNN",
    "ItemKNNIncremental",
    "ItemKNNIncrementalMovieLens100K",
    "ItemKNNRolling",
    "ItemKNNStatic",
    "MostPopular",
    "Random",
    "RecentPopularity",
    "TARSItemKNN",
    "TARSItemKNNDing",
    "TARSItemKNNLiu",
    "TARSItemKNNVaz",
]
```
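Given these exports, a baseline can be constructed and fitted directly from the package namespace. A minimal sketch (the `K` parameter comes from the `TopKAlgorithm` constructor in `base.py` below; the toy matrix is illustrative, and passing a raw `csr_matrix` to `fit` relies on `_transform_fit_input` accepting both `InteractionMatrix` and `csr_matrix`, as its type hint suggests):

```python
from scipy.sparse import csr_matrix

from recnexteval.algorithms import MostPopular

# A toy 3-user x 4-item interaction matrix (1 = interaction observed).
X = csr_matrix([[1, 0, 1, 0],
                [1, 1, 0, 0],
                [1, 0, 0, 1]])

algo = MostPopular(K=2)  # keep scores only for the 2 most popular items
algo.fit(X)
print(algo.sorted_scores_)  # popularity vector, nonzero only for the top-2 items
```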
recnexteval/algorithms/base.py
ADDED
@@ -0,0 +1,377 @@
```python
import logging
import time
from abc import abstractmethod
from inspect import Parameter, signature
from typing import Self

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted

from recnexteval.matrix import InteractionMatrix, ItemUserBasedEnum, PredictionMatrix, to_csr_matrix
from ..models import BaseModel, ParamMixin
from ..utils import add_columns_to_csr_matrix, add_rows_to_csr_matrix


logger = logging.getLogger(__name__)


class Algorithm(BaseEstimator, BaseModel, ParamMixin):
    """Base class for all recnexteval algorithm implementations."""

    ITEM_USER_BASED: ItemUserBasedEnum

    def __init__(self) -> None:
        super().__init__()
        if not hasattr(self, "seed"):
            self.seed = 42
        self.rand_gen = np.random.default_rng(seed=self.seed)

    @property
    def description(self) -> str:
        """Description of the algorithm.

        :return: Description of the algorithm
        :rtype: str
        """
        return self.__doc__ or "No description provided."

    @property
    def identifier(self) -> str:
        """Identifier of the object.

        The identifier is made by combining the class name with the parameters
        passed at construction time.

        Constructed by recreating the initialisation call.
        Example: `Algorithm(param_1=value)`

        :return: Identifier of the object
        :rtype: str
        """
        paramstring = ",".join(f"{k}={v}" for k, v in self.get_params().items())
        return self.name + "(" + paramstring + ")"

    @classmethod
    def get_default_params(cls) -> dict:
        """Get default parameters without instantiation.

        Uses inspect.signature to extract __init__ parameters and their
        default values without instantiating the class.

        Returns:
            Dictionary of parameter names to default values.
            Parameters without defaults map to None.
        """
        try:
            sig = signature(cls.__init__)
        except (ValueError, TypeError):
            # Fallback for built-in types or special cases
            return {}

        params = {}
        for param_name, param in sig.parameters.items():
            if param_name == "self":
                continue

            if param.kind in (Parameter.VAR_POSITIONAL, Parameter.VAR_KEYWORD):
                # Skip *args, **kwargs
                continue

            # Extract the default value
            if param.default is not Parameter.empty:
                params[param_name] = param.default
            else:
                params[param_name] = None

        return params

    def __str__(self) -> str:
        return self.name

    def set_params(self, **params) -> Self:
        """Set the parameters of the estimator.

        :param params: Estimator parameters
        :type params: dict
        """
        return super().set_params(**params)

    @abstractmethod
    def _fit(self, X: csr_matrix) -> Self:
        """Stub implementation for fitting an algorithm.

        Will be called by the `fit` wrapper.
        Child classes should implement this function.

        :param X: User-item interaction matrix to fit the model to
        :type X: csr_matrix
        :raises NotImplementedError: Implement this method in the child class
        """
        raise NotImplementedError("Please implement _fit")

    @abstractmethod
    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Stub for predicting scores for users.

        Will be called by the `predict` wrapper.
        Child classes should implement this function.

        :param X: User-item interaction matrix used as input to predict
        :type X: PredictionMatrix
        :raises NotImplementedError: Implement this method in the child class
        :return: Predictions made for all nonzero users in X
        :rtype: csr_matrix
        """
        raise NotImplementedError("Please implement _predict")

    def _check_fit_complete(self) -> None:
        """Helper function to check if the model was correctly fitted.

        Uses the sklearn check_is_fitted function,
        https://scikit-learn.org/stable/modules/generated/sklearn.utils.validation.check_is_fitted.html
        """
        check_is_fitted(self)

    def _transform_fit_input(self, X: InteractionMatrix | csr_matrix) -> csr_matrix:
        """Transform the training data to the expected type.

        Data will be turned into a binary csr matrix.

        :param X: User-item interaction matrix to fit the model to
        :type X: InteractionMatrix | csr_matrix
        :return: Transformed user-item interaction matrix to fit the model
        :rtype: csr_matrix
        """
        return to_csr_matrix(X, binary=True)

    def fit(self, X: InteractionMatrix) -> Self:
        """Fit the model to the input interaction matrix.

        The input data is transformed to the expected type using
        :meth:`_transform_fit_input`. The fitting is done using the
        :meth:`_fit` method. Finally the method checks that the fitting
        was successful using :meth:`_check_fit_complete`.

        :param X: The interactions to fit the model on.
        :type X: InteractionMatrix
        :return: Fitted algorithm
        :rtype: Algorithm
        """
        start = time.time()
        X_transformed = self._transform_fit_input(X)
        self._fit(X_transformed)

        self._check_fit_complete()
        end = time.time()
        logger.debug(f"Fitting {self.name} complete - Took {end - start:.3}s")
        return self

    def _pad_unknown_iid_with_none_strategy(
        self,
        y_pred: csr_matrix,
        current_shape: tuple[int, int],
        intended_shape: tuple[int, int],
    ) -> csr_matrix:
        """Pad the predictions with empty fields for unknown items.

        This ensures that when we compute the performance of the prediction,
        we compare the prediction against the ground truth for the same set
        of items.
        """
        if y_pred.shape == intended_shape:
            return y_pred

        known_user_id, known_item_id = current_shape
        logger.debug(f"Padding item ID in range({known_item_id}, {intended_shape[1]}) with empty fields")
        y_pred = add_columns_to_csr_matrix(y_pred, intended_shape[1] - known_item_id)
        logger.debug(f"Padding by {self.name} completed")
        return y_pred

    # TODO: change X_pred to y_pred for consistency
    def _pad_unknown_uid_with_random_strategy(
        self,
        X_pred: csr_matrix,
        current_shape: tuple[int, int],
        intended_shape: tuple[int, int],
        predict_ui_df: pd.DataFrame,
        k: int = 10,
    ) -> csr_matrix:
        """Pad the predictions with random items for users that are not in the training data.

        :param X_pred: Predictions made by the algorithm
        :type X_pred: csr_matrix
        :param intended_shape: The intended shape of the prediction matrix
        :type intended_shape: tuple[int, int]
        :param predict_ui_df: DataFrame containing the user IDs to predict for
        :type predict_ui_df: pd.DataFrame
        :return: The padded prediction matrix
        :rtype: csr_matrix
        """
        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id, known_item_id = current_shape
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        # Pad unknown users with random items
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with random items")
        to_predict = pd.Series(predict_ui_df.uid.unique())
        # Filter for users not in training data
        filtered = to_predict[to_predict >= known_user_id]
        filtered = filtered.sort_values(ignore_index=True)
        if not filtered.empty:
            row = filtered.values.repeat(k)
            total_pad = len(row)
            col = self.rand_gen.integers(0, known_item_id, total_pad)
            pad = csr_matrix((np.ones(total_pad), (row, col)), shape=intended_shape)
        else:
            pad = csr_matrix(intended_shape)
        X_pred += pad
        logger.debug(f"Padding by {self.name} completed")
        return X_pred

    def predict(self, X: PredictionMatrix) -> csr_matrix:
        """Predict scores, given the interactions in X.

        Checks that the model has been fitted using
        :meth:`_check_fit_complete`, then computes the predictions using
        the :meth:`_predict` method.

        :param X: Interactions to predict from.
        :type X: PredictionMatrix
        :return: The recommendation scores in a sparse matrix format.
        :rtype: csr_matrix
        """
        self._check_fit_complete()
        X_pred = self._predict(X)
        return X_pred


class PopularityPaddingMixin:
    """Mixin class to add popularity-based padding to prediction methods."""

    def __init__(self, pad_with_popularity: bool = False) -> None:
        super().__init__()
        self.pad_with_popularity = pad_with_popularity

    def get_popularity_scores(self, X: csr_matrix) -> np.ndarray:
        """Compute a popularity-based scoring vector for items.

        This method calculates normalized interaction counts for each item,
        selects the top-K most popular items, and returns a vector where
        only those top-K items have their normalized scores (others are 0).
        This is used to pad predictions for unseen users with popular items.

        :param X: The interaction matrix (user-item) to compute popularity from.
        :type X: csr_matrix
        :return: A 1D array of shape (num_items,) with popularity scores for top-K items.
        :rtype: np.ndarray
        """
        interaction_counts = X.sum(axis=0).A[0]
        normalized_scores = interaction_counts / interaction_counts.max()

        num_items = X.shape[1]
        if hasattr(self, "K"):
            k_value = self.K
        else:
            k_value = 100
        if num_items < k_value:
            logger.warning("K is larger than the number of items.")

        effective_k = min(k_value, num_items)
        # Get indices of top-K items by popularity
        top_k_indices = np.argpartition(normalized_scores, -effective_k)[-effective_k:]
        popularity_vector = np.zeros(num_items)
        popularity_vector[top_k_indices] = normalized_scores[top_k_indices]

        return popularity_vector

    def _pad_uknown_uid_with_popularity_strategy(
        self,
        X_pred: csr_matrix,
        intended_shape: tuple,
        predict_ui_df: pd.DataFrame,
    ) -> csr_matrix:
        """Pad the predictions with popular items for users that are not in the training data.

        :param X_pred: Predictions made by the algorithm
        :type X_pred: csr_matrix
        :param intended_shape: The intended shape of the prediction matrix
        :type intended_shape: tuple
        :param predict_ui_df: DataFrame containing the user IDs to predict for
        :type predict_ui_df: pd.DataFrame
        :return: The padded prediction matrix
        :rtype: csr_matrix
        """
        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id, known_item_id = X_pred.shape
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        # Pad unknown users with popular items
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with popular items")
        popular_items = self.get_popularity_scores(X_pred)

        to_predict = predict_ui_df.value_counts("uid")
        # Filter for users not in training data
        filtered = to_predict[to_predict.index >= known_user_id]
        for user_id in filtered.index:
            if user_id >= known_user_id:
                X_pred[user_id, :] = popular_items
        return X_pred


class TopKAlgorithm(Algorithm):
    """Base algorithm for algorithms that recommend the top-K items for every user."""

    def __init__(self, K: int = 10) -> None:
        super().__init__()
        self.K = K


class TopKItemSimilarityMatrixAlgorithm(TopKAlgorithm):
    """Base algorithm for algorithms that fit an item-to-item similarity model with K similar items for every item.

    The model that encodes the similarity between items is expected
    under the ``similarity_matrix_`` attribute.

    This matrix should have shape ``(|items| x |items|)``.
    It can be a dense or a sparse matrix, depending on the algorithm used.

    Predictions are made by computing the dot product of the history vector
    of a user and the similarity matrix.

    Usually a new algorithm will only have to implement the :meth:`_fit`
    method, to construct the `self.similarity_matrix_` attribute.
    """

    similarity_matrix_: csr_matrix

    def _check_fit_complete(self) -> None:
        """Helper function to check if the model was correctly fitted.

        Checks implemented:

        - Checks if the algorithm has been fitted, using sklearn's `check_is_fitted`
        - Checks if the fitted similarity matrix contains similar items for each item

        For failing checks a warning is printed.
        """
        # Use super to check is fitted
        super()._check_fit_complete()

        # Ensure that similarity_matrix_ is computed
        if not hasattr(self, "similarity_matrix_"):
            raise AttributeError(f"{self.name} has no attribute similarity_matrix_ after fitting.")

        # Check row-wise, since that determines the recommendation options.
        items_with_score = set(self.similarity_matrix_.nonzero()[0])

        missing = self.similarity_matrix_.shape[0] - len(items_with_score)
        if missing > 0:
            logger.warning(f"{self.name} missing similar items for {missing} items.")
```
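The top-K selection in `get_popularity_scores` relies on `np.argpartition`, which is worth seeing on concrete numbers. A small self-contained sketch of the same selection logic (toy counts, not package code):

```python
import numpy as np

counts = np.array([5.0, 1.0, 3.0, 0.0, 2.0])  # per-item interaction counts
scores = counts / counts.max()                 # [1.0, 0.2, 0.6, 0.0, 0.4]

k = 2
# argpartition puts the indices of the k largest scores in the last k slots.
top_k = np.argpartition(scores, -k)[-k:]       # indices {0, 2}, in either order

popularity_vector = np.zeros_like(scores)
popularity_vector[top_k] = scores[top_k]       # [1.0, 0.0, 0.6, 0.0, 0.0]
print(popularity_vector)
```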
recnexteval/algorithms/baseline/decay_popularity.py
ADDED
@@ -0,0 +1,110 @@
```python
from warnings import warn

import numpy as np
from scipy.sparse import csr_matrix, hstack, lil_matrix

from ...matrix import InteractionMatrix
from ..base import Algorithm


class DecayPopularity(Algorithm):
    """A popularity-based algorithm with exponential decay over data from earlier time windows."""

    IS_BASE: bool = False

    def __init__(self, K: int = 200) -> None:
        super().__init__()
        self.K = K
        self.historical_data: list[csr_matrix] = []  # Store all historical training data
        self.num_items = 0  # Track the maximum number of items seen so far

    def _pad_matrix(self, matrix: csr_matrix, new_num_items: int) -> csr_matrix:
        """Pad a sparse matrix with zero columns to match the new number of items.

        :param matrix: The matrix to pad
        :type matrix: csr_matrix
        :param new_num_items: The target number of columns
        :type new_num_items: int
        :return: The padded matrix
        :rtype: csr_matrix
        """
        if matrix.shape[1] >= new_num_items:
            return matrix
        padding = csr_matrix((matrix.shape[0], new_num_items - matrix.shape[1]))
        # Stack sparsely; densifying via toarray() would waste memory here.
        return csr_matrix(hstack([matrix, padding]))

    def _expand_historical_data(self, new_num_items: int):
        """Expand all matrices in historical_data to match the new number of items.

        :param new_num_items: The updated number of items
        :type new_num_items: int
        """
        for i in range(len(self.historical_data)):
            if self.historical_data[i].shape[1] < new_num_items:
                self.historical_data[i] = self._pad_matrix(self.historical_data[i], new_num_items)

    def _fit(self, X: csr_matrix) -> "DecayPopularity":
        """Fit the model by applying decay to historical data and adding new data.

        :param X: Interaction matrix (users x items) for the current window
        :type X: csr_matrix
        """
        # Update the maximum number of items
        new_num_items = X.shape[1]
        if new_num_items > self.num_items:
            self._expand_historical_data(new_num_items)
            self.num_items = new_num_items

        # Ensure the new matrix has the correct number of items
        if X.shape[1] < self.num_items:
            X = self._pad_matrix(X, self.num_items)

        # Append new data to historical data
        self.historical_data.append(X)

        # Initialize decayed scores
        num_items = X.shape[1]
        if num_items < self.K:
            warn("K is larger than the number of items.", UserWarning)

        decayed_scores = np.zeros(num_items)

        # Apply decay to each historical matrix
        for i, matrix in enumerate(self.historical_data):
            # length 2: i = 0 -> 2-1-0 = 1, i = 1 -> 2-1-1 = 0
            # length 3: i = 0 -> 3-1-0 = 2, i = 1 -> 3-1-1 = 1, i = 2 -> 3-1-2 = 0
            decay_factor = np.exp(-(len(self.historical_data) - 1 - i))
            decayed_scores += matrix.sum(axis=0).A[0] * decay_factor

        normalized_scores = decayed_scores / decayed_scores.max()

        K = min(self.K, num_items)
        ind = np.argpartition(normalized_scores, -K)[-K:]
        a = np.zeros(num_items)
        a[ind] = normalized_scores[ind]
        self.decayed_scores_ = a
        return self

    def _predict(self, X: csr_matrix, predict_im: InteractionMatrix) -> csr_matrix:
        """Predict the K most popular items for each user, scaled by the decay factor."""
        if predict_im is None:
            raise AttributeError("Predict frame with requested ID is required for Popularity algorithm")

        predict_frame = predict_im._df

        users = predict_frame["uid"].unique().tolist()
        known_item_id = X.shape[1]

        # predict_frame contains (user_id, -1) pairs
        max_user_id = predict_frame["uid"].max() + 1
        intended_shape = (max(max_user_id, X.shape[0]), known_item_id)

        X_pred = lil_matrix(intended_shape)
        X_pred[users] = self.decayed_scores_

        return X_pred.tocsr()
```
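The decay loop weights each stored window by `exp(-(n - 1 - i))`, so the newest window gets weight 1 and every older window is down-weighted by another factor of e. A small numeric sketch of the weighting (toy counts, not package code):

```python
import numpy as np

# Popularity counts per item for three consecutive training windows (oldest first).
windows = [
    np.array([4.0, 0.0, 1.0]),  # oldest window: weight exp(-2) ~ 0.135
    np.array([1.0, 2.0, 0.0]),  # middle window: weight exp(-1) ~ 0.368
    np.array([0.0, 1.0, 3.0]),  # newest window: weight exp(0) = 1.0
]

decayed = np.zeros(3)
for i, counts in enumerate(windows):
    decayed += counts * np.exp(-(len(windows) - 1 - i))

print(decayed / decayed.max())  # normalized decayed popularity per item
```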
recnexteval/algorithms/baseline/most_popular.py
ADDED
@@ -0,0 +1,72 @@
```python
import logging
from typing import Self

import numpy as np
from scipy.sparse import csr_matrix, hstack, vstack

from ...matrix import PredictionMatrix
from ..base import PopularityPaddingMixin, TopKAlgorithm


logger = logging.getLogger(__name__)


class MostPopular(TopKAlgorithm, PopularityPaddingMixin):
    """A popularity-based algorithm that considers all historical data."""

    IS_BASE: bool = False
    X_: csr_matrix | None = None  # Store all historical training data

    def _append_training_data(self, X: csr_matrix) -> None:
        """Append a new interaction matrix to the historical data.

        Args:
            X (csr_matrix): Interaction matrix to append
        """
        if self.X_ is None:
            raise ValueError("No existing training data to append to.")
        X_prev: csr_matrix = self.X_.copy()
        new_num_rows = max(X_prev.shape[0], X.shape[0])
        new_num_cols = max(X_prev.shape[1], X.shape[1])
        # Pad the previous matrix
        if X_prev.shape[0] < new_num_rows:  # Pad rows
            row_padding = csr_matrix((new_num_rows - X_prev.shape[0], X_prev.shape[1]))
            X_prev = vstack([X_prev, row_padding])
        if X_prev.shape[1] < new_num_cols:  # Pad columns
            col_padding = csr_matrix((X_prev.shape[0], new_num_cols - X_prev.shape[1]))
            X_prev = hstack([X_prev, col_padding])

        # Pad the current matrix
        if X.shape[0] < new_num_rows:  # Pad rows
            row_padding = csr_matrix((new_num_rows - X.shape[0], X.shape[1]))
            X = vstack([X, row_padding])
        if X.shape[1] < new_num_cols:  # Pad columns
            col_padding = csr_matrix((X.shape[0], new_num_cols - X.shape[1]))
            X = hstack([X, col_padding])

        # Merge data
        self.X_ = X_prev + X

    def _fit(self, X: csr_matrix) -> Self:
        if self.X_ is not None:
            self._append_training_data(X)
        else:
            self.X_ = X.copy()

        if not isinstance(self.X_, csr_matrix):
            raise ValueError("Training data is not initialized properly.")

        if self.X_.shape[1] < self.K:
            logger.warning("K is larger than the number of items.")

        self.sorted_scores_ = self.get_popularity_scores(self.X_)
        return self

    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        intended_shape = (X.get_prediction_data().num_interactions, X.user_item_shape[1])

        # Vectorized: repeat the sorted scores for each prediction row
        data = np.tile(self.sorted_scores_, (intended_shape[0], 1))
        X_pred = csr_matrix(data)

        return X_pred
```
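Merging interaction matrices of different shapes by zero-padding, as `_append_training_data` does, can be sanity-checked in isolation. A minimal sketch of the same pad-then-add pattern on toy matrices (this example only pads the dimensions that actually differ):

```python
from scipy.sparse import csr_matrix, hstack, vstack

A = csr_matrix([[1, 0],
                [0, 1]])     # 2 users x 2 items from an earlier window
B = csr_matrix([[0, 1, 1]])  # 1 user x 3 items from a newer window

rows, cols = max(A.shape[0], B.shape[0]), max(A.shape[1], B.shape[1])

# Zero-pad both matrices to the common (rows, cols) shape before adding.
A = hstack([A, csr_matrix((A.shape[0], cols - A.shape[1]))])  # add missing column
B = vstack([B, csr_matrix((rows - B.shape[0], B.shape[1]))])  # add missing row

merged = (A + B).tocsr()      # accumulated interaction counts, shape (2, 3)
print(merged.toarray())       # [[1 1 1], [0 1 0]]
```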