recnexteval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Self
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy.sparse import csr_matrix
|
|
6
|
+
|
|
7
|
+
from ...matrix import PredictionMatrix
|
|
8
|
+
from ..base import TopKAlgorithm
|
|
9
|
+
from ..utils import get_top_K_values
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Random(TopKAlgorithm):
    """Baseline recommender that scores randomly drawn items.

    For every distinct user in the prediction frame, ``K`` item indices are
    drawn and each draw contributes a value of one to that user's score row.
    """

    IS_BASE: bool = False

    def _fit(self, X: csr_matrix) -> Self:  # noqa: ARG002
        """Mark the model as fitted; the training matrix is not used."""
        self.fit_complete_ = True
        return self

    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Return one row of random item scores per row of the prediction frame.

        :param X: prediction matrix providing the prediction frame and id bounds.
        :type X: PredictionMatrix
        :return: score matrix whose rows follow the ``uid`` column of the frame.
        :rtype: csr_matrix
        """
        frame = X.get_prediction_data()._df  # noqa: SLF001

        n_known_items = X.max_known_item_id
        target_shape = (X.max_global_user_id, n_known_items)

        # Row coordinates: the sorted distinct user ids, each repeated K times.
        users = pd.Series(frame.uid.unique()).sort_values(ignore_index=True)
        row_idx = users.values.repeat(self.K)
        n_draws = len(row_idx)

        # Column coordinates: one draw per row coordinate, in a single call.
        # NOTE(review): rand_gen is presumably a numpy Generator, so the upper
        # bound would be exclusive - confirm against the base class.
        col_idx = self.rand_gen.integers(0, n_known_items, n_draws)

        random_scores = csr_matrix(
            (np.ones(n_draws), (row_idx, col_idx)),
            shape=target_shape,
        )

        # Keep the top K values per user, then emit one row per prediction row.
        top_k = get_top_K_values(random_scores, K=self.K)
        return top_k[frame["uid"].values]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Self
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from scipy.sparse import csr_matrix
|
|
7
|
+
|
|
8
|
+
from ...matrix import PredictionMatrix
|
|
9
|
+
from ..base import PopularityPaddingMixin, TopKAlgorithm
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RecentPopularity(TopKAlgorithm, PopularityPaddingMixin):
    """Popularity recommender driven by the most recently fitted training data.

    Every call to ``_fit`` replaces the stored popularity scores with the ones
    computed from the matrix passed in.
    """

    IS_BASE: bool = False

    def _fit(self, X: csr_matrix) -> Self:
        """Store the popularity scores computed from ``X``."""
        self.sorted_scores_ = self.get_popularity_scores(X)
        return self

    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Give every prediction row the same stored popularity scores.

        :param X: prediction matrix; only its number of prediction
            interactions determines the number of output rows.
        :type X: PredictionMatrix
        :return: matrix with one copy of the stored scores per prediction row.
        :rtype: csr_matrix
        """
        # The item dimension is read but does not influence the result.
        n_rows, _n_items = (
            X.get_prediction_data().num_interactions,
            X.user_item_shape[1],
        )

        # Stack the score vector once per prediction row, then sparsify.
        tiled_scores = np.tile(self.sorted_scores_, (n_rows, 1))
        return csr_matrix(tiled_scores)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .itemknn import ItemKNN
|
|
2
|
+
from .itemknn_incremental import ItemKNNIncremental
|
|
3
|
+
from .itemknn_incremental_movielens import ItemKNNIncrementalMovieLens100K
|
|
4
|
+
from .itemknn_rolling import ItemKNNRolling
|
|
5
|
+
from .itemknn_static import ItemKNNStatic
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Names exported by the itemknn subpackage (used by ``from ... import *``).
__all__ = [
    "ItemKNN",
    "ItemKNNIncremental",
    "ItemKNNIncrementalMovieLens100K",
    "ItemKNNRolling",
    "ItemKNNStatic",
]
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Self
|
|
3
|
+
|
|
4
|
+
from scipy.sparse import csr_matrix
|
|
5
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
6
|
+
|
|
7
|
+
from recnexteval.matrix import ItemUserBasedEnum, PredictionMatrix
|
|
8
|
+
from ..base import PopularityPaddingMixin, TopKItemSimilarityMatrixAlgorithm
|
|
9
|
+
from ..utils import get_top_K_values
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def compute_cosine_similarity(X: csr_matrix) -> csr_matrix:
    """Return the item-to-item cosine similarity matrix of ``X``.

    The diagonal is set to zero so that an item is never its own neighbour.

    :param X: user x item matrix with scores per user, item pair.
    :type X: csr_matrix
    :return: similarity matrix
    :rtype: csr_matrix
    """
    # Transpose so that items are the rows being compared; without it the
    # result would be a user-to-user similarity.
    similarities = cosine_similarity(X.T, dense_output=False)

    if isinstance(similarities, csr_matrix):
        item_similarities = similarities
    else:
        item_similarities = csr_matrix(similarities)

    # Self similarity is not supported.
    item_similarities.setdiag(0)
    return item_similarities
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ItemKNN(TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin):
    """Item K Nearest Neighbours model.

    First described in 'Item-based top-n recommendation algorithms.' :cite:`10.1145/963770.963776`

    This code is adapted from RecPack :cite:`recpack`

    For each item the K most similar items are computed during fit.
    Similarity parameter decides how to compute the similarity between two items.

    Cosine similarity between item i and j is computed as

    .. math::
        sim(i,j) = \\frac{X_i X_j}{||X_i||_2 ||X_j||_2}

    :param K: How many neighbours to use per item,
        make sure to pick a value below the number of columns of the matrix to fit on.
        The default comes from the inherited constructor (the RecPack
        documentation this was adapted from states 200 - TODO confirm).
    :type K: int, optional
    """

    ITEM_USER_BASED = ItemUserBasedEnum.ITEM

    def _fit(self, X: csr_matrix) -> Self:
        """Fit a cosine similarity matrix from item to item.

        We assume that X is a binary matrix of shape (n_users, n_items).
        A copy of ``X`` is kept because prediction multiplies it with the
        similarity matrix.
        """
        item_similarities = compute_cosine_similarity(X)
        item_similarities = get_top_K_values(item_similarities, K=self.K)

        self.similarity_matrix_ = item_similarities
        self.X_ = X.copy()
        return self

    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Score items for the users in the prediction frame.

        :param X: prediction matrix providing the prediction frame and the
            global user/item id bounds.
        :type X: PredictionMatrix
        :return: score matrix.
        :rtype: csr_matrix
        """
        predict_ui_df = X.get_prediction_data()._df  # noqa: SLF001

        # The full training matrix is used as features, without filtering it
        # down to the users that need a prediction.
        features: csr_matrix = self.X_
        scores = features @ self.similarity_matrix_

        if not isinstance(scores, csr_matrix):
            scores = csr_matrix(scores)

        intended_shape = (X.max_global_user_id, X.max_global_item_id)

        # NOTE(review): this early return hands back the full score matrix,
        # whereas the path below returns rows selected by the ``uid`` column.
        # Confirm with the caller which of the two shapes is expected.
        if scores.shape == intended_shape:
            return scores

        # The shape can fall short in two ways:
        # 1. unknown items were never seen during fit -> missing columns
        # 2. unknown users were never seen during fit -> missing rows

        # Case 1: pad the missing item columns.
        if scores.shape[1] < intended_shape[1]:
            scores = self._pad_unknown_iid_with_none_strategy(
                y_pred=scores,
                current_shape=scores.shape,
                intended_shape=intended_shape,
            )

        # Case 2: pad the missing user rows, with popular or random items.
        if self.pad_with_popularity:
            scores = self._pad_uknown_uid_with_popularity_strategy(
                X_pred=scores,
                intended_shape=intended_shape,
                predict_ui_df=predict_ui_df,
            )
        else:
            scores = self._pad_unknown_uid_with_random_strategy(
                X_pred=scores,
                current_shape=scores.shape,
                intended_shape=intended_shape,
                predict_ui_df=predict_ui_df,
            )

        # One output row per row of the prediction frame.
        pred = scores[predict_ui_df["uid"].values]
        return pred
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Self
|
|
3
|
+
|
|
4
|
+
from scipy.sparse import csr_matrix, hstack, vstack
|
|
5
|
+
|
|
6
|
+
from ..base import PopularityPaddingMixin, TopKItemSimilarityMatrixAlgorithm
|
|
7
|
+
from .itemknn import ItemKNN
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ItemKNNIncremental(ItemKNN):
    """Incremental version of ItemKNN algorithm.

    This class extends the ItemKNN algorithm to allow for incremental updates
    to the model. Each new training matrix is added to the historical matrix
    (after padding both to a common shape) and the model is refitted on the sum.
    """

    IS_BASE: bool = False

    def __init__(self, K: int = 10, pad_with_popularity: bool = True) -> None:
        """Initialise both parent classes explicitly and start with no history.

        :param K: number of neighbours to keep per item.
        :type K: int, optional
        :param pad_with_popularity: forwarded to ``PopularityPaddingMixin``.
        :type pad_with_popularity: bool, optional
        """
        PopularityPaddingMixin.__init__(self, pad_with_popularity=pad_with_popularity)
        TopKItemSimilarityMatrixAlgorithm.__init__(self, K=K)
        self.X_: None | csr_matrix = None

    @staticmethod
    def _pad_to_shape(matrix: csr_matrix, num_rows: int, num_cols: int) -> csr_matrix:
        """Grow ``matrix`` with empty rows/columns up to (num_rows, num_cols).

        The matrix is returned unchanged when it already has the target shape.
        """
        if matrix.shape[0] < num_rows:
            row_padding = csr_matrix((num_rows - matrix.shape[0], matrix.shape[1]))
            matrix = vstack([matrix, row_padding], format="csr")
        if matrix.shape[1] < num_cols:
            col_padding = csr_matrix((matrix.shape[0], num_cols - matrix.shape[1]))
            matrix = hstack([matrix, col_padding], format="csr")
        return matrix

    def _append_training_data(self, X: csr_matrix) -> None:
        """Append a new interaction matrix to the historical data.

        Args:
            X (csr_matrix): Interaction matrix to append

        Raises:
            ValueError: If there is no historical data yet.
        """
        if self.X_ is None:
            raise ValueError("No existing training data to append to.")

        # Both matrices are padded to the element-wise maximum of their shapes.
        num_rows = max(self.X_.shape[0], X.shape[0])
        num_cols = max(self.X_.shape[1], X.shape[1])

        # Addition builds a new matrix, so the stored history needs no copy.
        self.X_ = self._pad_to_shape(self.X_, num_rows, num_cols) + self._pad_to_shape(
            X, num_rows, num_cols
        )

    def _fit(self, X: csr_matrix) -> Self:
        """Fit a cosine similarity matrix from item to item.

        The first call fits on ``X`` alone; later calls merge ``X`` into the
        stored history first and fit on the merged matrix.
        """
        if self.X_ is None:
            super()._fit(X)
        else:
            self._append_training_data(X)
            super()._fit(self.X_)
        return self
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy.sparse import csr_matrix
|
|
6
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
7
|
+
from sklearn.preprocessing import OneHotEncoder
|
|
8
|
+
|
|
9
|
+
from ...matrix import InteractionMatrix
|
|
10
|
+
from ...utils import add_rows_to_csr_matrix
|
|
11
|
+
from .itemknn_incremental import ItemKNNIncremental
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ItemKNNIncrementalMovieLens100K(ItemKNNIncremental):
    """Incremental version of ItemKNN algorithm with MovieLens100k Metadata.

    This class extends the incremental ItemKNN algorithm. Users that are not
    covered by the parent's prediction are given the scores of the known user
    whose metadata is most similar to theirs.
    """

    IS_BASE: bool = False

    def __init__(self, metadata: pd.DataFrame, K: int = 10) -> None:
        """Store a private copy of the user metadata.

        :param metadata: user metadata; ``get_user_similarity_matrix`` reads
            its ``userId`` and ``zipcode`` columns.
        :type metadata: pd.DataFrame
        :param K: number of neighbours to keep per item.
        :type K: int, optional
        :raises ValueError: if ``metadata`` is ``None``.
        """
        super().__init__(K)
        if metadata is None:
            raise ValueError("Metadata is required for ItemKNNIncrementalMovieLens100K")
        self.metadata = metadata.copy()

    def _predict(self, X: csr_matrix, predict_im: InteractionMatrix) -> csr_matrix:
        """Predict scores and pad unknown users from their most similar known user.

        :param X: interaction matrix; only its shape is read here.
        :type X: csr_matrix
        :param predict_im: interactions to predict for.
        :type predict_im: InteractionMatrix
        :return: score matrix with rows for unknown users filled in.
        :rtype: csr_matrix
        """
        # NOTE(review): the stored csr matrix is passed to the parent, but
        # ItemKNN._predict calls ``X.get_prediction_data()`` on its argument.
        # Confirm this path still works with the current parent signature.
        X_pred = super()._predict(self.X_)
        # ID indexing starts at 0, so max_id + 1 is the number of unique IDs
        max_user_id = predict_im.max_user_id + 1
        max_item_id = predict_im.max_item_id + 1
        intended_shape = (
            max(max_user_id, X.shape[0]),
            max(max_item_id, X.shape[1]),
        )

        predict_frame = predict_im._df

        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id, known_item_id = X_pred.shape
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with items")
        to_predict = predict_frame.value_counts("uid")

        # pad users with items from most similar user
        user_similarity_matrix = self.get_user_similarity_matrix()
        for user_id in to_predict.index:
            if user_id >= known_user_id:
                # Only already-known users are candidates for the donor row.
                most_similar_user_idx = np.argmax(user_similarity_matrix[user_id][:known_user_id])
                X_pred[user_id, :] = X_pred[most_similar_user_idx, :]

        logger.debug(f"Padding by {self.name} completed")
        return X_pred

    def get_user_similarity_matrix(self) -> np.ndarray:
        """Compute user-to-user cosine similarity from the stored metadata.

        Rows are ordered by ``userId``; ``zipcode`` is dropped and the
        object-typed columns are one-hot encoded before the similarity is
        computed. The diagonal is zeroed so a user is never their own match.

        :return: square similarity matrix.
        :rtype: np.ndarray
        """
        # Index by userId (unnamed) and order the rows by it. ``set_index``
        # returns a new frame, so the stored metadata is left untouched.
        user_metadata = self.metadata.set_index("userId").rename_axis(None).sort_index()

        # zipcode is a column that does not provide any useful information so we drop it
        user_metadata = user_metadata.drop(columns=["zipcode"])

        # obtain categorical columns
        categorical_columns = user_metadata.select_dtypes(include=["object"]).columns.tolist()

        # Use one-hot encoding to encode the categorical columns
        encoder = OneHotEncoder(sparse_output=False)
        one_hot_encoded = encoder.fit_transform(user_metadata[categorical_columns])

        # Build the encoded frame on the same index as the metadata, so the
        # column-wise concat below pairs rows one-to-one instead of aligning a
        # fresh 0..n-1 index against the userId index.
        one_hot_df = pd.DataFrame(
            one_hot_encoded,
            columns=encoder.get_feature_names_out(categorical_columns),
            index=user_metadata.index,
        )

        # Concatenate the one-hot encoded dataframe with the original dataframe
        # and drop the original categorical columns
        df_encoded = pd.concat([user_metadata, one_hot_df], axis=1)
        df_encoded = df_encoded.drop(categorical_columns, axis=1)

        # compute cosine similarity but exclude self-similarity
        user_similarity_matrix = cosine_similarity(df_encoded)
        np.fill_diagonal(user_similarity_matrix, 0)

        return user_similarity_matrix
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from .itemknn import ItemKNN
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ItemKNNRolling(ItemKNN):
    """ItemKNN variant whose memory covers only the latest window.

    The model keeps no interactions older than the window size: whatever was
    learned from earlier windows is discarded when a new window is fitted.
    No method is overridden here; the class only marks itself as a concrete
    (non-base) algorithm.
    """

    IS_BASE: bool = False
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Self
|
|
3
|
+
|
|
4
|
+
from recnexteval.matrix import InteractionMatrix
|
|
5
|
+
from .itemknn import ItemKNN
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ItemKNNStatic(ItemKNN):
    """ItemKNN variant that is trained a single time.

    The first call to `fit` trains the model. Every later call returns
    immediately and leaves the model untouched, so the training data stays
    frozen for the lifetime of the instance.
    """

    IS_BASE: bool = False

    def __init__(self, K: int = 10) -> None:
        """Create an unfitted model using ``K`` neighbours per item."""
        # Set the flag first, then delegate to the parent constructor.
        self._is_fitted = False
        super().__init__(K)

    def fit(self, X: InteractionMatrix) -> Self:
        """Fit on ``X`` the first time only; later calls do nothing.

        :param X: training interactions.
        :type X: InteractionMatrix
        :return: the model itself.
        :rtype: Self
        """
        if not self._is_fitted:
            super().fit(X)
            self._is_fitted = True
        return self
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# RecPack, An Experimentation Toolkit for Top-N Recommendation
|
|
2
|
+
# Copyright (C) 2020 Froomle N.V.
|
|
3
|
+
# License: GNU AGPLv3 - https://gitlab.com/recpack-maintainers/recpack/-/blob/master/LICENSE
|
|
4
|
+
# Author:
|
|
5
|
+
# Lien Michiels
|
|
6
|
+
# Robin Verachtert
|
|
7
|
+
|
|
8
|
+
from recnexteval.algorithms.time_aware_item_knn.base import TARSItemKNN
|
|
9
|
+
from recnexteval.algorithms.time_aware_item_knn.ding_2005 import TARSItemKNNDing
|
|
10
|
+
from recnexteval.algorithms.time_aware_item_knn.liu_2010 import TARSItemKNNLiu
|
|
11
|
+
from recnexteval.algorithms.time_aware_item_knn.vaz_2013 import TARSItemKNNVaz
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# RecPack, An Experimentation Toolkit for Top-N Recommendation
|
|
2
|
+
# Copyright (C) 2020 Froomle N.V.
|
|
3
|
+
# License: GNU AGPLv3 - https://gitlab.com/recpack-maintainers/recpack/-/blob/master/LICENSE
|
|
4
|
+
# Author:
|
|
5
|
+
# Lien Michiels
|
|
6
|
+
# Robin Verachtert
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from scipy.sparse import csr_matrix
|
|
12
|
+
|
|
13
|
+
from recnexteval.algorithms.base import TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin
|
|
14
|
+
from recnexteval.matrix import InteractionMatrix
|
|
15
|
+
from recnexteval.utils.util import add_rows_to_csr_matrix
|
|
16
|
+
from .decay_functions import (
|
|
17
|
+
ConcaveDecay,
|
|
18
|
+
ConvexDecay,
|
|
19
|
+
ExponentialDecay,
|
|
20
|
+
InverseDecay,
|
|
21
|
+
LinearDecay,
|
|
22
|
+
LogDecay,
|
|
23
|
+
NoDecay,
|
|
24
|
+
)
|
|
25
|
+
from .similarity_functions import (
|
|
26
|
+
compute_conditional_probability,
|
|
27
|
+
compute_cosine_similarity,
|
|
28
|
+
compute_pearson_similarity,
|
|
29
|
+
)
|
|
30
|
+
from .top_k import get_top_K_values
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
EPSILON = 1e-13
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TARSItemKNN(TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin):
    """Framework for time aware variants of the ItemKNN algorithm.

    This class was inspired by works from Liu, Nathan N., et al. (2010), Ding et al. (2005) and Lee et al. (2007).

    The framework for these approaches can be summarised as:

    - When training the user interaction matrix is weighted to take into account temporal information.
    - Similarities are computed on this weighted matrix, using various similarity measures.
    - When predicting the interactions are similarly weighted, giving more weight to more recent interactions.
    - Recommendation scores are obtained by multiplying the weighted interaction matrix with
      the previously computed similarity matrix.

    The similarity between items is based on their decayed interaction vectors:

    .. math::

        \\text{sim}(i,j) = s(\\Gamma(A_i), \\Gamma(A_j))

    Where :math:`s` is a similarity function (like ``cosine``),
    :math:`\\Gamma` a decay function (like ``exponential_decay``) and
    :math:`A_i` contains the distances to now from when the users interacted with item `i`,
    if they interacted with the item at all (else the value is 0).

    During computation, 'now' is considered as the maximal timestamp in the matrix + 1.
    As such the age is always a positive non-zero value.

    :param K: How many neighbours to use per item,
        make sure to pick a value below the number of columns of the matrix to fit on.
        Defaults to 200
    :type K: int, optional
    :param pad_with_popularity: Whether unknown users are padded with popular items
        (otherwise random items are used).
        Defaults to True.
    :type pad_with_popularity: bool, optional
    :param fit_decay: Defines the decay scaling used for decay during model fitting.
        Defaults to ``1 / (24 * 3600)`` (one day).
    :type fit_decay: float, optional
    :param predict_decay: Defines the decay scaling used for decay during prediction.
        Defaults to ``1 / (24 * 3600)`` (one day).
    :type predict_decay: float, optional
    :param decay_interval: Size of a single time unit in seconds.
        Allows more fine-grained parameters for large scale datasets where events are collected over months of data.
        Defaults to 1 (second).
    :type decay_interval: int, optional
    :param similarity: Which similarity measure to use. Defaults to ``"cosine"``.
        ``["cosine", "conditional_probability", "pearson"]`` are supported.
    :type similarity: str, optional
    :param decay_function: The decay function to use, defaults to ``"exponential"``.
        Supported values are ``["exponential", "log", "linear", "concave", "convex", "inverse"]``
    :type decay_function: str, optional
    :raises ValueError: if ``decay_interval`` is not a positive integer, or if
        ``similarity`` or ``decay_function`` is not supported.

    This code is adapted from RecPack :cite:`recpack`
    """

    SUPPORTED_SIMILARITIES = ["cosine", "conditional_probability", "pearson"]
    DECAY_FUNCTIONS = {
        "exponential": ExponentialDecay,
        "log": LogDecay,
        "linear": LinearDecay,
        "concave": ConcaveDecay,
        "convex": ConvexDecay,
        "inverse": InverseDecay,
    }

    def __init__(
        self,
        K: int = 200,
        pad_with_popularity: bool = True,
        fit_decay: float = 1 / (24 * 3600),
        predict_decay: float = 1 / (24 * 3600),
        decay_interval: int = 1,
        similarity: str = "cosine",
        decay_function: str = "exponential",
    ) -> None:
        # Uses other default parameters for ItemKNN
        super().__init__(K=K)
        # Accumulated training interactions; stays None until the first fit.
        self.training_data: InteractionMatrix | None = None
        self.pad_with_popularity = pad_with_popularity

        # isinstance also rejects float subclasses, unlike an exact type match.
        if decay_interval <= 0 or isinstance(decay_interval, float):
            raise ValueError("Parameter decay_interval needs to be a positive integer.")
        self.decay_interval = decay_interval

        if similarity not in self.SUPPORTED_SIMILARITIES:
            raise ValueError(f"Similarity {similarity} is not supported.")
        self.similarity = similarity

        if decay_function not in self.DECAY_FUNCTIONS:
            raise ValueError(f"Decay function {decay_function} is not supported.")
        self.decay_function = decay_function

        # Verify decay parameters. A decay of 0 means "disabled" and is never
        # validated; the "inverse" function takes no decay parameter.
        if self.decay_function in ["exponential", "log", "linear", "concave", "convex"]:
            if fit_decay != 0:
                self.DECAY_FUNCTIONS[decay_function].validate_decay(fit_decay)

            if predict_decay != 0:
                self.DECAY_FUNCTIONS[decay_function].validate_decay(predict_decay)

        self.fit_decay = fit_decay
        self.predict_decay = predict_decay

    def _get_decay_func(self, decay, max_value):
        """Build the decay callable for the configured decay function.

        :param decay: decay scaling; ``0`` disables decay.
        :param max_value: largest age, passed to the decay functions that
            take it as a second constructor argument.
        :return: a decay function instance.
        :raises ValueError: if ``self.decay_function`` is not a known name.
        """
        if decay == 0:
            return NoDecay()

        if self.decay_function == "inverse":
            return self.DECAY_FUNCTIONS[self.decay_function]()
        if self.decay_function in ["exponential", "convex"]:
            return self.DECAY_FUNCTIONS[self.decay_function](decay)
        if self.decay_function in ["log", "linear", "concave"]:
            return self.DECAY_FUNCTIONS[self.decay_function](decay, max_value)

        # Only reachable if the attribute was changed after construction;
        # fail loudly instead of returning None.
        raise ValueError(f"Decay function {self.decay_function} is not supported.")

    def _predict(self, X: csr_matrix, predict_im: InteractionMatrix) -> csr_matrix:
        """Predict scores for nonzero users in X.

        Scores are computed by matrix multiplication of weighted X
        with the stored similarity matrix. Users beyond the rows of the
        parent's prediction are padded with popular or random items.

        :param X: csr_matrix with interactions
        :type X: csr_matrix
        :param predict_im: interactions to predict for
        :type predict_im: InteractionMatrix
        :return: csr_matrix with scores
        :rtype: csr_matrix
        """
        # NOTE(review): assumes the parent ``_predict`` accepts the decayed
        # csr matrix as its only argument - TODO confirm against the base class.
        X_decay = self._add_decay_to_predict_matrix(self.training_data)
        X_pred = super()._predict(X_decay)

        # ID indexing starts at 0, so max_id + 1 is the number of unique IDs
        max_user_id = predict_im.max_user_id + 1
        max_item_id = predict_im.max_item_id + 1
        intended_shape = (
            max(max_user_id, X.shape[0]),
            max(max_item_id, X.shape[1]),
        )

        predict_frame = predict_im._df

        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id, known_item_id = X_pred.shape
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with items")
        # Number of prediction rows per user id.
        to_predict = predict_frame.value_counts("uid")

        if self.pad_with_popularity:
            popular_items = self.get_popularity_scores(super()._transform_fit_input(X))
            for user_id in to_predict.index:
                if user_id >= known_user_id:
                    X_pred[user_id, :] = popular_items
        else:
            row = []
            col = []
            # One random item per prediction row of each unknown user.
            for user_id in to_predict.index:
                if user_id >= known_user_id:
                    row += [user_id] * to_predict[user_id]
                    col += self.rand_gen.integers(0, known_item_id, to_predict[user_id]).tolist()
            pad = csr_matrix((np.ones(len(row)), (row, col)), shape=intended_shape)
            X_pred += pad

        logger.debug(f"Padding by {self.name} completed")
        return X_pred

    def _fit(self, X: InteractionMatrix) -> "TARSItemKNN":
        """Fit an item-to-item similarity matrix on all data seen so far.

        New interactions are merged into the accumulated training data, the
        result is weighted by age, and the configured similarity is computed
        and pruned to the top K values.

        :param X: new training interactions.
        :type X: InteractionMatrix
        :return: the fitted model.
        :rtype: TARSItemKNN
        """
        if self.training_data is None:
            self.training_data = X.copy()
        else:
            self.training_data = self.training_data.union(X)
        X = self.training_data.copy()

        X = self._add_decay_to_fit_matrix(X)

        # The similarity name was validated in the constructor.
        similarity_functions = {
            "cosine": compute_cosine_similarity,
            "conditional_probability": compute_conditional_probability,
            "pearson": compute_pearson_similarity,
        }
        item_similarities = similarity_functions[self.similarity](X)

        item_similarities = get_top_K_values(item_similarities, K=self.K)

        self.similarity_matrix_ = item_similarities

        return self

    def _add_decay_to_interaction_matrix(self, X: InteractionMatrix, decay: float) -> csr_matrix:
        """Weigh the interaction matrix based on age of the events.

        If decay is 0, it is assumed to be disabled, and so we just return binary matrix.

        :param X: Interaction matrix.
        :type X: InteractionMatrix
        :param decay: decay scaling passed to the decay function.
        :type decay: float
        :return: Weighted csr matrix.
        :rtype: csr_matrix
        """
        # NOTE(review): ``.data`` of this matrix is overwritten below; this
        # assumes the property returns a fresh matrix on each access - TODO confirm.
        timestamp_mat = X.latest_interaction_timestamps_matrix

        # Nothing to weigh: max() of an empty array would raise.
        if timestamp_mat.data.size == 0:
            return csr_matrix(timestamp_mat)

        # To get 'now', we add 1 to the maximal timestamp. This makes sure there are no vanishing zeroes.
        now = timestamp_mat.data.max() + 1
        ages = (now - timestamp_mat.data) / self.decay_interval
        timestamp_mat.data = self._get_decay_func(decay, ages.max())(ages)

        return csr_matrix(timestamp_mat)

    def _add_decay_to_fit_matrix(self, X: InteractionMatrix) -> csr_matrix:
        """Weigh ``X`` using the decay configured for fitting."""
        return self._add_decay_to_interaction_matrix(X, self.fit_decay)

    def _add_decay_to_predict_matrix(self, X: InteractionMatrix) -> csr_matrix:
        """Weigh ``X`` using the decay configured for prediction."""
        return self._add_decay_to_interaction_matrix(X, self.predict_decay)
|