recnexteval-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
recnexteval/algorithms/baseline/random.py
@@ -0,0 +1,39 @@
+ from typing import Self
+
+ import numpy as np
+ import pandas as pd
+ from scipy.sparse import csr_matrix
+
+ from ...matrix import PredictionMatrix
+ from ..base import TopKAlgorithm
+ from ..utils import get_top_K_values
+
+
+ class Random(TopKAlgorithm):
+     """Random recommendation for users.
+
+     The Random algorithm recommends K random items to all users in the predict frame.
+     """
+     IS_BASE: bool = False
+
+     def _fit(self, X: csr_matrix) -> Self:  # noqa: ARG002
+         self.fit_complete_ = True
+         return self
+
+     def _predict(self, X: PredictionMatrix) -> csr_matrix:
+         predict_ui_df = X.get_prediction_data()._df  # noqa: SLF001
+
+         known_item_id = X.max_known_item_id
+         intended_shape = (X.max_global_user_id, known_item_id)
+
+         to_predict = pd.Series(predict_ui_df.uid.unique())
+         to_predict = to_predict.sort_values(ignore_index=True)
+         row = to_predict.values.repeat(self.K)
+         total_items_to_predict = len(row)
+         col = self.rand_gen.integers(0, known_item_id, total_items_to_predict)
+         scores = csr_matrix((np.ones(total_items_to_predict), (row, col)), shape=intended_shape)
+
+         # Get the top K of allowed items per user
+         X_pred = get_top_K_values(scores, K=self.K)
+         X_pred = X_pred[predict_ui_df["uid"].values]
+         return X_pred
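For orientation, the `get_top_K_values` helper used above is defined in recnexteval/algorithms/utils.py, which this diff lists but does not show inline. It plausibly reduces to a per-row top-K filter over a sparse score matrix; the following is a minimal sketch under that assumption, and `top_k_per_row` is a hypothetical stand-in, not the packaged implementation:

    import numpy as np
    from scipy.sparse import csr_matrix, lil_matrix

    def top_k_per_row(scores: csr_matrix, K: int) -> csr_matrix:
        # Hypothetical stand-in for get_top_K_values: keep only the K largest
        # stored values in every row of a sparse score matrix.
        out = lil_matrix(scores.shape)
        for u in range(scores.shape[0]):
            row = scores.getrow(u)
            if row.nnz == 0:
                continue
            top = np.argsort(row.data)[-K:]  # positions of the K largest stored values
            out[u, row.indices[top]] = row.data[top]
        return out.tocsr()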
recnexteval/algorithms/baseline/recent_popularity.py
@@ -0,0 +1,34 @@
+
+ import logging
+ from typing import Self
+
+ import numpy as np
+ from scipy.sparse import csr_matrix
+
+ from ...matrix import PredictionMatrix
+ from ..base import PopularityPaddingMixin, TopKAlgorithm
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class RecentPopularity(TopKAlgorithm, PopularityPaddingMixin):
+     """A popularity-based algorithm which considers only the popularity of the latest training data."""
+
+     IS_BASE: bool = False
+
+     def _fit(self, X: csr_matrix) -> Self:
+         self.sorted_scores_ = self.get_popularity_scores(X)
+         return self
+
+     def _predict(self, X: PredictionMatrix) -> csr_matrix:
+         """
+         Predict the K most popular items for each user using only data from the latest window.
+         """
+         intended_shape = (X.get_prediction_data().num_interactions, X.user_item_shape[1])
+
+         # Vectorized: repeat the sorted scores for each prediction row
+         data = np.tile(self.sorted_scores_, (intended_shape[0], 1))
+         X_pred = csr_matrix(data)
+
+         return X_pred
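`get_popularity_scores` comes from `PopularityPaddingMixin`, which is not part of this diff; a plausible reading is a per-item interaction count, normalised over the training window. A minimal sketch under that assumption (`popularity_scores` is a hypothetical stand-in):

    import numpy as np
    from scipy.sparse import csr_matrix

    def popularity_scores(X: csr_matrix) -> np.ndarray:
        # Hypothetical stand-in for get_popularity_scores: score each item by
        # its share of the interactions in the (binary) user x item matrix X.
        counts = np.asarray(X.sum(axis=0)).ravel()  # interactions per item
        total = counts.sum()
        return counts / total if total > 0 else counts

    X = csr_matrix(np.array([[1, 0, 1], [1, 1, 0], [1, 0, 0]]))
    print(popularity_scores(X))  # [0.6 0.2 0.2]

Note that `_predict` materialises a dense (num_interactions x num_items) array via np.tile before converting back to a csr_matrix, which can be memory-hungry for large prediction frames.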
recnexteval/algorithms/itemknn/__init__.py
@@ -0,0 +1,14 @@
+ from .itemknn import ItemKNN
+ from .itemknn_incremental import ItemKNNIncremental
+ from .itemknn_incremental_movielens import ItemKNNIncrementalMovieLens100K
+ from .itemknn_rolling import ItemKNNRolling
+ from .itemknn_static import ItemKNNStatic
+
+
+ __all__ = [
+     "ItemKNN",
+     "ItemKNNIncremental",
+     "ItemKNNIncrementalMovieLens100K",
+     "ItemKNNRolling",
+     "ItemKNNStatic",
+ ]
recnexteval/algorithms/itemknn/itemknn.py
@@ -0,0 +1,119 @@
+ import logging
+ from typing import Self
+
+ from scipy.sparse import csr_matrix
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ from recnexteval.matrix import ItemUserBasedEnum, PredictionMatrix
+ from ..base import PopularityPaddingMixin, TopKItemSimilarityMatrixAlgorithm
+ from ..utils import get_top_K_values
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def compute_cosine_similarity(X: csr_matrix) -> csr_matrix:
+     """Compute the cosine similarity between the items in the matrix.
+
+     Self similarity is removed.
+
+     :param X: user x item matrix with scores per user, item pair.
+     :type X: csr_matrix
+     :return: similarity matrix
+     :rtype: csr_matrix
+     """
+     # X.T, otherwise we would be doing a user KNN
+     item_cosine_similarities = cosine_similarity(X.T, dense_output=False)
+     if not isinstance(item_cosine_similarities, csr_matrix):
+         item_cosine_similarities = csr_matrix(item_cosine_similarities)
+     # Set diagonal to 0, because we don't want to support self similarity
+     item_cosine_similarities.setdiag(0)
+
+     return item_cosine_similarities
+
+
+ class ItemKNN(TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin):
+     """Item K Nearest Neighbours model.
+
+     First described in 'Item-based top-n recommendation algorithms.' :cite:`10.1145/963770.963776`
+
+     This code is adapted from RecPack :cite:`recpack`
+
+     For each item the K most similar items are computed during fit.
+     The similarity parameter decides how the similarity between two items is computed.
+
+     The cosine similarity between items i and j is computed as
+
+     .. math::
+         sim(i,j) = \\frac{X_i X_j}{||X_i||_2 ||X_j||_2}
+
+     :param K: How many neighbours to use per item;
+         make sure to pick a value below the number of columns of the matrix to fit on.
+         Defaults to 200.
+     :type K: int, optional
+     """
+
+     ITEM_USER_BASED = ItemUserBasedEnum.ITEM
+
+     def _fit(self, X: csr_matrix) -> Self:
+         """Fit a cosine similarity matrix from item to item.
+         We assume that X is a binary matrix of shape (n_users, n_items).
+         """
+         item_similarities = compute_cosine_similarity(X)
+         item_similarities = get_top_K_values(item_similarities, K=self.K)
+
+         self.similarity_matrix_ = item_similarities
+         self.X_ = X.copy()
+         return self
+
+     def _predict(self, X: PredictionMatrix) -> csr_matrix:
+         predict_ui_df = X.get_prediction_data()._df  # noqa: SLF001
+
+         # collect the known user ids (rows of the fitted matrix) that need predictions
+         uid_to_predict = predict_ui_df[predict_ui_df.uid < self.X_.shape[0]].uid.unique()
+         uid_to_predict = sorted(uid_to_predict.tolist())
+
+         # features: csr_matrix = self.X_[uid_to_predict]
+         # we try without any filtering on the feature matrix
+         features: csr_matrix = self.X_
+         scores = features @ self.similarity_matrix_
+
+         if not isinstance(scores, csr_matrix):
+             scores = csr_matrix(scores)
+
+         intended_shape = (X.max_global_user_id, X.max_global_item_id)
+
+         if scores.shape == intended_shape:
+             return scores
+
+         # there are 2 cases where the shape is different:
+         # 1. the algorithm did not predict for unknown users, causing a shortage of rows
+         # 2. the algorithm is not aware of unknown items, causing a shortage of columns
+
+         # handle case 2: pad the missing item columns
+         if scores.shape[1] < intended_shape[1]:
+             scores = self._pad_unknown_iid_with_none_strategy(
+                 y_pred=scores,
+                 current_shape=scores.shape,
+                 intended_shape=intended_shape,
+             )
+
+         # handle case 1: pad the missing user rows
+         if self.pad_with_popularity:
+             scores = self._pad_uknown_uid_with_popularity_strategy(
+                 X_pred=scores,
+                 intended_shape=intended_shape,
+                 predict_ui_df=predict_ui_df,
+             )
+         else:
+             # current_shape = (X.max_known_user_id, X.max_known_item_id)
+             scores = self._pad_unknown_uid_with_random_strategy(
+                 X_pred=scores,
+                 current_shape=scores.shape,
+                 # current_shape=current_shape,
+                 intended_shape=intended_shape,
+                 predict_ui_df=predict_ui_df,
+             )
+
+         pred = scores[predict_ui_df["uid"].values]
+         return pred
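The fit/predict core above reduces to two sparse products: an item-item cosine similarity with the diagonal zeroed, then a multiplication of the user histories with that similarity matrix. A small self-contained illustration of that core, leaving out the padding and top-K pruning:

    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.metrics.pairwise import cosine_similarity

    # 4 users x 3 items, binary interaction history
    X = csr_matrix(np.array([
        [1, 1, 0],
        [1, 0, 1],
        [0, 1, 0],
        [1, 1, 1],
    ]))

    sims = cosine_similarity(X.T, dense_output=False)  # item-item similarities
    sims.setdiag(0)  # no self-similarity, as in compute_cosine_similarity
    scores = X @ sims  # user x item recommendation scores

Here `sims[0, 1]` works out to 2/3, since items 0 and 1 share two of their three users. Note that, as in the packaged `_predict`, items a user has already interacted with are not filtered out of the score matrix.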
recnexteval/algorithms/itemknn/itemknn_incremental.py
@@ -0,0 +1,65 @@
+ import logging
+ from typing import Self
+
+ from scipy.sparse import csr_matrix, hstack, vstack
+
+ from ..base import PopularityPaddingMixin, TopKItemSimilarityMatrixAlgorithm
+ from .itemknn import ItemKNN
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ItemKNNIncremental(ItemKNN):
+     """Incremental version of the ItemKNN algorithm.
+
+     This class extends the ItemKNN algorithm to allow for incremental updates
+     to the model: each new interaction matrix is appended to the stored
+     historical data before the model is refit.
+     """
+
+     IS_BASE: bool = False
+
+     def __init__(self, K: int = 10, pad_with_popularity: bool = True) -> None:
+         PopularityPaddingMixin.__init__(self, pad_with_popularity=pad_with_popularity)
+         TopKItemSimilarityMatrixAlgorithm.__init__(self, K=K)
+         self.X_: None | csr_matrix = None
+
+     def _append_training_data(self, X: csr_matrix) -> None:
+         """Append a new interaction matrix to the historical data.
+
+         Args:
+             X (csr_matrix): Interaction matrix to append
+         """
+         if self.X_ is None:
+             raise ValueError("No existing training data to append to.")
+         X_prev: csr_matrix = self.X_.copy()
+         new_num_rows = max(X_prev.shape[0], X.shape[0])
+         new_num_cols = max(X_prev.shape[1], X.shape[1])
+         # Pad the previous matrix
+         if X_prev.shape[0] < new_num_rows:  # Pad rows
+             row_padding = csr_matrix((new_num_rows - X_prev.shape[0], X_prev.shape[1]))
+             X_prev = vstack([X_prev, row_padding], format="csr")
+         if X_prev.shape[1] < new_num_cols:  # Pad columns
+             col_padding = csr_matrix((X_prev.shape[0], new_num_cols - X_prev.shape[1]))
+             X_prev = hstack([X_prev, col_padding], format="csr")
+
+         # Pad the current matrix
+         if X.shape[0] < new_num_rows:  # Pad rows
+             row_padding = csr_matrix((new_num_rows - X.shape[0], X.shape[1]))
+             X = vstack([X, row_padding], format="csr")
+         if X.shape[1] < new_num_cols:  # Pad columns
+             col_padding = csr_matrix((X.shape[0], new_num_cols - X.shape[1]))
+             X = hstack([X, col_padding], format="csr")
+
+         # Merge data
+         self.X_ = X_prev + X
+
+     def _fit(self, X: csr_matrix) -> Self:
+         """Fit a cosine similarity matrix from item to item."""
+         if self.X_ is not None:
+             self._append_training_data(X)
+             super()._fit(self.X_)
+         else:
+             super()._fit(X)
+         return self
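The shape-alignment logic in `_append_training_data` generalises to any pair of sparse matrices: grow both to the elementwise-maximum shape with empty rows and columns, then add. A compact sketch of the same idea (`pad_to` is a hypothetical helper, not part of the package):

    import numpy as np
    from scipy.sparse import csr_matrix, hstack, vstack

    def pad_to(M: csr_matrix, shape: tuple[int, int]) -> csr_matrix:
        # Grow a sparse matrix to `shape` by appending empty rows/columns.
        if M.shape[0] < shape[0]:
            M = vstack([M, csr_matrix((shape[0] - M.shape[0], M.shape[1]))], format="csr")
        if M.shape[1] < shape[1]:
            M = hstack([M, csr_matrix((M.shape[0], shape[1] - M.shape[1]))], format="csr")
        return M

    old = csr_matrix(np.ones((2, 2)))  # interactions seen so far
    new = csr_matrix(np.ones((3, 4)))  # a later window with extra users and items
    shape = (max(old.shape[0], new.shape[0]), max(old.shape[1], new.shape[1]))
    merged = pad_to(old, shape) + pad_to(new, shape)  # counts accumulate entrywise

Run on the toy matrices above, `merged` is a 3 x 4 matrix whose top-left 2 x 2 block holds 2s and whose remaining cells hold 1s.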
recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py
@@ -0,0 +1,95 @@
+ import logging
+
+ import numpy as np
+ import pandas as pd
+ from scipy.sparse import csr_matrix
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.preprocessing import OneHotEncoder
+
+ from ...matrix import InteractionMatrix
+ from ...utils import add_rows_to_csr_matrix
+ from .itemknn_incremental import ItemKNNIncremental
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ItemKNNIncrementalMovieLens100K(ItemKNNIncremental):
+     """Incremental version of the ItemKNN algorithm with MovieLens100K metadata.
+
+     This class extends the ItemKNN algorithm to allow for incremental updates
+     to the model: each new batch of interactions is appended to the historical
+     data before the model is refit.
+     """
+     IS_BASE: bool = False
+
+     def __init__(self, metadata: pd.DataFrame, K: int = 10) -> None:
+         super().__init__(K)
+         if metadata is None:
+             raise ValueError("Metadata is required for ItemKNNIncrementalMovieLens100K")
+         self.metadata = metadata.copy()
+
+     def _predict(self, X: csr_matrix, predict_im: InteractionMatrix) -> csr_matrix:
+         """Predict the K most similar items for each item using the latest data."""
+         X_pred = super()._predict(self.X_)
+         # ID indexing starts at 0, so max_id + 1 is the number of unique IDs
+         max_user_id = predict_im.max_user_id + 1
+         max_item_id = predict_im.max_item_id + 1
+         intended_shape = (
+             max(max_user_id, X.shape[0]),
+             max(max_item_id, X.shape[1]),
+         )
+
+         predict_frame = predict_im._df
+
+         if X_pred.shape == intended_shape:
+             return X_pred
+
+         known_user_id, known_item_id = X_pred.shape
+         X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
+         logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with items")
+         to_predict = predict_frame.value_counts("uid")
+
+         # pad unknown users with the items of their most similar known user
+         user_similarity_matrix = self.get_user_similarity_matrix()
+         for user_id in to_predict.index:
+             if user_id >= known_user_id:
+                 most_similar_user_idx = np.argmax(user_similarity_matrix[user_id][:known_user_id])
+                 X_pred[user_id, :] = X_pred[most_similar_user_idx, :]
+
+         logger.debug(f"Padding by {self.name} completed")
+         return X_pred
+
+     def get_user_similarity_matrix(self):
+         user_metadata = self.metadata.copy()
+
+         # set userId as index
+         user_metadata.set_index("userId", inplace=True)
+         user_metadata.index.name = None
+
+         # reorder the indices
+         user_metadata.reset_index(drop=True)
+         user_metadata.sort_index(inplace=True)
+
+         # zipcode is a column that does not provide any useful information, so we drop it
+         user_metadata = user_metadata.drop(columns=["zipcode"])
+
+         # obtain the categorical columns
+         categorical_columns = user_metadata.select_dtypes(include=["object"]).columns.tolist()
+
+         # Use one-hot encoding to encode the categorical columns
+         encoder = OneHotEncoder(sparse_output=False)
+         one_hot_encoded = encoder.fit_transform(user_metadata[categorical_columns])
+
+         # obtain the column names for the encoded data
+         one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
+
+         # Concatenate the one-hot encoded dataframe with the original dataframe and drop the original categorical columns
+         df_encoded = pd.concat([user_metadata, one_hot_df], axis=1)
+         df_encoded = df_encoded.drop(categorical_columns, axis=1)
+
+         # compute cosine similarity but exclude self-similarity
+         user_similarity_matrix = cosine_similarity(df_encoded)
+         np.fill_diagonal(user_similarity_matrix, 0)
+
+         return user_similarity_matrix
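`get_user_similarity_matrix` turns the demographic columns of the MovieLens-100K user table into one-hot features and measures users by cosine similarity. The same recipe on a three-row toy frame:

    import numpy as np
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.preprocessing import OneHotEncoder

    # toy stand-in for the MovieLens-100K user table
    users = pd.DataFrame({
        "age": [24, 53, 23],
        "gender": ["M", "F", "M"],
        "occupation": ["technician", "other", "writer"],
    })

    cat_cols = users.select_dtypes(include=["object"]).columns.tolist()
    encoder = OneHotEncoder(sparse_output=False)
    encoded = pd.DataFrame(
        encoder.fit_transform(users[cat_cols]),
        columns=encoder.get_feature_names_out(cat_cols),
    )
    features = pd.concat([users.drop(columns=cat_cols), encoded], axis=1)

    sims = cosine_similarity(features)
    np.fill_diagonal(sims, 0)  # exclude self-similarity, as in the packaged code
    nearest = sims.argmax(axis=1)  # most similar other user, used to pad unknown users

One caveat worth noting: the numeric `age` column enters the cosine computation unscaled, so it dominates the one-hot features; the packaged implementation has the same property.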
recnexteval/algorithms/itemknn/itemknn_rolling.py
@@ -0,0 +1,17 @@
+ import logging
+
+ from .itemknn import ItemKNN
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ItemKNNRolling(ItemKNN):
+     """Rolling version of the ItemKNN algorithm.
+
+     This class extends the ItemKNN algorithm to keep only the last window of
+     interactions in the model's memory: all interactions older than the
+     window size are simply discarded.
+     """
+
+     IS_BASE: bool = False
recnexteval/algorithms/itemknn/itemknn_static.py
@@ -0,0 +1,31 @@
+ import logging
+ from typing import Self
+
+ from recnexteval.matrix import InteractionMatrix
+ from .itemknn import ItemKNN
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ItemKNNStatic(ItemKNN):
+     """Static version of the ItemKNN algorithm.
+
+     This class extends the ItemKNN algorithm so that the model is fit only
+     once: the first call to `fit` trains the model, and subsequent calls are
+     ignored, keeping the training data static.
+     """
+
+     IS_BASE: bool = False
+
+     def __init__(self, K: int = 10) -> None:
+         self._is_fitted = False
+         super().__init__(K)
+
+     def fit(self, X: InteractionMatrix) -> Self:
+         if self._is_fitted:
+             return self
+
+         super().fit(X)
+         self._is_fitted = True
+         return self
recnexteval/algorithms/time_aware_item_knn/__init__.py
@@ -0,0 +1,11 @@
+ # RecPack, An Experimentation Toolkit for Top-N Recommendation
+ # Copyright (C) 2020 Froomle N.V.
+ # License: GNU AGPLv3 - https://gitlab.com/recpack-maintainers/recpack/-/blob/master/LICENSE
+ # Author:
+ #   Lien Michiels
+ #   Robin Verachtert
+
+ from recnexteval.algorithms.time_aware_item_knn.base import TARSItemKNN
+ from recnexteval.algorithms.time_aware_item_knn.ding_2005 import TARSItemKNNDing
+ from recnexteval.algorithms.time_aware_item_knn.liu_2010 import TARSItemKNNLiu
+ from recnexteval.algorithms.time_aware_item_knn.vaz_2013 import TARSItemKNNVaz
recnexteval/algorithms/time_aware_item_knn/base.py
@@ -0,0 +1,248 @@
+ # RecPack, An Experimentation Toolkit for Top-N Recommendation
+ # Copyright (C) 2020 Froomle N.V.
+ # License: GNU AGPLv3 - https://gitlab.com/recpack-maintainers/recpack/-/blob/master/LICENSE
+ # Author:
+ #   Lien Michiels
+ #   Robin Verachtert
+
+ import logging
+
+ import numpy as np
+ from scipy.sparse import csr_matrix
+
+ from recnexteval.algorithms.base import TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin
+ from recnexteval.matrix import InteractionMatrix
+ from recnexteval.utils.util import add_rows_to_csr_matrix
+ from .decay_functions import (
+     ConcaveDecay,
+     ConvexDecay,
+     ExponentialDecay,
+     InverseDecay,
+     LinearDecay,
+     LogDecay,
+     NoDecay,
+ )
+ from .similarity_functions import (
+     compute_conditional_probability,
+     compute_cosine_similarity,
+     compute_pearson_similarity,
+ )
+ from .top_k import get_top_K_values
+
+
+ EPSILON = 1e-13
+
+ logger = logging.getLogger(__name__)
+
+
+ class TARSItemKNN(TopKItemSimilarityMatrixAlgorithm, PopularityPaddingMixin):
+     """Framework for time aware variants of the ItemKNN algorithm.
+
+     This class was inspired by works from Liu, Nathan N., et al. (2010), Ding et al. (2005) and Lee et al. (2007).
+
+     The framework for these approaches can be summarised as:
+
+     - When training, the user interaction matrix is weighted to take temporal information into account.
+     - Similarities are computed on this weighted matrix, using various similarity measures.
+     - When predicting, the interactions are weighted in the same way, giving more weight to more recent interactions.
+     - Recommendation scores are obtained by multiplying the weighted interaction matrix with
+       the previously computed similarity matrix.
+
+     The similarity between items is based on their decayed interaction vectors:
+
+     .. math::
+
+         \\text{sim}(i,j) = s(\\Gamma(A_i), \\Gamma(A_j))
+
+     Where :math:`s` is a similarity function (like ``cosine``),
+     :math:`\\Gamma` a decay function (like ``exponential_decay``) and
+     :math:`A_i` contains the distances to now from when the users interacted with item `i`,
+     if they interacted with the item at all (else the value is 0).
+
+     During computation, 'now' is taken to be the maximal timestamp in the matrix + 1,
+     so the age is always a positive, non-zero value.
+
+     :param K: How many neighbours to use per item;
+         make sure to pick a value below the number of columns of the matrix to fit on.
+         Defaults to 200.
+     :type K: int, optional
+     :param pad_with_popularity: Whether to pad predictions for unknown users with recent popularity scores.
+         Defaults to True.
+     :type pad_with_popularity: bool, optional
+     :param fit_decay: Defines the decay scaling used during model fitting.
+         Defaults to ``1 / (24 * 3600)`` (one day).
+     :type fit_decay: float, optional
+     :param predict_decay: Defines the decay scaling used during prediction.
+         Defaults to ``1 / (24 * 3600)`` (one day).
+     :type predict_decay: float, optional
+     :param decay_interval: Size of a single time unit in seconds.
+         Allows more fine-grained parameters for large scale datasets where events are collected over months of data.
+         Defaults to 1 (second).
+     :type decay_interval: int, optional
+     :param similarity: Which similarity measure to use. Defaults to ``"cosine"``.
+         ``["cosine", "conditional_probability", "pearson"]`` are supported.
+     :type similarity: str, optional
+     :param decay_function: The decay function to use, defaults to ``"exponential"``.
+         Supported values are ``["exponential", "log", "linear", "concave", "convex", "inverse"]``.
+
+     This code is adapted from RecPack :cite:`recpack`
+     """
+
+     SUPPORTED_SIMILARITIES = ["cosine", "conditional_probability", "pearson"]
+     DECAY_FUNCTIONS = {
+         "exponential": ExponentialDecay,
+         "log": LogDecay,
+         "linear": LinearDecay,
+         "concave": ConcaveDecay,
+         "convex": ConvexDecay,
+         "inverse": InverseDecay,
+     }
+
+     def __init__(
+         self,
+         K: int = 200,
+         pad_with_popularity: bool = True,
+         fit_decay: float = 1 / (24 * 3600),
+         predict_decay: float = 1 / (24 * 3600),
+         decay_interval: int = 1,
+         similarity: str = "cosine",
+         decay_function: str = "exponential",
+     ):
+         # Uses different default parameters than ItemKNN
+         super().__init__(K=K)
+         self.training_data: InteractionMatrix | None = None
+         self.pad_with_popularity = pad_with_popularity
+
+         if not isinstance(decay_interval, int) or decay_interval <= 0:
+             raise ValueError("Parameter decay_interval needs to be a positive integer.")
+
+         self.decay_interval = decay_interval
+
+         if similarity not in self.SUPPORTED_SIMILARITIES:
+             raise ValueError(f"Similarity {similarity} is not supported.")
+         self.similarity = similarity
+
+         if decay_function not in self.DECAY_FUNCTIONS:
+             raise ValueError(f"Decay function {decay_function} is not supported.")
+
+         self.decay_function = decay_function
+
+         # Verify decay parameters
+         if self.decay_function in ["exponential", "log", "linear", "concave", "convex"]:
+             if fit_decay != 0:
+                 self.DECAY_FUNCTIONS[decay_function].validate_decay(fit_decay)
+
+             if predict_decay != 0:
+                 self.DECAY_FUNCTIONS[decay_function].validate_decay(predict_decay)
+
+         self.fit_decay = fit_decay
+         self.predict_decay = predict_decay
+         self.decay_function = decay_function
+
+     def _get_decay_func(self, decay, max_value):
+         if decay == 0:
+             return NoDecay()
+
+         elif self.decay_function == "inverse":
+             return self.DECAY_FUNCTIONS[self.decay_function]()
+         elif self.decay_function in ["exponential", "convex"]:
+             return self.DECAY_FUNCTIONS[self.decay_function](decay)
+         elif self.decay_function in ["log", "linear", "concave"]:
+             return self.DECAY_FUNCTIONS[self.decay_function](decay, max_value)
+
+     def _predict(self, X: csr_matrix, predict_im: InteractionMatrix) -> csr_matrix:
+         """Predict scores for nonzero users in X.
+
+         Scores are computed by matrix multiplication of the weighted X
+         with the stored similarity matrix.
+
+         :param X: csr_matrix with interactions
+         :type X: csr_matrix
+         :return: csr_matrix with scores
+         :rtype: csr_matrix
+         """
+         X_decay = self._add_decay_to_predict_matrix(self.training_data)
+         X_pred = super()._predict(X_decay)
+
+         # ID indexing starts at 0, so max_id + 1 is the number of unique IDs
+         max_user_id = predict_im.max_user_id + 1
+         max_item_id = predict_im.max_item_id + 1
+         intended_shape = (
+             max(max_user_id, X.shape[0]),
+             max(max_item_id, X.shape[1]),
+         )
+
+         predict_frame = predict_im._df
+
+         if X_pred.shape == intended_shape:
+             return X_pred
+
+         known_user_id, known_item_id = X_pred.shape
+         X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
+         logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with items")
+         to_predict = predict_frame.value_counts("uid")
+
+         if self.pad_with_popularity:
+             popular_items = self.get_popularity_scores(super()._transform_fit_input(X))
+             for user_id in to_predict.index:
+                 if user_id >= known_user_id:
+                     X_pred[user_id, :] = popular_items
+         else:
+             row = []
+             col = []
+             for user_id in to_predict.index:
+                 if user_id >= known_user_id:
+                     row += [user_id] * to_predict[user_id]
+                     col += self.rand_gen.integers(0, known_item_id, to_predict[user_id]).tolist()
+             pad = csr_matrix((np.ones(len(row)), (row, col)), shape=intended_shape)
+             X_pred += pad
+
+         logger.debug(f"Padding by {self.name} completed")
+         return X_pred
+
+     def _fit(self, X: csr_matrix) -> "TARSItemKNN":
+         """Fit a cosine similarity matrix from item to item."""
+
+         if self.training_data is None:
+             self.training_data = X.copy()
+         else:
+             self.training_data = self.training_data.union(X)
+         X = self.training_data.copy()
+
+         X = self._add_decay_to_fit_matrix(X)
+         if self.similarity == "cosine":
+             item_similarities = compute_cosine_similarity(X)
+         elif self.similarity == "conditional_probability":
+             item_similarities = compute_conditional_probability(X)
+         elif self.similarity == "pearson":
+             item_similarities = compute_pearson_similarity(X)
+
+         item_similarities = get_top_K_values(item_similarities, K=self.K)
+
+         self.similarity_matrix_ = item_similarities
+
+         return self
+
+     def _add_decay_to_interaction_matrix(self, X: InteractionMatrix, decay: float) -> csr_matrix:
+         """Weigh the interaction matrix based on the age of the events.
+
+         If decay is 0, it is assumed to be disabled, and a binary matrix is returned.
+         :param X: Interaction matrix.
+         :type X: InteractionMatrix
+         :return: Weighted csr matrix.
+         :rtype: csr_matrix
+         """
+         timestamp_mat = X.latest_interaction_timestamps_matrix
+
+         # To get 'now', we add 1 to the maximal timestamp. This makes sure there are no vanishing zeroes.
+         now = timestamp_mat.data.max() + 1
+         ages = (now - timestamp_mat.data) / self.decay_interval
+         timestamp_mat.data = self._get_decay_func(decay, ages.max())(ages)
+
+         return csr_matrix(timestamp_mat)
+
+     def _add_decay_to_fit_matrix(self, X: InteractionMatrix) -> csr_matrix:
+         return self._add_decay_to_interaction_matrix(X, self.fit_decay)
+
+     def _add_decay_to_predict_matrix(self, X: InteractionMatrix) -> csr_matrix:
+         return self._add_decay_to_interaction_matrix(X, self.predict_decay)
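The decay machinery in `_add_decay_to_interaction_matrix` is easiest to see on a concrete matrix. Below is a minimal sketch with the default exponential decay, assuming the conventional form exp(-decay * age) for `ExponentialDecay`; the actual class lives in decay_functions.py, which this diff lists but does not show inline:

    import numpy as np
    from scipy.sparse import csr_matrix

    fit_decay = 1 / (24 * 3600)  # default: one decay unit per day
    decay_interval = 1           # one time unit = 1 second

    # toy user x item matrix of latest interaction timestamps (0 = no interaction)
    timestamps = csr_matrix(np.array([
        [0.0, 100_000.0, 200_000.0],
        [50_000.0, 0.0, 250_000.0],
    ]))

    now = timestamps.data.max() + 1  # +1 keeps the newest event's age non-zero
    ages = (now - timestamps.data) / decay_interval

    weighted = timestamps.copy()
    weighted.data = np.exp(-fit_decay * ages)  # recent events keep a weight close to 1

Under these settings the newest event (t = 250_000) keeps a weight of nearly 1, the event at t = 200_000 (about half a day older) drops to roughly exp(-0.58) ≈ 0.56, and the oldest event at t = 50_000 (about 2.3 days old) falls to roughly 0.1.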