recnexteval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,20 @@
1
+ """
2
+ RecNextEval
3
+ -----------
4
+
5
+ RecNextEval is a Python package toolkit developed for evaluation of recommendation
6
+ systems in different settings. Mainly the toolkit is developed to evaluate
7
+ in a sliding window setting.
8
+ """
9
+
10
+ import logging
11
+
12
+ from recnexteval.utils import prepare_logger
13
+
14
+
15
+ LOGGING_CONFIG_FILENAME = "logging_config.yaml"
16
+
17
+ prepare_logger(LOGGING_CONFIG_FILENAME)
18
+
19
+ logger = logging.getLogger(__name__)
20
+ logger.info("recnexteval package loaded.")
@@ -0,0 +1,99 @@
1
+ """
2
+ .. currentmodule:: recnexteval.algorithms
3
+
4
+ The algorithms module in recnexteval contains a collection of baseline algorithms
5
+ and several variants of the item-based KNN collaborative filtering algorithm. A total of
6
+ 3 variations of the item-based KNN algorithm are implemented in the module, which
7
+ are listed below.
8
+
9
+ Algorithm
10
+ ---------
11
+ Base class for all algorithms. Programmer should inherit from this class when
12
+ implementing a new algorithm. It provides a common interface for all algorithms
13
+ such that the expected methods and properties are defined to avoid any runtime
14
+ errors.
15
+
16
+ .. autosummary::
17
+ :toctree: generated/
18
+
19
+ Algorithm
20
+
21
+ Baseline Algorithms
22
+ -------------------
23
+
24
+ The baseline algorithms are simple algorithms that can be used as a reference
25
+ point to compare the performance of the more complex algorithms. The following
26
+ baseline algorithms are implemented in the module.
27
+
28
+ .. autosummary::
29
+ :toctree: generated/
30
+
31
+ Random
32
+ RecentPopularity
33
+ DecayPopularity
34
+ MostPopular
35
+
36
+ Item Similarity Algorithms
37
+ ----------------------------
38
+
39
+ Item similarity algorithms exploit relationships between items to make recommendations.
40
+ At prediction time, the user is represented by the items they have interacted
41
+ with. 3 variations of the item-based KNN algorithm are implemented in the module.
42
+ Each variation is to showcase the difference in the learning and prediction of
43
+ the algorithm. We note that no one algorithm is better than the other, and it
44
+ greatly depends on the dataset and parameters used in the algorithm which would
45
+ yield the best performance.
46
+
47
+ .. autosummary::
48
+ :toctree: generated/
49
+
50
+ ItemKNN
51
+ ItemKNNIncremental
52
+ ItemKNNIncrementalMovieLens100K
53
+ ItemKNNRolling
54
+ ItemKNNStatic
55
+
56
+ Time-Aware Item Similarity Algorithms
57
+ -------------------------------------
58
+
59
+ This is an extension of the item-based KNN algorithm.
60
+ The time-aware item-based KNN algorithms apply a decay factor to the timestamps of interactions.
61
+ This gives heavier weight to interactions that are more recent.
62
+
63
+ .. autosummary::
64
+ :toctree: generated/
65
+
66
+ TARSItemKNN
67
+ TARSItemKNNLiu
68
+ TARSItemKNNVaz
69
+ TARSItemKNNDing
70
+ """
71
+
72
+ from .base import Algorithm
73
+ from .baseline import MostPopular, Random, RecentPopularity
74
+ from .baseline.decay_popularity import DecayPopularity
75
+ from .itemknn import ItemKNN, ItemKNNIncremental, ItemKNNIncrementalMovieLens100K, ItemKNNRolling, ItemKNNStatic
76
+ from .time_aware_item_knn import (
77
+ TARSItemKNN,
78
+ TARSItemKNNDing,
79
+ TARSItemKNNLiu,
80
+ TARSItemKNNVaz,
81
+ )
82
+
83
+
84
+ __all__ = [
85
+ "Algorithm",
86
+ "DecayPopularity",
87
+ "ItemKNN",
88
+ "ItemKNNIncremental",
89
+ "ItemKNNIncrementalMovieLens100K",
90
+ "ItemKNNRolling",
91
+ "ItemKNNStatic",
92
+ "MostPopular",
93
+ "Random",
94
+ "RecentPopularity",
95
+ "TARSItemKNN",
96
+ "TARSItemKNNDing",
97
+ "TARSItemKNNLiu",
98
+ "TARSItemKNNVaz",
99
+ ]
@@ -0,0 +1,377 @@
1
+ import logging
2
+ import time
3
+ from abc import abstractmethod
4
+ from inspect import Parameter, signature
5
+ from typing import Self
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy.sparse import csr_matrix
10
+ from sklearn.base import BaseEstimator
11
+ from sklearn.utils.validation import check_is_fitted
12
+
13
+ from recnexteval.matrix import InteractionMatrix, ItemUserBasedEnum, PredictionMatrix, to_csr_matrix
14
+ from ..models import BaseModel, ParamMixin
15
+ from ..utils import add_columns_to_csr_matrix, add_rows_to_csr_matrix
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class Algorithm(BaseEstimator, BaseModel, ParamMixin):
    """Base class for all recnexteval algorithm implementations."""

    # NOTE: the class docstring above is returned verbatim by `description`,
    # so it is runtime-visible text and is intentionally left unchanged.
    #
    # Subclasses implement `_fit` and `_predict`; the public `fit` and
    # `predict` wrappers add input transformation, fitted-state checks and
    # timing around them.

    # Declared here, assigned by concrete subclasses.
    ITEM_USER_BASED: ItemUserBasedEnum

    def __init__(self) -> None:
        super().__init__()
        # Keep a seed that was already set on the instance; otherwise fall
        # back to a fixed default so random padding is reproducible.
        if not hasattr(self, "seed"):
            self.seed = 42
        self.rand_gen = np.random.default_rng(seed=self.seed)

    @property
    def description(self) -> str:
        """Description of the algorithm.

        :return: The class docstring, or a placeholder when there is none
        :rtype: str
        """
        return self.__doc__ or "No description provided."

    @property
    def identifier(self) -> str:
        """Identifier of the object.

        Identifier is made by combining the class name with the parameters
        passed at construction time.

        Constructed by recreating the initialisation call.
        Example: `Algorithm(param_1=value)`

        :return: Identifier of the object
        :rtype: str
        """
        paramstring = ",".join(f"{k}={v}" for k, v in self.get_params().items())
        return self.name + "(" + paramstring + ")"

    @classmethod
    def get_default_params(cls) -> dict:
        """Get default parameters without instantiation.

        Uses inspect.signature to extract __init__ parameters and their
        default values without instantiating the class.

        Returns:
            Dictionary of parameter names to default values.
            Parameters without defaults map to None.
        """
        try:
            sig = signature(cls.__init__)
        except (ValueError, TypeError):
            # Fallback for built-in types or special cases
            return {}

        params = {}
        for param_name, param in sig.parameters.items():
            if param_name == "self":
                continue

            if param.kind in (Parameter.VAR_POSITIONAL, Parameter.VAR_KEYWORD):
                # Skip *args, **kwargs
                continue

            # Parameters without a default are reported as None.
            params[param_name] = None if param.default is Parameter.empty else param.default

        return params

    def __str__(self) -> str:
        return self.name

    def set_params(self, **params) -> Self:
        """Set the parameters of the estimator.

        :param params: Estimator parameters
        :type params: dict
        :return: The estimator itself, as returned by the parent implementation
        :rtype: Self
        """
        return super().set_params(**params)

    @abstractmethod
    def _fit(self, X: csr_matrix) -> Self:
        """Stub implementation for fitting an algorithm.

        Will be called by the `fit` wrapper.
        Child classes should implement this function.

        :param X: User-item interaction matrix to fit the model to
        :type X: csr_matrix
        :raises NotImplementedError: Implement this method in the child class
        """
        raise NotImplementedError("Please implement _fit")

    @abstractmethod
    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Stub for predicting scores to users

        Will be called by the `predict` wrapper.
        Child classes should implement this function.

        :param X: User-item interaction matrix used as input to predict
        :type X: PredictionMatrix
        :raises NotImplementedError: Implement this method in the child class
        :return: Predictions made for all nonzero users in X
        :rtype: csr_matrix
        """
        raise NotImplementedError("Please implement _predict")

    def _check_fit_complete(self) -> None:
        """Helper function to check if model was correctly fitted

        Uses the sklearn check_is_fitted function,
        https://scikit-learn.org/stable/modules/generated/sklearn.utils.validation.check_is_fitted.html
        """
        check_is_fitted(self)

    def _transform_fit_input(self, X: InteractionMatrix | csr_matrix) -> csr_matrix:
        """Transform the training data to expected type

        Data will be turned into a binary csr matrix.

        :param X: User-item interaction matrix to fit the model to
        :type X: InteractionMatrix | csr_matrix
        :return: Transformed user-item interaction matrix to fit the model
        :rtype: csr_matrix
        """
        return to_csr_matrix(X, binary=True)

    def fit(self, X: InteractionMatrix) -> Self:
        """Fit the model to the input interaction matrix.

        The input data is transformed to the expected type using
        :meth:`_transform_fit_input`. The fitting is done using the
        :meth:`_fit` method. Finally the method checks that the fitting
        was successful using :meth:`_check_fit_complete`.

        :param X: The interactions to fit the model on.
        :type X: InteractionMatrix
        :return: Fitted algorithm
        :rtype: Algorithm
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()
        X_transformed = self._transform_fit_input(X)
        self._fit(X_transformed)

        self._check_fit_complete()
        elapsed = time.perf_counter() - start
        # `.3f` prints fixed-point seconds; plain `.3` means three significant
        # digits and would render long fits as e.g. "1.23e+03s".
        logger.debug(f"Fitting {self.name} complete - Took {elapsed:.3f}s")
        return self

    def _pad_unknown_iid_with_none_strategy(
        self,
        y_pred: csr_matrix,
        current_shape: tuple[int, int],
        intended_shape: tuple[int, int],
    ) -> csr_matrix:
        """Pad the predictions with empty fields for unknown items.

        This is to ensure that when we compute the performance of the prediction, we are
        comparing the prediction against the ground truth for the same set of items.

        :param y_pred: Predictions made by the algorithm
        :type y_pred: csr_matrix
        :param current_shape: (users, items) shape known to the model; only
            the item count is used here
        :type current_shape: tuple[int, int]
        :param intended_shape: The intended shape of the prediction matrix
        :type intended_shape: tuple[int, int]
        :return: ``y_pred`` unchanged when it already has the intended shape,
            otherwise ``y_pred`` with empty item columns appended
        :rtype: csr_matrix
        """
        if y_pred.shape == intended_shape:
            return y_pred

        _, known_item_id = current_shape
        logger.debug(f"Padding item ID in range({known_item_id}, {intended_shape[1]}) with empty fields")
        y_pred = add_columns_to_csr_matrix(y_pred, intended_shape[1] - known_item_id)
        logger.debug(f"Padding by {self.name} completed")
        return y_pred

    # TODO change X_pred to y_pred for consistency
    def _pad_unknown_uid_with_random_strategy(
        self,
        X_pred: csr_matrix,
        current_shape: tuple[int, int],
        intended_shape: tuple[int, int],
        predict_ui_df: pd.DataFrame,
        k: int = 10,
    ) -> csr_matrix:
        """Pad the predictions with random items for users that are not in the training data.

        :param X_pred: Predictions made by the algorithm
        :type X_pred: csr_matrix
        :param current_shape: (users, items) shape known to the model; user
            IDs at or above the user count are treated as unknown
        :type current_shape: tuple[int, int]
        :param intended_shape: The intended shape of the prediction matrix
        :type intended_shape: tuple[int, int]
        :param predict_ui_df: DataFrame containing the user IDs to predict for
        :type predict_ui_df: pd.DataFrame
        :param k: Number of random item draws made for every unknown user
        :type k: int
        :return: The padded prediction matrix
        :rtype: csr_matrix
        """
        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id, known_item_id = current_shape
        # Grow the matrix with empty rows up to the intended number of users.
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        # pad users with random items
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with random items")
        to_predict = pd.Series(predict_ui_df.uid.unique())
        # Filter for users not in training data
        filtered = to_predict[to_predict >= known_user_id]
        filtered = filtered.sort_values(ignore_index=True)
        if not filtered.empty:
            # Every unknown user appears k times, once per random draw.
            row = filtered.values.repeat(k)
            total_pad = len(row)
            # Draws are made with replacement among the known item IDs.
            col = self.rand_gen.integers(0, known_item_id, total_pad)
            pad = csr_matrix((np.ones(total_pad), (row, col)), shape=intended_shape)
        else:
            pad = csr_matrix(intended_shape)
        X_pred += pad
        logger.debug(f"Padding by {self.name} completed")
        return X_pred

    def predict(self, X: PredictionMatrix) -> csr_matrix:
        """Predicts scores, given the interactions in X

        Checks that the model has been fitted using
        :meth:`_check_fit_complete` and then delegates to :meth:`_predict`.
        The result of :meth:`_predict` is returned as is; no padding for
        unknown users or items is applied by this wrapper.

        :param X: interactions to predict from.
        :type X: PredictionMatrix
        :return: The recommendation scores in a sparse matrix format.
        :rtype: csr_matrix
        """
        self._check_fit_complete()
        return self._predict(X)
252
+
253
+
254
class PopularityPaddingMixin:
    """Mixin class to add popularity-based padding to prediction methods."""

    def __init__(self, pad_with_popularity: bool = False) -> None:
        super().__init__()
        self.pad_with_popularity = pad_with_popularity

    def get_popularity_scores(self, X: csr_matrix) -> np.ndarray:
        """Compute a popularity-based scoring vector for items.

        This method calculates normalized interaction counts for each item,
        selects the top-K most popular items, and returns a vector where
        only those top-K items have their normalized scores (others are 0).
        This is used to pad predictions for unseen users with popular items.

        K is read from ``self.K`` when the instance has that attribute and
        defaults to 100 otherwise. When the matrix holds no interactions, or
        the effective K is not positive, an all-zero vector is returned
        instead of dividing by zero.

        :param X: The interaction matrix (user-item) to compute popularity from.
        :type X: csr_matrix
        :return: A 1D array of shape (num_items,) with popularity scores for top-K items.
        :rtype: np.ndarray
        """
        num_items = X.shape[1]
        # asarray + ravel flattens the column sums whether `sum` returns an
        # np.matrix or a plain ndarray.
        interaction_counts = np.asarray(X.sum(axis=0)).ravel()

        k_value = getattr(self, "K", 100)
        if num_items < k_value:
            logger.warning("K is larger than the number of items.")
        effective_k = min(k_value, num_items)

        popularity_vector = np.zeros(num_items)
        # With effective_k == 0 the slice `[-0:]` would select every item, and
        # argpartition fails on an empty array, so return early.
        if effective_k <= 0:
            return popularity_vector

        max_count = interaction_counts.max()
        # Nothing to normalise by: avoid producing NaN scores.
        if max_count == 0:
            return popularity_vector

        normalized_scores = interaction_counts / max_count
        # Get indices of top-K items by popularity
        top_k_indices = np.argpartition(normalized_scores, -effective_k)[-effective_k:]
        popularity_vector[top_k_indices] = normalized_scores[top_k_indices]
        return popularity_vector

    def _pad_uknown_uid_with_popularity_strategy(
        self,
        X_pred: csr_matrix,
        intended_shape: tuple,
        predict_ui_df: pd.DataFrame,
    ) -> csr_matrix:
        """Pad the predictions with popular items for users that are not in the training data.

        Popularity is computed from ``X_pred`` itself via
        :meth:`get_popularity_scores`. Users are treated as unknown when their
        ID is at or above the number of rows ``X_pred`` had on entry.

        :param X_pred: Predictions made by the algorithm
        :type X_pred: csr_matrix
        :param intended_shape: The intended shape of the prediction matrix
        :type intended_shape: tuple
        :param predict_ui_df: DataFrame containing the user IDs to predict for
        :type predict_ui_df: pd.DataFrame
        :return: The padded prediction matrix
        :rtype: csr_matrix
        """
        if X_pred.shape == intended_shape:
            return X_pred

        known_user_id = X_pred.shape[0]
        X_pred = add_rows_to_csr_matrix(X_pred, intended_shape[0] - known_user_id)
        # pad users with popular items
        logger.debug(f"Padding user ID in range({known_user_id}, {intended_shape[0]}) with popular items")
        popular_items = self.get_popularity_scores(X_pred)

        # Filter for users not in training data
        requested_uids = predict_ui_df["uid"].unique()
        unseen_uids = requested_uids[requested_uids >= known_user_id]
        if len(unseen_uids) == 0:
            return X_pred

        # Row assignment changes the sparsity structure, which is expensive on
        # CSR; do the writes on a LIL copy and convert back once.
        padded = X_pred.tolil()
        for user_id in unseen_uids:
            padded[user_id, :] = popular_items
        return padded.tocsr()

    # Correctly spelled alias; the original misspelled name is kept so
    # existing callers continue to work.
    _pad_unknown_uid_with_popularity_strategy = _pad_uknown_uid_with_popularity_strategy
326
+
327
+
328
class TopKAlgorithm(Algorithm):
    """Base algorithm for algorithms that recommend top-K items for every user."""

    def __init__(self, K: int = 10) -> None:
        # Run the base initialisation first, then record K as a plain
        # attribute named after the constructor parameter.
        super().__init__()
        self.K = K
334
+
335
+
336
class TopKItemSimilarityMatrixAlgorithm(TopKAlgorithm):
    """Base algorithm for algorithms that fit an item to item similarity model with K similar items for every item

    Model that encodes the similarity between items is expected
    under the ``similarity_matrix_`` attribute.

    This matrix should have shape ``(|items| x |items|)``.
    This can be dense or sparse matrix depending on the algorithm used.

    Predictions are made by computing the dot product of the history vector of a user
    and the similarity matrix.

    Usually a new algorithm will have to
    implement just the :meth:`_fit` method,
    to construct the `self.similarity_matrix_` attribute.
    """

    similarity_matrix_: csr_matrix

    def _check_fit_complete(self) -> None:
        """Verify the fitted state, including the similarity matrix.

        Steps:

        - Delegate to the parent check (sklearn's `check_is_fitted`).
        - Raise ``AttributeError`` when ``similarity_matrix_`` was not set.
        - Log a warning with the number of rows of the similarity matrix
          that contain no nonzero entry.
        """
        super()._check_fit_complete()

        try:
            similarity = self.similarity_matrix_
        except AttributeError:
            raise AttributeError(f"{self.name} has no attribute similarity_matrix_ after fitting.") from None

        # Rows drive the recommendation options, so count the distinct row
        # indices that carry at least one nonzero similarity.
        row_indices = similarity.nonzero()[0]
        rows_with_score = np.unique(row_indices).size

        missing = similarity.shape[0] - rows_with_score
        if missing <= 0:
            return
        logger.warning(f"{self.name} missing similar items for {missing} items.")
@@ -0,0 +1,10 @@
1
+ from .most_popular import MostPopular
2
+ from .random import Random
3
+ from .recent_popularity import RecentPopularity
4
+
5
+
6
+ __all__ = [
7
+ "MostPopular",
8
+ "Random",
9
+ "RecentPopularity",
10
+ ]
@@ -0,0 +1,110 @@
1
+ from warnings import warn
2
+
3
+ import numpy as np
4
+ from scipy.sparse import csr_matrix, lil_matrix
5
+
6
+ from ...matrix import InteractionMatrix
7
+ from ..base import Algorithm
8
+
9
+
10
class DecayPopularity(Algorithm):
    """A popularity-based algorithm with exponential decay over data from earlier time windows.
    """
    IS_BASE: bool = False

    def __init__(self, K: int = 200) -> None:
        super().__init__()
        self.K = K
        self.historical_data: list[csr_matrix] = []  # Store all historical training data
        self.num_items = 0  # Track the maximum number of items seen so far

    def _pad_matrix(self, matrix: csr_matrix, new_num_items: int) -> csr_matrix:
        """
        Pad a sparse matrix with zero columns to match the new number of items.

        The matrix is never densified: in CSR layout trailing empty columns
        only change the declared shape, so the index arrays are copied
        and the column count is widened. The dtype of ``matrix`` is preserved.

        :param matrix: The matrix to pad
        :type matrix: csr_matrix
        :param new_num_items: The target number of columns
        :type new_num_items: int
        :return: ``matrix`` itself when it is already wide enough, otherwise
            a new, wider matrix
        :rtype: csr_matrix
        """
        if matrix.shape[1] >= new_num_items:
            return matrix
        source = matrix.tocsr()
        return csr_matrix(
            (source.data, source.indices, source.indptr),
            shape=(source.shape[0], new_num_items),
            copy=True,
        )

    def _expand_historical_data(self, new_num_items: int) -> None:
        """
        Expand all matrices in historical_data to match the new number of items.

        :param new_num_items: The updated number of items
        :type new_num_items: int
        """
        # Slice assignment keeps the same list object; `_pad_matrix` returns
        # matrices that are already wide enough unchanged.
        self.historical_data[:] = [
            self._pad_matrix(matrix, new_num_items) for matrix in self.historical_data
        ]

    def _fit(self, X: csr_matrix) -> "DecayPopularity":
        """
        Fit the model by applying decay to historical data and adding new data.

        Every stored window contributes its per-item interaction sums,
        weighted by ``exp(-age)`` where the newest window has age 0. The
        scores are normalised by their maximum and only the top-K entries
        are kept in ``decayed_scores_``; all other entries are 0. When there
        is no positive score at all, ``decayed_scores_`` is all zeros.

        :param X: Interaction matrix (users x items) for the current window
        :type X: csr_matrix
        :return: The fitted algorithm
        :rtype: DecayPopularity
        """
        # Update the maximum number of items
        new_num_items = X.shape[1]
        if new_num_items > self.num_items:
            self._expand_historical_data(new_num_items)
            self.num_items = new_num_items

        # Append the new matrix (ensure it has the correct number of items)
        if X.shape[1] < self.num_items:
            X = self._pad_matrix(X, self.num_items)

        # Append new data to historical data
        self.historical_data.append(X)

        num_items = X.shape[1]
        if num_items < self.K:
            warn("K is larger than the number of items.", UserWarning)

        # Accumulate decayed per-item sums; the most recent window has age 0.
        decayed_scores = np.zeros(num_items)
        newest = len(self.historical_data) - 1
        for i, matrix in enumerate(self.historical_data):
            decay_factor = np.exp(-(newest - i))
            decayed_scores += np.asarray(matrix.sum(axis=0)).ravel() * decay_factor

        top_scores = np.zeros(num_items)
        K = min(self.K, num_items)
        max_score = decayed_scores.max() if num_items else 0.0
        # Guard against division by zero (NaN scores) and against K == 0,
        # where the slice `[-0:]` would select every item.
        if K > 0 and max_score > 0:
            normalized_scores = decayed_scores / max_score
            ind = np.argpartition(normalized_scores, -K)[-K:]
            top_scores[ind] = normalized_scores[ind]

        self.decayed_scores_ = top_scores
        return self

    def _predict(self, X: csr_matrix, predict_im: InteractionMatrix | None = None) -> csr_matrix:
        """
        Predict the K most popular item for each user scaled by the decay factor.

        :param X: Matrix whose shape gives the known number of users and items
        :type X: csr_matrix
        :param predict_im: Interactions whose ``uid`` column lists the users
            to predict for
        :type predict_im: InteractionMatrix | None
        :raises AttributeError: When ``predict_im`` is not provided
        :return: Matrix in which every requested user row holds the decayed
            popularity scores
        :rtype: csr_matrix
        """
        # NOTE(review): a caller that passes only `X` supplies no
        # `predict_im`. The parameter now defaults to None so that call path
        # reaches the explicit error below instead of a bare TypeError;
        # confirm how this class is meant to obtain the users to predict for.
        if predict_im is None:
            raise AttributeError("Predict frame with requested ID is required for Popularity algorithm")

        predict_frame = predict_im._df

        users = predict_frame["uid"].unique().tolist()
        known_item_id = X.shape[1]

        # predict_frame contains (user_id, -1) pairs
        max_user_id = predict_frame["uid"].max() + 1
        intended_shape = (max(max_user_id, X.shape[0]), known_item_id)

        # LIL supports efficient row assignment; convert to CSR at the end.
        X_pred = lil_matrix(intended_shape)
        X_pred[users] = self.decayed_scores_

        return X_pred.tocsr()
@@ -0,0 +1,72 @@
1
+ import logging
2
+ from typing import Self
3
+
4
+ import numpy as np
5
+ from scipy.sparse import csr_matrix, hstack, vstack
6
+
7
+ from ...matrix import PredictionMatrix
8
+ from ..base import PopularityPaddingMixin, TopKAlgorithm
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class MostPopular(TopKAlgorithm, PopularityPaddingMixin):
    """A popularity-based algorithm that considers all historical data."""

    IS_BASE: bool = False
    X_: csr_matrix | None = None  # Store all historical training data

    @staticmethod
    def _pad_to_shape(matrix: csr_matrix, num_rows: int, num_cols: int) -> csr_matrix:
        """Return ``matrix`` grown with empty rows/columns to at least the given shape."""
        if matrix.shape[0] < num_rows:  # Pad rows
            row_padding = csr_matrix((num_rows - matrix.shape[0], matrix.shape[1]))
            matrix = vstack([matrix, row_padding], format="csr")
        if matrix.shape[1] < num_cols:  # Pad columns
            col_padding = csr_matrix((matrix.shape[0], num_cols - matrix.shape[1]))
            matrix = hstack([matrix, col_padding], format="csr")
        return matrix

    def _append_training_data(self, X: csr_matrix) -> None:
        """Append a new interaction matrix to the historical data.

        Both the stored matrix and ``X`` are padded with empty rows and
        columns to a common shape and then added element-wise; the result
        replaces ``self.X_``.

        Args:
            X (csr_matrix): Interaction matrix to append

        Raises:
            ValueError: If no training data has been stored yet.
        """
        if self.X_ is None:
            raise ValueError("No existing training data to append to.")

        previous = self.X_
        num_rows = max(previous.shape[0], X.shape[0])
        num_cols = max(previous.shape[1], X.shape[1])

        # Neither padding nor addition mutates its operands, so no defensive
        # copy of the stored matrix is needed.
        self.X_ = self._pad_to_shape(previous, num_rows, num_cols) + self._pad_to_shape(X, num_rows, num_cols)

    def _fit(self, X: csr_matrix) -> Self:
        """Accumulate ``X`` into the stored history and recompute popularity scores.

        Args:
            X (csr_matrix): Interaction matrix for the current training step

        Returns:
            The fitted algorithm.

        Raises:
            ValueError: If the accumulated training data is not a csr_matrix.
        """
        if self.X_ is not None:
            self._append_training_data(X)
        else:
            self.X_ = X.copy()

        if not isinstance(self.X_, csr_matrix):
            raise ValueError("Training data is not initialized properly.")

        if self.X_.shape[1] < self.K:
            # Logger methods treat extra positional arguments as %-format
            # values, so only the message is passed here.
            logger.warning("K is larger than the number of items.")

        self.sorted_scores_ = self.get_popularity_scores(self.X_)
        return self

    def _predict(self, X: PredictionMatrix) -> csr_matrix:
        """Return the fitted popularity scores once per prediction row.

        The number of rows is ``X.get_prediction_data().num_interactions``;
        the number of columns is the length of the fitted score vector.

        Args:
            X (PredictionMatrix): Prediction request

        Returns:
            Sparse matrix in which every row equals the fitted score vector.
        """
        num_rows = X.get_prediction_data().num_interactions
        scores = np.asarray(self.sorted_scores_)

        # Build the CSR arrays directly rather than materialising a dense
        # (rows x items) array first: every row stores the same nonzero
        # columns and values.
        item_ids = np.flatnonzero(scores)
        row_ptr = np.arange(num_rows + 1) * item_ids.size
        return csr_matrix(
            (np.tile(scores[item_ids], num_rows), np.tile(item_ids, num_rows), row_ptr),
            shape=(num_rows, scores.size),
        )