recnexteval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,784 @@
1
+ import logging
2
+ import operator
3
+ from collections.abc import Callable
4
+ from copy import deepcopy
5
+ from enum import StrEnum
6
+ from typing import Literal, overload
7
+ from warnings import warn
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from scipy.sparse import csr_matrix
12
+
13
+ from .exception import TimestampAttributeMissingError
14
+
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class ItemUserBasedEnum(StrEnum):
20
+ """Enum class for item and user based properties.
21
+
22
+ Enum class to indicate if the function or logic is based on item or user.
23
+ """
24
+
25
+ ITEM = "item"
26
+ """Property based on item"""
27
+ USER = "user"
28
+ """Property based on user"""
29
+
30
+ @classmethod
31
+ def has_value(cls, value: str) -> bool:
32
+ """Check valid value for ItemUserBasedEnum
33
+
34
+ :param value: String value input
35
+ :type value: str
36
+ """
37
+ return value in ItemUserBasedEnum
38
+
39
+
40
+ class InteractionMatrix:
41
+ """Matrix of interaction data between users and items.
42
+
43
+ It provides a number of properties and methods for easy manipulation of this interaction data.
44
+
45
+ .. attention::
46
+
47
+ - The InteractionMatrix does not assume binary user-item pairs.
48
+ If a user interacts with an item more than once, there will be two
49
+ entries for this user-item pair.
50
+
51
+ - We assume that the user and item IDs are integers starting from 0. IDs
52
+ that are indicated by "-1" are reserved to label the user or item to
53
+ be predicted. This assumption is crucial as it will be used during the
54
+ split scheme and evaluation of the RS since it will affect the 2D shape
55
+ of the CSR matrix
56
+
57
+ :param df: Dataframe containing user-item interactions. Must contain at least
58
+ item ids and user ids.
59
+ :type df: pd.DataFrame
60
+ :param item_ix: Item ids column name.
61
+ :type item_ix: str
62
+ :param user_ix: User ids column name.
63
+ :type user_ix: str
64
+ :param timestamp_ix: Interaction timestamps column name.
65
+ :type timestamp_ix: str
66
+ :param shape: The desired shape of the matrix, i.e. the number of users and items.
67
+ If no shape is specified, the number of users will be equal to the
68
+ maximum user id plus one, the number of items to the maximum item
69
+ id plus one.
70
+ :type shape: tuple[int, int], optional
71
+ :param skip_df_processing: Skip processing of the dataframe. This is useful
72
+ when the dataframe is already processed and the columns are already
73
+ renamed.
74
+ :type skip_df_processing: bool, optional
75
+ """
76
+
77
+ ITEM_IX = "iid"
78
+ USER_IX = "uid"
79
+ TIMESTAMP_IX = "ts"
80
+ INTERACTION_IX = "interactionid"
81
+ MASKED_LABEL = -1
82
+
83
+ def __init__(
84
+ self,
85
+ df: pd.DataFrame,
86
+ item_ix: str,
87
+ user_ix: str,
88
+ timestamp_ix: str,
89
+ shape: None | tuple[int, int] = None,
90
+ skip_df_processing: bool = False,
91
+ ) -> None:
92
+ self.user_item_shape: tuple[int, int]
93
+ """The shape of the interaction matrix, i.e. `|user| x |item|`."""
94
+ if shape:
95
+ self.user_item_shape = shape
96
+
97
+ if skip_df_processing:
98
+ self._df = df
99
+ return
100
+
101
+ col_mapper = {
102
+ item_ix: InteractionMatrix.ITEM_IX,
103
+ user_ix: InteractionMatrix.USER_IX,
104
+ timestamp_ix: InteractionMatrix.TIMESTAMP_IX,
105
+ }
106
+ df = df.rename(columns=col_mapper)
107
+ required_columns = [
108
+ InteractionMatrix.USER_IX,
109
+ InteractionMatrix.ITEM_IX,
110
+ InteractionMatrix.TIMESTAMP_IX,
111
+ ]
112
+ extra_columns = [col for col in df.columns if col not in required_columns]
113
+ df = df[required_columns + extra_columns].copy()
114
+ # TODO refactor this statement below
115
+ df = df.reset_index(drop=True).reset_index().rename(columns={"index": InteractionMatrix.INTERACTION_IX})
116
+
117
+ self._df = df
118
+
119
def copy(self) -> "InteractionMatrix":
    """Create a deep copy of this InteractionMatrix.

    The whole object is passed to :func:`copy.deepcopy`.

    :return: Deep copy of this InteractionMatrix.
    :rtype: InteractionMatrix
    """
    return deepcopy(self)
126
+
127
def copy_df(self, reset_index: bool = False) -> "pd.DataFrame":
    """Return an independent copy of the underlying dataframe.

    :param reset_index: When True, the copy gets a fresh ``0..n-1`` index
        (the old index is dropped); otherwise the index is kept as-is.
    :type reset_index: bool, optional
    :return: Deep copy of dataframe.
    :rtype: pd.DataFrame
    """
    frame = self._df.reset_index(drop=True) if reset_index else self._df
    return deepcopy(frame)
136
+
137
def concat(self, im: "InteractionMatrix | pd.DataFrame") -> "InteractionMatrix":
    """Append the interactions of ``im`` to this InteractionMatrix.

    .. note::
        This is an in-place operation and will modify the current object.

    :param im: InteractionMatrix (its dataframe is used) or a raw dataframe
        to concat with.
    :type im: Union[InteractionMatrix, pd.DataFrame]
    :return: This InteractionMatrix, now holding the interactions from both.
    :rtype: InteractionMatrix
    """
    other_frame = im if isinstance(im, pd.DataFrame) else im._df
    self._df = pd.concat([self._df, other_frame])
    return self
154
+
155
+ # TODO this should be shifted to prediction matrix
156
def union(self, im: "InteractionMatrix") -> "InteractionMatrix":
    """Combine events from this InteractionMatrix with another.

    Named alias for the ``+`` operator (evaluates ``self + im``).

    :param im: InteractionMatrix to union with.
    :type im: InteractionMatrix
    :return: Union of interactions in this InteractionMatrix and the other.
    :rtype: InteractionMatrix
    """
    return self + im
165
+
166
def difference(self, im: "InteractionMatrix") -> "InteractionMatrix":
    """Difference between this InteractionMatrix and another.

    Named alias for the ``-`` operator (evaluates ``self - im``).

    :param im: InteractionMatrix to subtract from this.
    :type im: InteractionMatrix
    :return: Difference between this InteractionMatrix and the other.
    :rtype: InteractionMatrix
    """
    return self - im
175
+
176
+ @property
177
+ def values(self) -> csr_matrix:
178
+ """All user-item interactions as a sparse matrix of size (|`global_users`|, |`global_items`|).
179
+
180
+ Each entry is the number of interactions between that user and item.
181
+ If there are no interactions between a user and item, the entry is 0.
182
+
183
+ :return: Interactions between users and items as a csr_matrix.
184
+ :rtype: csr_matrix
185
+ """
186
+ # TODO issue with -1 labeling in the interaction matrix should i create prediction matrix
187
+ if not hasattr(self, "user_item_shape"):
188
+ raise AttributeError("InteractionMatrix has no `user_item_shape` attribute. Please call mask_shape() first.")
189
+
190
+ values = np.ones(self._df.shape[0])
191
+ indices = self._df[[InteractionMatrix.USER_IX, InteractionMatrix.ITEM_IX]].values
192
+ indices = (indices[:, 0], indices[:, 1])
193
+
194
+ matrix = csr_matrix((values, indices), shape=self.user_item_shape, dtype=np.int32)
195
+ return matrix
196
+
197
@property
def indices(self) -> tuple[list[int], list[int]]:
    """Returns a tuple of lists of user IDs and item IDs corresponding to interactions.

    This is the result of calling ``nonzero()`` on the sparse matrix built by
    :attr:`values`, so it requires ``user_item_shape`` to be set.

    :return: tuple of lists of user IDs and item IDs that correspond to at least one interaction.
    :rtype: tuple[list[int], list[int]]
    """
    # NOTE(review): csr_matrix.nonzero() is documented to return arrays, not
    # lists - the annotation may be looser than the actual return type.
    return self.values.nonzero()
205
+
206
def nonzero(self) -> tuple[list[int], list[int]]:
    """Return ``self.values.nonzero()``; same result as the :attr:`indices` property."""
    return self.values.nonzero()
208
+
209
@overload
def users_in(self, U: set[int]) -> "InteractionMatrix": ...
@overload
def users_in(self, U: set[int], inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def users_in(self, U: set[int], inplace: Literal[True]) -> None: ...
def users_in(self, U: set[int], inplace: bool = False) -> "None | InteractionMatrix":
    """Keep only interactions by one of the specified users.

    :param U: A set or list of users to select the interactions from.
    :type U: set[int]
    :param inplace: Apply the selection in place or not, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    logger.debug("Performing users_in comparison")

    # Boolean row mask: True where the row's user id is a member of U.
    mask = self._df[InteractionMatrix.USER_IX].isin(U)

    return self._apply_mask(mask, inplace=inplace)
230
+
231
+ @overload
232
+ def _apply_mask(self, mask: pd.Series) -> "InteractionMatrix": ...
233
+ @overload
234
+ def _apply_mask(self, mask: pd.Series, inplace: Literal[True]) -> None: ...
235
+ @overload
236
+ def _apply_mask(self, mask: pd.Series, inplace: Literal[False]) -> "InteractionMatrix": ...
237
+ def _apply_mask(self, mask: pd.Series, inplace: bool = False) -> "None | InteractionMatrix":
238
+ interaction_m = self if inplace else self.copy()
239
+ interaction_m._df = interaction_m._df[mask]
240
+ return None if inplace else interaction_m
241
+
242
def _timestamps_cmp(self, op: Callable, timestamp: float, inplace: bool = False) -> "None | InteractionMatrix":
    """Filter interactions based on timestamp.
    Keep only interactions for which op(t, timestamp) is True.

    :param op: Comparison operator, applied to the whole timestamp column at once.
    :type op: Callable
    :param timestamp: Timestamp to compare against in seconds from epoch.
    :type timestamp: float
    :param inplace: Modify the data matrix in place. If False, returns a new object.
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    logger.debug(f"Performing {op.__name__}(t, {timestamp})")

    # `op` receives the timestamp column and the scalar, yielding the row mask.
    mask = op(self._df[InteractionMatrix.TIMESTAMP_IX], timestamp)
    return self._apply_mask(mask, inplace=inplace)
257
+
258
@overload
def timestamps_gt(self, timestamp: float) -> "InteractionMatrix": ...
@overload
def timestamps_gt(self, timestamp: float, inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def timestamps_gt(self, timestamp: float, inplace: Literal[True]) -> None: ...
def timestamps_gt(self, timestamp: float, inplace: bool = False) -> "None | InteractionMatrix":
    """Select interactions strictly after a given timestamp.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    return self._timestamps_cmp(operator.gt, timestamp, inplace=inplace)
274
+
275
@overload
def timestamps_gte(self, timestamp: float) -> "InteractionMatrix": ...
@overload
def timestamps_gte(self, timestamp: float, inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def timestamps_gte(self, timestamp: float, inplace: Literal[True]) -> None: ...
def timestamps_gte(self, timestamp: float, inplace: bool = False) -> "None | InteractionMatrix":
    """Select interactions after and including a given timestamp.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    return self._timestamps_cmp(operator.ge, timestamp, inplace=inplace)
291
+
292
@overload
def timestamps_lt(self, timestamp: float) -> "InteractionMatrix": ...
@overload
def timestamps_lt(self, timestamp: float, inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def timestamps_lt(self, timestamp: float, inplace: Literal[True]) -> None: ...
def timestamps_lt(self, timestamp: float, inplace: bool = False) -> "None | InteractionMatrix":
    """Select interactions strictly before a given timestamp.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    return self._timestamps_cmp(operator.lt, timestamp, inplace=inplace)
308
+
309
@overload
def timestamps_lte(self, timestamp: float) -> "InteractionMatrix": ...
@overload
def timestamps_lte(self, timestamp: float, inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def timestamps_lte(self, timestamp: float, inplace: Literal[True]) -> None: ...
def timestamps_lte(self, timestamp: float, inplace: bool = False) -> "None | InteractionMatrix":
    """Select interactions up to and including a given timestamp.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    return self._timestamps_cmp(operator.le, timestamp, inplace=inplace)
325
+
326
+ def __add__(self, im: "InteractionMatrix") -> "InteractionMatrix":
327
+ """Combine events from this InteractionMatrix with another.
328
+
329
+ :param im: InteractionMatrix to union with.
330
+ :type im: InteractionMatrix
331
+ :return: Union of interactions in this InteractionMatrix and the other.
332
+ :rtype: InteractionMatrix
333
+ """
334
+ df = pd.concat([self._df, im._df], copy=False)
335
+
336
+ shape = None
337
+ if hasattr(self, "user_item_shape") and hasattr(im, "user_item_shape"):
338
+ shape = (max(self.user_item_shape[0], im.user_item_shape[0]), max(self.user_item_shape[1], im.user_item_shape[1]))
339
+ self.user_item_shape = shape
340
+
341
+ return InteractionMatrix(
342
+ df,
343
+ InteractionMatrix.ITEM_IX,
344
+ InteractionMatrix.USER_IX,
345
+ InteractionMatrix.TIMESTAMP_IX,
346
+ shape,
347
+ True,
348
+ )
349
+
350
def __sub__(self, im: "InteractionMatrix") -> "InteractionMatrix":
    """Difference between this InteractionMatrix and another.

    Rows are compared on all dataframe columns: both frames are turned into
    a ``MultiIndex`` and the index difference is converted back to a frame
    with a fresh ``0..n-1`` index. Neither operand is modified.

    When both operands carry a ``user_item_shape`` the result gets the
    element-wise maximum of the two shapes, otherwise no shape is set.

    :param im: InteractionMatrix to subtract from this.
    :type im: InteractionMatrix
    :return: Interactions of this InteractionMatrix that are not in ``im``.
    :rtype: InteractionMatrix
    """
    # NOTE(review): pandas documents Index.difference as a set difference
    # that attempts to sort its result - row order (and exact duplicate
    # rows) of `self` are presumably not preserved; confirm callers are fine.
    full_data = pd.MultiIndex.from_frame(self._df)
    data_part_2 = pd.MultiIndex.from_frame(im._df)
    data_part_1 = full_data.difference(data_part_2).to_frame().reset_index(drop=True)

    shape = None
    if hasattr(self, "user_item_shape") and hasattr(im, "user_item_shape"):
        shape = (max(self.user_item_shape[0], im.user_item_shape[0]), max(self.user_item_shape[1], im.user_item_shape[1]))

    return InteractionMatrix(
        data_part_1,
        InteractionMatrix.ITEM_IX,
        InteractionMatrix.USER_IX,
        InteractionMatrix.TIMESTAMP_IX,
        shape,
        True,
    )
367
+
368
def __repr__(self) -> str:
    """Return the ``repr`` of the underlying dataframe."""
    return repr(self._df)
370
+
371
+ def __eq__(self, value: object) -> bool:
372
+ if not isinstance(value, InteractionMatrix):
373
+ logger.debug(f"Comparing {type(value)} with InteractionMatrix is not supported")
374
+ return False
375
+ return self._df.equals(value._df)
376
+
377
def __len__(self) -> int:
    """Return the number of interactions in the matrix.

    This is distinct from the shape of the matrix, which is the number of
    users and items that has been released to the model. The length of the
    matrix is the number of interactions present in the matrix resulting
    from filter operations, i.e. the number of rows of the underlying
    dataframe.

    :return: Number of rows in the underlying dataframe.
    :rtype: int
    """
    return len(self._df)
386
+
387
@overload
def items_in(self, id_set: set[int]) -> "InteractionMatrix": ...
@overload
def items_in(self, id_set: set[int], inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def items_in(self, id_set: set[int], inplace: Literal[True]) -> None: ...
def items_in(self, id_set: set[int], inplace: bool = False) -> "None | InteractionMatrix":
    """Keep only interactions with the specified items.

    :param id_set: A set or list of items to select the interactions.
    :type id_set: set[int]
    :param inplace: Apply the selection in place or not, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    logger.debug("Performing items_in comparison")

    # Boolean row mask: True where the row's item id is a member of id_set.
    mask = self._df[InteractionMatrix.ITEM_IX].isin(id_set)

    return self._apply_mask(mask, inplace=inplace)
408
+
409
@overload
def items_not_in(self, id_set: set[int]) -> "InteractionMatrix": ...
@overload
def items_not_in(self, id_set: set[int], inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def items_not_in(self, id_set: set[int], inplace: Literal[True]) -> None: ...
def items_not_in(self, id_set: set[int], inplace: bool=False) -> "None | InteractionMatrix":
    """Keep only interactions not with the specified items.

    :param id_set: A set or list of items to exclude from the interactions.
    :type id_set: set[int]
    :param inplace: Apply the selection in place or not, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    logger.debug("Performing items_not_in comparison")

    # Negated membership mask: True where the row's item id is NOT in id_set.
    mask = ~self._df[InteractionMatrix.ITEM_IX].isin(id_set)

    return self._apply_mask(mask, inplace=inplace)
430
+
431
@overload
def users_not_in(self, U: set[int]) -> "InteractionMatrix": ...
@overload
def users_not_in(self, U: set[int], inplace: Literal[False]) -> "InteractionMatrix": ...
@overload
def users_not_in(self, U: set[int], inplace: Literal[True]) -> None: ...
def users_not_in(self, U: set[int], inplace: bool = False) -> "None | InteractionMatrix":
    """Keep only interactions not by the specified users.

    :param U: A set or list of users to exclude from the interactions.
    :type U: set[int]
    :param inplace: Apply the selection in place or not, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    logger.debug("Performing users_not_in comparison")

    # Negated membership mask: True where the row's user id is NOT in U.
    mask = ~self._df[InteractionMatrix.USER_IX].isin(U)

    return self._apply_mask(mask, inplace=inplace)
446
+
447
def interactions_in(self, interaction_ids: list[int], inplace: bool = False) -> "None | InteractionMatrix":
    """Select the interactions by their interaction ids.

    A warning is issued for requested ids that do not occur in the data, and
    another one when no ids are given at all.

    :param interaction_ids: A list of interaction ids
    :type interaction_ids: list[int]
    :param inplace: Apply the selection in place,
        or return a new InteractionMatrix object, defaults to False
    :type inplace: bool, optional
    :return: None if inplace, otherwise new InteractionMatrix
        object with the selected interactions
    :rtype: Union[None, InteractionMatrix]
    """
    logger.debug("Performing interactions_in comparison")

    id_column = self._df[InteractionMatrix.INTERACTION_IX]
    mask = id_column.isin(interaction_ids)

    # Requested ids that are absent from the data are reported, not fatal.
    missing_ids = set(interaction_ids).difference(id_column.unique())
    if missing_ids:
        warn(f"IDs {missing_ids} not present in data")
    if not interaction_ids:
        warn("No interaction IDs given, returning empty InteractionMatrix.")

    return self._apply_mask(mask, inplace=inplace)
471
+
472
def _get_last_n_interactions(
    self,
    by: ItemUserBasedEnum,
    n_seq_data: int,
    t_upper: None | int = None,
    id_in: None | set[int] = None,
    inplace=False,
) -> "InteractionMatrix":
    """Keep the last ``n_seq_data`` rows of every user (or item) group.

    Only rows with a timestamp strictly below ``t_upper`` are considered.
    "Last" refers to row order within each group (``groupby(...).tail``).

    :param by: Group by user when ``ItemUserBasedEnum.USER``, otherwise by item.
    :type by: ItemUserBasedEnum
    :param n_seq_data: Number of rows to keep per group.
    :type n_seq_data: int
    :param t_upper: Exclusive upper timestamp limit; defaults to
        ``max_timestamp + 1`` so the latest interaction is included.
    :type t_upper: None | int, optional
    :param id_in: User (or item, matching ``by``) ids to restrict to. A
        falsy value (None or empty) applies no restriction.
    :type id_in: None | set[int], optional
    :param inplace: Filter this object instead of a copy, defaults to False
    :type inplace: bool, optional
    :raises TimestampAttributeMissingError: If there is no timestamp column.
    :return: The filtered matrix (this object when ``inplace``).
    :rtype: InteractionMatrix
    """
    if not self.has_timestamps:
        raise TimestampAttributeMissingError()
    if t_upper is None:
        t_upper = self.max_timestamp + 1  # to include the last timestamp

    target = self if inplace else self.copy()
    frame = target._df

    keep = frame[InteractionMatrix.TIMESTAMP_IX] < t_upper
    if id_in:
        if by == ItemUserBasedEnum.USER:
            keep = keep & frame[InteractionMatrix.USER_IX].isin(id_in)
        elif by == ItemUserBasedEnum.ITEM:
            keep = keep & frame[InteractionMatrix.ITEM_IX].isin(id_in)

    group_column = InteractionMatrix.USER_IX if by == ItemUserBasedEnum.USER else InteractionMatrix.ITEM_IX
    target._df = frame[keep].groupby(group_column).tail(n_seq_data)

    return target
500
+
501
def _get_first_n_interactions(
    self, by: ItemUserBasedEnum, n_seq_data: int, t_lower: None | int = None, inplace=False
) -> "InteractionMatrix":
    """Keep the first ``n_seq_data`` rows of every user (or item) group.

    Only rows with a timestamp greater than or equal to ``t_lower`` are
    considered. "First" refers to row order within each group
    (``groupby(...).head``).

    :param by: Group by user when ``ItemUserBasedEnum.USER``, otherwise by item.
    :type by: ItemUserBasedEnum
    :param n_seq_data: Number of rows to keep per group.
    :type n_seq_data: int
    :param t_lower: Inclusive lower timestamp limit; defaults to ``min_timestamp``.
    :type t_lower: None | int, optional
    :param inplace: Filter this object instead of a copy, defaults to False
    :type inplace: bool, optional
    :raises TimestampAttributeMissingError: If there is no timestamp column.
    :return: The filtered matrix (this object when ``inplace``).
    :rtype: InteractionMatrix
    """
    if not self.has_timestamps:
        raise TimestampAttributeMissingError()
    if t_lower is None:
        t_lower = self.min_timestamp

    target = self if inplace else self.copy()
    frame = target._df

    recent_enough = frame[InteractionMatrix.TIMESTAMP_IX] >= t_lower
    group_column = InteractionMatrix.USER_IX if by == ItemUserBasedEnum.USER else InteractionMatrix.ITEM_IX
    target._df = frame[recent_enough].groupby(group_column).head(n_seq_data)
    return target
518
+
519
def get_users_n_last_interaction(
    self,
    n_seq_data: int = 1,
    t_upper: None | int = None,
    user_in: None | set[int] = None,
    inplace: bool = False,
) -> "InteractionMatrix":
    """Select the last n interactions for each user.

    Delegates to ``_get_last_n_interactions`` with ``ItemUserBasedEnum.USER``.

    :param n_seq_data: Number of interactions to select, defaults to 1
    :type n_seq_data: int, optional
    :param t_upper: Seconds past t. Upper limit for the timestamp
        of the interactions to select, defaults to None
    :type t_upper: None | int, optional
    :param user_in: set of user IDs to select the interactions from,
        defaults to None
    :type user_in: None | set[int], optional
    :param inplace: If operation is inplace, defaults to False
    :type inplace: bool, optional
    :return: Resulting interaction matrix
    :rtype: InteractionMatrix
    """
    logger.debug("Performing get_user_n_last_interaction comparison")
    return self._get_last_n_interactions(ItemUserBasedEnum.USER, n_seq_data, t_upper, user_in, inplace)
543
+
544
def get_items_n_last_interaction(
    self,
    n_seq_data: int = 1,
    t_upper: None | int = None,
    item_in: None | set[int] = None,
    inplace: bool = False,
) -> "InteractionMatrix":
    """Select the last n interactions for each item.

    Delegates to ``_get_last_n_interactions`` with ``ItemUserBasedEnum.ITEM``.

    :param n_seq_data: Number of interactions to select, defaults to 1
    :type n_seq_data: int, optional
    :param t_upper: Seconds past t. Upper limit for the timestamp
        of the interactions to select, defaults to None
    :type t_upper: None | int, optional
    :param item_in: set of item IDs to select the interactions from,
        defaults to None
    :type item_in: None | set[int], optional
    :param inplace: If operation is inplace, defaults to False
    :type inplace: bool, optional
    :return: Resulting interaction matrix
    :rtype: InteractionMatrix
    """
    logger.debug("Performing get_item_n_last_interaction comparison")
    return self._get_last_n_interactions(ItemUserBasedEnum.ITEM, n_seq_data, t_upper, item_in, inplace)
568
+
569
def get_users_n_first_interaction(
    self, n_seq_data: int = 1, t_lower: None | int = None, inplace: bool = False
) -> "InteractionMatrix":
    """Select the first n interactions for each user.

    Delegates to ``_get_first_n_interactions`` with ``ItemUserBasedEnum.USER``.

    :param n_seq_data: Number of interactions to select, defaults to 1
    :type n_seq_data: int, optional
    :param t_lower: Seconds past t. Lower limit for the timestamp
        of the interactions to select, defaults to None
    :type t_lower: None | int, optional
    :param inplace: If operation is inplace, defaults to False
    :type inplace: bool, optional
    :return: Resulting interaction matrix
    :rtype: InteractionMatrix
    """
    return self._get_first_n_interactions(ItemUserBasedEnum.USER, n_seq_data, t_lower, inplace)
585
+
586
def get_items_n_first_interaction(
    self, n_seq_data: int = 1, t_lower: None | int = None, inplace: bool = False
) -> "InteractionMatrix":
    """Select the first n interactions for each item.

    Delegates to ``_get_first_n_interactions`` with ``ItemUserBasedEnum.ITEM``.

    :param n_seq_data: Number of interactions to select, defaults to 1
    :type n_seq_data: int, optional
    :param t_lower: Seconds past t. Lower limit for the timestamp
        of the interactions to select, defaults to None
    :type t_lower: None | int, optional
    :param inplace: If operation is inplace, defaults to False
    :type inplace: bool, optional
    :return: Resulting interaction matrix
    :rtype: InteractionMatrix
    """
    return self._get_first_n_interactions(ItemUserBasedEnum.ITEM, n_seq_data, t_lower, inplace)
602
+
603
+ @property
604
+ def item_interaction_sequence_matrix(self) -> csr_matrix:
605
+ """Converts the interaction data into an item interaction sequence matrix.
606
+
607
+ Dataframe values are converted into such that the row sequence is maintained and
608
+ the item that interacted with will have the column at item_id marked with 1.
609
+ """
610
+ values = np.ones(self._df.shape[0])
611
+ indices = (np.arange(self._df.shape[0]), self._df[InteractionMatrix.ITEM_IX].to_numpy())
612
+ shape = (self._df.shape[0], self.user_item_shape[1])
613
+
614
+ sparse_matrix = csr_matrix((values, indices), shape=shape, dtype=values.dtype)
615
+ return sparse_matrix
616
+
617
@property
def user_id_sequence_array(self) -> np.ndarray:
    """Array of user IDs in the order of interactions.

    This is the ``USER_IX`` column of the dataframe, in row order.

    :return: Numpy array of user IDs.
    :rtype: np.ndarray
    """
    return self._df[InteractionMatrix.USER_IX].to_numpy()
625
+
626
def get_prediction_data(self) -> "InteractionMatrix":
    """Get the data to be predicted.

    These are the interactions whose item id equals ``MASKED_LABEL``.

    :return: InteractionMatrix with only the data to be predicted.
    :rtype: InteractionMatrix
    """
    # Use the class constant rather than a literal -1 so the masking label
    # is defined in a single place.
    return self.items_in({InteractionMatrix.MASKED_LABEL})
633
+
634
def get_interaction_data(self) -> "InteractionMatrix":
    """Get the interactions in which neither the user nor the item is ``-1``.

    User and item IDs that are not denoted by "-1" are the ones that are
    known to the model.

    :return: InteractionMatrix with only the known data.
    :rtype: InteractionMatrix
    """
    user_is_known = self._df[InteractionMatrix.USER_IX] != -1
    item_is_known = self._df[InteractionMatrix.ITEM_IX] != -1
    return self._apply_mask(user_is_known & item_is_known)
645
+
646
@property
def user_ids(self) -> set[int]:
    """The set of all distinct, non-null user IDs in the matrix."""
    distinct_users = self._df[InteractionMatrix.USER_IX].dropna().unique()
    return set(distinct_users)
650
+
651
@property
def item_ids(self) -> set[int]:
    """The set of all distinct, non-null item IDs in the matrix."""
    distinct_items = self._df[InteractionMatrix.ITEM_IX].dropna().unique()
    return set(distinct_items)
655
+
656
@property
def num_interactions(self) -> int:
    """The total number of interactions, i.e. rows of the dataframe.

    :return: Total interaction count.
    :rtype: int
    """
    row_count = self._df.shape[0]
    return row_count
664
+
665
@property
def has_timestamps(self) -> bool:
    """Whether the dataframe has a column named ``self.TIMESTAMP_IX``.

    :return: True if timestamps information is available, False otherwise.
    :rtype: bool
    """
    column_names = self._df.columns
    return self.TIMESTAMP_IX in column_names
673
+
674
@property
def min_timestamp(self) -> int:
    """The earliest timestamp among the interactions.

    :raises TimestampAttributeMissingError: If ``self.has_timestamps`` is false.
    :return: The earliest timestamp.
    :rtype: int
    """
    if self.has_timestamps:
        timestamp_column = self._df[self.TIMESTAMP_IX]
        return timestamp_column.min()
    raise TimestampAttributeMissingError()
684
+
685
@property
def max_timestamp(self) -> int:
    """The latest timestamp among the interactions.

    :raises TimestampAttributeMissingError: If ``self.has_timestamps`` is false.
    :return: The latest timestamp.
    :rtype: int
    """
    if self.has_timestamps:
        timestamp_column = self._df[self.TIMESTAMP_IX]
        return timestamp_column.max()
    raise TimestampAttributeMissingError()
695
+
696
@property
def max_global_user_id(self) -> int:
    """Exclusive upper bound on the user IDs seen anywhere in the dataframe.

    The difference between `max_global_user_id` and `max_user_id` is that
    `max_global_user_id` considers all user IDs present in the dataframe,
    including users that are only encountered during prediction time.

    The value is the larger of (highest user ID in the dataframe + 1) and
    ``self.user_item_shape[0]``. When the user column has no values (empty
    or all-null), ``self.user_item_shape[0]`` is returned.

    :return: Upper bound on user IDs.
    :rtype: int
    """
    highest = self._df[InteractionMatrix.USER_IX].max()
    # ``max()`` of an empty/all-null column is NaN, and ``int(NaN)`` raises.
    if pd.isna(highest):
        return self.user_item_shape[0]
    return max(int(highest) + 1, self.user_item_shape[0])
705
+
706
@property
def max_global_item_id(self) -> int:
    """Exclusive upper bound on the item IDs seen anywhere in the dataframe.

    The value is the larger of (highest item ID in the dataframe + 1) and
    ``self.user_item_shape[1]``. When the item column has no values (empty
    or all-null), ``self.user_item_shape[1]`` is returned.

    :return: Upper bound on item IDs.
    :rtype: int
    """
    highest = self._df[InteractionMatrix.ITEM_IX].max()
    # ``max()`` of an empty/all-null column is NaN, and ``int(NaN)`` raises.
    if pd.isna(highest):
        return self.user_item_shape[1]
    return max(int(highest) + 1, self.user_item_shape[1])
709
+
710
@property
def max_known_user_id(self) -> int:
    """Exclusive upper bound on user IDs among fully known interactions.

    Only rows in which no column equals ``-1`` are considered. The result
    is the highest user ID of those rows plus one, capped at
    ``self.user_item_shape[0]``. When no such row exists,
    ``self.user_item_shape[0]`` is returned.

    :return: Upper bound on known user IDs.
    :rtype: int
    """
    row_is_known = (self._df != -1).all(axis=1)
    known_rows = self._df[row_is_known]
    highest = known_rows[InteractionMatrix.USER_IX].max()
    if not pd.isna(highest):
        return min(int(highest) + 1, self.user_item_shape[0])
    return self.user_item_shape[0]
717
+
718
@property
def max_known_item_id(self) -> int:
    """Exclusive upper bound on item IDs among fully known interactions.

    Only rows in which no column equals ``-1`` are considered. The result
    is the highest item ID of those rows plus one, capped at
    ``self.user_item_shape[1]``. When no such row exists,
    ``self.user_item_shape[1]`` is returned.

    :return: Upper bound on known item IDs.
    :rtype: int
    """
    row_is_known = (self._df != -1).all(axis=1)
    known_rows = self._df[row_is_known]
    highest = known_rows[InteractionMatrix.ITEM_IX].max()
    if not pd.isna(highest):
        return min(int(highest) + 1, self.user_item_shape[1])
    return self.user_item_shape[1]
725
+
726
@property
def max_user_id(self) -> int:
    """The highest known user ID in the interaction matrix.

    Entries equal to ``-1`` are ignored. When no other user ID is present
    (for example in an empty matrix), the result is ``-1``.

    :return: The highest user ID, or -1 if there is none.
    :rtype: int
    """
    user_column = self._df[InteractionMatrix.USER_IX]
    # Filter the column itself rather than masking the whole frame: frame
    # masking replaces -1 cells with NaN and upcasts the column to float,
    # which made this property return a float instead of an int.
    max_val = user_column[user_column != -1].max()
    if pd.isna(max_val):
        return -1
    return int(max_val)
737
+
738
@property
def max_item_id(self) -> int:
    """The highest known item ID in the interaction matrix.

    Entries equal to ``-1`` are ignored. In the case of an empty matrix,
    the highest item ID is -1. This is consistent with the definition that
    -1 denotes the item that is unknown. It would be incorrect to use any
    other value, since 0 is a valid item ID.

    :return: The highest item ID, or -1 if there is none.
    :rtype: int
    """
    item_column = self._df[InteractionMatrix.ITEM_IX]
    # Filter the column itself rather than masking the whole frame: frame
    # masking replaces -1 cells with NaN and upcasts the column to float,
    # which made this property return a float instead of an int.
    max_val = item_column[item_column != -1].max()
    if pd.isna(max_val):
        return -1
    return int(max_val)
754
+
755
@property
def timestamps(self) -> pd.Series:
    """Timestamps of interactions as a pandas Series, indexed by user ID and item ID.

    :raises TimestampAttributeMissingError: If timestamp column is missing.
    :return: Interactions with composite index on (user ID, item ID)
    :rtype: pd.Series
    """
    if not self.has_timestamps:
        raise TimestampAttributeMissingError()
    # The index levels are named after the user and item columns.
    index = pd.MultiIndex.from_frame(
        self._df[[InteractionMatrix.USER_IX, InteractionMatrix.ITEM_IX]]
    )
    timestamp_frame = pd.DataFrame(self._df[[InteractionMatrix.TIMESTAMP_IX]])
    return timestamp_frame.set_index(index)[InteractionMatrix.TIMESTAMP_IX]
771
+
772
@property
def latest_interaction_timestamps_matrix(self) -> csr_matrix:
    """A csr matrix containing the last interaction timestamp for each user, item pair.

    Repeated interactions between the same user and item are collapsed to
    a single cell holding the maximum timestamp of that pair.

    :raises TimestampAttributeMissingError: Propagated from ``self.timestamps``
        when the timestamp column is missing.
    :return: Sparse matrix of shape ``self.user_item_shape`` whose stored
        values are the latest timestamps.
    :rtype: csr_matrix
    """
    # Use the class constants instead of hard-coded column names; these are
    # the names ``self.timestamps`` gives to its index levels and series.
    user_col = InteractionMatrix.USER_IX
    item_col = InteractionMatrix.ITEM_IX
    ts_col = InteractionMatrix.TIMESTAMP_IX

    latest = self.timestamps.groupby([user_col, item_col]).max().reset_index()
    return csr_matrix(
        (latest[ts_col].values, (latest[user_col].values, latest[item_col].values)),
        shape=self.user_item_shape,
    )