recnexteval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,111 @@
1
+ import logging
2
+ from warnings import warn
3
+
4
+ import numpy as np
5
+
6
+ from recnexteval.matrix import InteractionMatrix, TimestampAttributeMissingError
7
+ from .base import Setting
8
+ from .splitters import (
9
+ NLastInteractionTimestampSplitter,
10
+ TimestampSplitter,
11
+ )
12
+
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class SingleTimePointSetting(Setting):
    """Single time point setting for data split.

    Splits an interaction dataset at a single timestamp into background
    (training) data and evaluation data. The evaluation data is further
    processed to produce unlabeled inputs and ground-truth targets for
    model evaluation.

    Args:
        background_t: Time point to split the data. The background
            split covers interactions with timestamps in `[0, background_t)`.
        n_seq_data: Number of last sequential interactions
            to provide as input for prediction. Defaults to `1`.
        top_K: Number of interactions per user to select for
            evaluation purposes. Defaults to `1`.
        t_upper: Upper bound on the timestamp of
            interactions included in evaluation. Defaults to the maximum
            32-bit integer value (acts like infinity).
        include_all_past_data: If True, include all past
            interactions when constructing input sequences. Defaults to False.
        seed: Random seed for reproducible behavior. Defaults to `42`.

    Raises:
        ValueError: If `t_upper` is smaller than `background_t`.
    """

    IS_BASE: bool = False

    def __init__(
        self,
        background_t: int,
        n_seq_data: int = 1,
        top_K: int = 1,
        t_upper: int = np.iinfo(np.int32).max,
        include_all_past_data: bool = False,
        seed: int = 42,
    ) -> None:
        super().__init__(seed=seed)

        # An evaluation horizon that ends before the split point would yield
        # empty evaluation data; mirror SlidingWindowSetting's validation.
        if t_upper < background_t:
            raise ValueError("t_upper must be greater than background_t")

        # Split point: interactions before `t` form the background set.
        self.t = background_t
        # Interactions after `t` (up to `t_upper`) form the ground truth.
        self.t_upper = t_upper
        self.n_seq_data = n_seq_data
        self.top_K = top_K

        logger.info("Splitting data at time %s with t_upper interval %s", background_t, t_upper)

        self._background_splitter = TimestampSplitter(
            t=background_t,
            t_lower=None,
            t_upper=t_upper,
        )
        self._splitter = NLastInteractionTimestampSplitter(
            t=background_t,
            t_upper=t_upper,
            n_seq_data=n_seq_data,
            include_all_past_data=include_all_past_data,
        )
        self._t_window = background_t

    def _split(self, data: InteractionMatrix) -> None:
        """Split the dataset by timestamp into background and evaluation sets.

        Populates `_background_data`, `_unlabeled_data` and
        `_ground_truth_data`. Warns if the chosen split time is before the
        earliest timestamp in the data; empty result sets are logged at
        INFO level rather than raised.

        Args:
            data: Interaction matrix to split. Must have timestamps.

        Raises:
            TimestampAttributeMissingError: If `data` has no timestamp attribute.
        """
        if not data.has_timestamps:
            raise TimestampAttributeMissingError()
        if data.min_timestamp > self.t:
            warn(
                f"Splitting at time {self.t} is before the first timestamp"
                " in the data. No data will be in the training set."
            )

        self._background_data, _ = self._background_splitter.split(data)
        past_interaction, future_interaction = self._splitter.split(data)
        self._unlabeled_data, self._ground_truth_data = self.prediction_data_processor.process(
            past_interaction=past_interaction,
            future_interaction=future_interaction,
            top_K=self.top_K,
        )

        # Empty splits are legal (e.g. a split point outside the data range),
        # so only log them for diagnostics instead of failing.
        if len(self._background_data) == 0:
            logger.info("Background data is empty after splitting at time %s", self.t)
        if len(self._unlabeled_data) == 0:
            logger.info("Unlabeled data is empty after splitting at time %s", self.t)
        if len(self._ground_truth_data) == 0:
            logger.info("Ground truth data is empty after splitting at time %s", self.t)

        logger.info("Finished splitting data at time %s", self.t)
@@ -0,0 +1,153 @@
1
+ import logging
2
+ from typing import Optional
3
+ from warnings import warn
4
+
5
+ import numpy as np
6
+ from tqdm import tqdm
7
+
8
+ from recnexteval.matrix import InteractionMatrix, TimestampAttributeMissingError
9
+ from .base import Setting
10
+ from .splitters import NLastInteractionTimestampSplitter, TimestampSplitter
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class SlidingWindowSetting(Setting):
    """Sliding window setting for splitting data.

    The data is split into a background set and an evaluation set. The evaluation set is defined
    by a window of :data:`window_size` seconds that slides over the data. It comprises the
    unlabeled data and ground truth data stored in lists. The unlabeled data contains the last
    :data:`n_seq_data` interactions of the users/items before each split point along with masked
    interactions after the split point; the number of interactions per user/item is limited to
    :data:`top_K`. The ground truth data contains the interactions after the split point and
    spans :data:`t_ground_truth_window` seconds.

    Core attribute
    ====================
    - :attr:`background_data`: Data used for training the model. Interval is `[0, background_t)`.
    - :attr:`unlabeled_data`: List of unlabeled data. Each element is a :class:`InteractionMatrix` object of interval `[0, t)`.
    - :attr:`ground_truth_data`: List of ground truth data. Each element is a :class:`InteractionMatrix` object
      of interval `[t, t + window_size)`.
    - :attr:`data_timestamp_limit`: List of timestamps that the splitter will slide over the data.
    - :attr:`incremental_data`: List of data that is used to incrementally update the model. Each element is
      a :class:`InteractionMatrix` object of interval `[t, t + window_size)`.

    :param background_t: Time point to split the data into background and evaluation data. Split will be from `[0, t)`
    :type background_t: int
    :param window_size: Size of the window in seconds to slide over the data.
        Affects the incremental data being released to the model. If
        `t_ground_truth_window` is not provided, ground truth data will also
        take this window.
    :type window_size: int, optional
    :param n_seq_data: Number of last sequential interactions to provide as
        data for model to make prediction. Defaults to 0.
    :type n_seq_data: int, optional
    :param top_K: Number of interactions per user that should be selected for evaluation purposes.
    :type top_K: int, optional
    :param t_upper: Upper bound on the timestamp of interactions.
        Defaults to maximal integer value (acting as infinity).
    :type t_upper: int, optional
    :param t_ground_truth_window: Size of the window in seconds to slide over the data for ground truth data.
        If not provided, defaults to window_size during computation.
    :type t_ground_truth_window: int, optional
    :param seed: Seed for random number generator.
    :type seed: int, optional
    :raises ValueError: If `t_upper` is smaller than `background_t`.
    """

    IS_BASE: bool = False

    def __init__(
        self,
        background_t: int,
        window_size: int = np.iinfo(np.int32).max,  # in seconds
        n_seq_data: int = 0,
        top_K: int = 10,
        t_upper: int = np.iinfo(np.int32).max,
        t_ground_truth_window: None | int = None,
        seed: int = 42,
    ) -> None:
        super().__init__(seed=seed)
        self._sliding_window_setting = True
        self.t = background_t
        # Seconds the split point advances per step; also the incremental
        # data window.
        self.window_size = window_size
        self.n_seq_data = n_seq_data
        self.top_K = top_K
        # Global upper timestamp bound; max int32 acts as infinity.
        self.t_upper = t_upper

        # Direct comparison instead of the previous `if t_upper and ...`
        # truthiness guard, which silently skipped validation for t_upper == 0.
        if t_upper < background_t:
            raise ValueError("t_upper must be greater than background_t")

        if t_ground_truth_window is None:
            t_ground_truth_window = window_size
        self.t_ground_truth_window = t_ground_truth_window

        self._background_splitter = TimestampSplitter(background_t, None, None)
        self._window_splitter = NLastInteractionTimestampSplitter(
            background_t,
            t_ground_truth_window,
            n_seq_data,
        )

    def _split(self, data: InteractionMatrix) -> None:
        """Slide a window over `data` to build all evaluation splits.

        Populates `_background_data` and the per-window lists
        `_unlabeled_data`, `_ground_truth_data`, `_incremental_data` and
        `_t_window`.

        Args:
            data: Interaction matrix to split. Must have timestamps.

        Raises:
            TimestampAttributeMissingError: If `data` has no timestamp attribute.
        """
        if not data.has_timestamps:
            raise TimestampAttributeMissingError()
        if data.min_timestamp > self.t:
            warn(
                f"Splitting at time {self.t} is before the first "
                "timestamp in the data. No data will be in the background(training) set."
            )
        # Applied unconditionally: t_upper defaults to max int32, so this is a
        # no-op unless the caller narrowed the evaluation horizon. (The old
        # `if self.t_upper:` truthiness guard skipped the filter for 0.)
        data = data.timestamps_lt(self.t_upper)

        self._background_data, _ = self._background_splitter.split(data)
        self._ground_truth_data, self._unlabeled_data, self._t_window, self._incremental_data = (
            [],
            [],
            [],
            [],
        )

        # sub_time is the split point that the window slides forward from.
        sub_time = self.t
        max_timestamp = data.max_timestamp

        pbar = tqdm(total=int((max_timestamp - sub_time) / self.window_size))
        while sub_time <= max_timestamp:
            self._t_window.append(sub_time)
            # The set used for evaluation always has timestamps greater than
            # the data released, so it remains unknown to the model.
            self._window_splitter.update_split_point(sub_time)
            past_interaction, future_interaction = self._window_splitter.split(data)

            # Empty windows are legal; log for diagnostics only.
            if len(past_interaction) == 0:
                logger.info(
                    "Split at time %s resulted in empty unlabelled testing samples.", sub_time
                )
            if len(future_interaction) == 0:
                logger.info("Split at time %s resulted in empty incremental data.", sub_time)

            unlabeled_set, ground_truth = self.prediction_data_processor.process(
                past_interaction=past_interaction,
                future_interaction=future_interaction,
                top_K=self.top_K,
            )
            self._unlabeled_data.append(unlabeled_set)
            self._ground_truth_data.append(ground_truth)

            self._incremental_data.append(future_interaction)

            sub_time += self.window_size
            pbar.update(1)
        pbar.close()

        self._num_split_set = len(self._unlabeled_data)
        logger.info(
            "Finished split with window size %s seconds. Number of splits: %s in total.",
            self.window_size,
            self._num_split_set,
        )
@@ -0,0 +1,14 @@
1
+
2
+
3
+ from .base import Splitter
4
+ from .n_last import NLastInteractionSplitter
5
+ from .n_last_timestamp import NLastInteractionTimestampSplitter
6
+ from .timestamp import TimestampSplitter
7
+
8
+
9
+ __all__ = [
10
+ "Splitter",
11
+ "TimestampSplitter",
12
+ "NLastInteractionTimestampSplitter",
13
+ "NLastInteractionSplitter",
14
+ ]
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from abc import ABC, abstractmethod
3
+
4
+ from recnexteval.matrix import InteractionMatrix
5
+
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class Splitter(ABC):
    """Abstract base class for dataset splitters.

    Implementations should split an :class:`InteractionMatrix` into two
    parts according to a splitting condition (for example, by timestamp).
    """

    def __init__(self) -> None:
        pass

    @property
    def name(self) -> str:
        """Return the class name of the splitter.

        Returns:
            The splitter class name.
        """
        return self.__class__.__name__

    @property
    def identifier(self) -> str:
        """Return a string identifier including the splitter's parameters.

        The identifier includes the class name and a comma-separated list of
        attribute name/value pairs from `self.__dict__`.

        Returns:
            Identifier string like `Name(k1=v1,k2=v2)`.
        """
        paramstring = ",".join(f"{k}={v}" for k, v in self.__dict__.items())
        return self.name + f"({paramstring})"

    @abstractmethod
    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
        """Split an interaction matrix into two parts.

        Args:
            data (InteractionMatrix): The interaction dataset to split.

        Returns:
            A pair of `InteractionMatrix` objects representing the two parts.

        Raises:
            NotImplementedError: If the concrete splitter does not implement this method.
        """
        # Message fixed: the abstract method is named `split`, not `_split`.
        raise NotImplementedError(f"{self.name} must implement the split method.")
@@ -0,0 +1,39 @@
1
+ import logging
2
+
3
+ from recnexteval.matrix import InteractionMatrix
4
+ from .base import Splitter
5
+
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class NLastInteractionSplitter(Splitter):
    """Assign each user's `n` most recent interactions to the second return
    value, and earlier interactions to the first.

    Args:
        n (int): Number of most recent actions per user assigned to the
            second return value.
        n_seq_data (int, optional): Number of last remaining interactions
            kept as unlabeled input for the model to make predictions.
            Defaults to 1.

    Raises:
        ValueError: If n is less than 1, since the ground truth data would
            then be empty.
    """

    def __init__(self, n: int, n_seq_data: int = 1) -> None:
        super().__init__()
        if n < 1:
            raise ValueError(
                f"n must be greater than 0, got {n}. "
                f"Values for n < 1 will cause the ground truth data to be empty."
            )
        self.n = n
        self.n_seq_data = n_seq_data

    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
        """Split `data` into a truncated history and a ground-truth part."""
        ground_truth = data.get_users_n_last_interaction(self.n)
        history = (data - ground_truth).get_users_n_last_interaction(self.n_seq_data)
        logger.debug(f"{self.identifier} has complete split")

        return history, ground_truth
@@ -0,0 +1,76 @@
1
+ import logging
2
+
3
+ from recnexteval.matrix import InteractionMatrix
4
+ from .timestamp import TimestampSplitter
5
+
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class NLastInteractionTimestampSplitter(TimestampSplitter):
    """Timestamp-based splitter that limits history to the n last interactions.

    Divides the data around a timestamp `t`: the historical part keeps the
    last `n_seq_data` interactions recorded before `t`, while the future part
    collects the interactions recorded at or after `t`.

    Attributes:
        past_interaction: Unlabeled data. Interval is `[0, t)`.
        future_interaction: Data used for training the model.
            Interval is `[t, t+t_upper)` or `[t,inf]`.
        n_seq_data: Number of last interactions to provide as data for model
            to make prediction. These interactions are past interactions from
            before the timestamp `t`.

    Args:
        t: Timestamp to split on in seconds since epoch.
        t_upper: Seconds past t. Upper bound on the timestamp
            of interactions. Defaults to None (infinity).
        n_seq_data: Number of last interactions to provide as data
            for model to make prediction. Defaults to 1.
        include_all_past_data: If True, include all past data in the
            past_interaction. Defaults to False.
    """

    def __init__(
        self,
        t: int,
        t_upper: None | int = None,
        n_seq_data: int = 1,
        include_all_past_data: bool = False,
    ) -> None:
        super().__init__(t=t, t_lower=None, t_upper=t_upper)
        self.n_seq_data = n_seq_data
        self.include_all_past_data = include_all_past_data

    def update_split_point(self, t: int) -> None:
        """Move the split point to a new timestamp `t`."""
        logger.debug(f"{self.identifier} - Updating split point to t={t}")
        self.t = t

    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
        """Split `data` around the configured timestamp.

        The returned pair satisfies:

        - past_interaction: unlabeled data, interval `[0, t)`.
        - future_interaction: data used for training the model,
          interval `[t, t+t_upper)` or `[t,inf]`.

        Args:
            data: Interaction matrix to be split. Must contain timestamps.

        Returns:
            A 2-tuple containing the `past_interaction` and `future_interaction` matrices.
        """
        split_t = self.t
        if self.t_upper is not None:
            # Bounded future window: [t, t + t_upper)
            future = data.timestamps_lt(split_t + self.t_upper).timestamps_gte(split_t)
        else:
            # Unbounded future window: [t, inf)
            future = data.timestamps_gte(split_t)

        if self.include_all_past_data:
            past = data.timestamps_lt(split_t)
        else:
            # Only the n_seq_data most recent interactions, restricted to the
            # users that actually appear in the future window.
            past = data.get_users_n_last_interaction(
                self.n_seq_data, split_t, future.user_ids
            )

        logger.debug(f"{self.identifier} has complete split")
        return past, future
@@ -0,0 +1,82 @@
1
+ import logging
2
+
3
+ from recnexteval.matrix import InteractionMatrix
4
+ from .base import Splitter
5
+
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class TimestampSplitter(Splitter):
    """Split an interaction dataset by timestamp.

    The splitter divides the data into two parts:

    1. Interactions with timestamps in the interval `[t - t_lower, t)`,
       representing past interactions.
    2. Interactions with timestamps in the interval `[t, t + t_upper)`,
       representing future interactions.

    When `t_lower` or `t_upper` is omitted, the corresponding interval is
    unbounded on that side (i.e. `[0, t)` and `[t, inf)` respectively).

    Note that the same user may appear in both returned parts.

    Args:
        t: Timestamp to split on, in seconds since the Unix epoch.
        t_lower: Seconds before `t` to include in
            the past interactions. If None, the interval is unbounded.
            Defaults to None.
        t_upper: Seconds after `t` to include in
            the future interactions. If None, the interval is unbounded.
            Defaults to None.
    """

    def __init__(
        self,
        t: int,
        t_lower: None | int = None,
        t_upper: None | int = None,
    ) -> None:
        super().__init__()
        self.t = t
        self.t_lower = t_lower
        self.t_upper = t_upper

    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
        """Split the interaction data by timestamp.

        Args:
            data: The interaction dataset to split.
                Must include timestamp information.

        Returns:
            A pair `(past_interaction, future_interaction)` holding the
            interactions before `t` and at/after `t` respectively.
        """
        # Past side: timestamp < t, optionally narrowed to t - t_lower <= timestamp.
        past = data.timestamps_lt(self.t)
        if self.t_lower is not None:
            past = past.timestamps_gte(self.t - self.t_lower)

        # Future side: timestamp >= t, optionally narrowed to timestamp < t + t_upper.
        future = data.timestamps_gte(self.t)
        if self.t_upper is not None:
            future = future.timestamps_lt(self.t + self.t_upper)

        logger.debug(f"{self.identifier} has complete split")

        return past, future
File without changes
@@ -0,0 +1,115 @@
1
+ """Utility module for RecNextEval library.
2
+
3
+ This module provides a set of general utility functions used throughout RecNextEval.
4
+ It includes utilities for file handling, configuration, matrix operations, and logging.
5
+
6
+ ## Utility Functions
7
+
8
+ General-purpose utility functions that support library operations:
9
+
10
+ - `create_config_yaml`: Create configuration YAML file
11
+ - `safe_dir`: Safely manage directory operations
12
+ - `add_columns_to_csr_matrix`: Add columns to sparse matrix
13
+ - `add_rows_to_csr_matrix`: Add rows to sparse matrix
14
+ - `arg_to_str`: Convert arguments to string representation
15
+ - `df_to_sparse`: Convert DataFrame to sparse matrix
16
+ - `to_binary`: Convert data to binary format
17
+ - `to_tuple`: Convert data to tuple format
18
+ - `ProgressBar`: Progress bar utility for tracking operations
19
+
20
+ ## Path Utilities
21
+
22
+ Directory and path management functions:
23
+
24
+ - `get_cache_dir`: Get cache directory path
25
+ - `get_data_dir`: Get data directory path
26
+ - `get_logs_dir`: Get logs directory path
27
+ - `get_repo_root`: Get repository root directory
28
+ - `safe_dir`: Safely create and manage directories
29
+
30
+ ## Logging Control
31
+
32
+ Functions to control logging level and warning suppression:
33
+
34
+ - `log_level`: Get current logging level
35
+ - `log_level_by_name`: Set logging level by name (DEBUG, INFO, WARNING, ERROR)
36
+ - `prepare_logger`: Initialize logger for RecNextEval
37
+ - `suppress_warnings`: Suppress all Python warnings
38
+ - `suppress_specific_warnings`: Suppress specific warning types
39
+
40
+ ## Logging Example
41
+
42
+ ```python
43
+ import logging
44
+ import warnings
45
+ import recnexteval
46
+
47
+ # Set log level to INFO
48
+ recnexteval.log_level_by_name("INFO")
49
+
50
+ # Suppress all warnings
51
+ recnexteval.suppress_warnings(suppress=True)
52
+
53
+ # Log information
54
+ logger = logging.getLogger("recnexteval")
55
+ logger.info("This is an informational message.")
56
+
57
+ # Warnings will be suppressed
58
+ warnings.warn("This warning will not appear.")
59
+ ```
60
+
61
+ ## Configuration
62
+
63
+ - `create_config_yaml`: Generate configuration YAML file for RecNextEval
64
+ """
65
+
66
+ from .logging_tools import (
67
+ log_level,
68
+ log_level_by_name,
69
+ prepare_logger,
70
+ suppress_specific_warnings,
71
+ suppress_warnings,
72
+ )
73
+ from .path import (
74
+ get_cache_dir,
75
+ get_data_dir,
76
+ get_logs_dir,
77
+ get_repo_root,
78
+ safe_dir,
79
+ )
80
+ from .util import (
81
+ ProgressBar,
82
+ add_columns_to_csr_matrix,
83
+ add_rows_to_csr_matrix,
84
+ arg_to_str,
85
+ df_to_sparse,
86
+ invert,
87
+ to_binary,
88
+ to_tuple,
89
+ )
90
+ from .uuid_util import generate_algorithm_uuid
91
+ from .yaml_tool import create_config_yaml
92
+
93
+
94
+ __all__ = [
95
+ "create_config_yaml",
96
+ "safe_dir",
97
+ "add_columns_to_csr_matrix",
98
+ "add_rows_to_csr_matrix",
99
+ "arg_to_str",
100
+ "df_to_sparse",
101
+ "prepare_logger",
102
+ "to_binary",
103
+ "to_tuple",
104
+ "ProgressBar",
105
+ "log_level",
106
+ "log_level_by_name",
107
+ "suppress_warnings",
108
+ "suppress_specific_warnings",
109
+ "get_cache_dir",
110
+ "get_data_dir",
111
+ "get_logs_dir",
112
+ "get_repo_root",
113
+ "invert",
114
+ "generate_algorithm_uuid",
115
+ ]