recnexteval-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0

recnexteval/settings/single_time_point_setting.py
@@ -0,0 +1,111 @@
+import logging
+from warnings import warn
+
+import numpy as np
+
+from recnexteval.matrix import InteractionMatrix, TimestampAttributeMissingError
+from .base import Setting
+from .splitters import (
+    NLastInteractionTimestampSplitter,
+    TimestampSplitter,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class SingleTimePointSetting(Setting):
+    """Single time point setting for data split.
+
+    Splits an interaction dataset at a single timestamp into background
+    (training) data and evaluation data. The evaluation data can be
+    further processed to produce unlabeled inputs and ground-truth
+    targets for model evaluation.
+
+    Args:
+        background_t: Time point to split the data. The background
+            split covers interactions with timestamps in `[0, background_t)`.
+        n_seq_data: Number of last sequential interactions
+            to provide as input for prediction. Defaults to `1`.
+        top_K: Number of interactions per user to select for
+            evaluation purposes. Defaults to `1`.
+        t_upper: Upper bound on the timestamp of
+            interactions included in evaluation. Defaults to the maximum
+            32-bit integer value (acts like infinity).
+        include_all_past_data: If True, include all past
+            interactions when constructing input sequences. Defaults to False.
+        seed: Random seed for reproducible behavior.
+            Defaults to `42`.
+    """
+    IS_BASE: bool = False
+
+    def __init__(
+        self,
+        background_t: int,
+        n_seq_data: int = 1,
+        top_K: int = 1,
+        t_upper: int = np.iinfo(np.int32).max,
+        include_all_past_data: bool = False,
+        seed: int = 42,
+    ):
+        super().__init__(seed=seed)
+        self.t = background_t
+        """Timestamp at which the data is split; interactions in `[0, t)` form `background_data`."""
+        self.t_upper = t_upper
+        """Seconds after `t` timestamp value to be used in `ground_truth_data`."""
+        self.n_seq_data = n_seq_data
+        self.top_K = top_K
+
+        logger.info("Splitting data at time %s with t_upper interval %s", background_t, t_upper)
+
+        self._background_splitter = TimestampSplitter(
+            t=background_t,
+            t_lower=None,
+            t_upper=t_upper,
+        )
+        self._splitter = NLastInteractionTimestampSplitter(
+            t=background_t,
+            t_upper=t_upper,
+            n_seq_data=n_seq_data,
+            include_all_past_data=include_all_past_data,
+        )
+        self._t_window = background_t
+
+    def _split(self, data: InteractionMatrix) -> None:
+        """Split the dataset by timestamp into background and evaluation sets.
+
+        The method raises :class:`TimestampAttributeMissingError` when the
+        provided :class:`InteractionMatrix` does not contain timestamp
+        information. It will warn if the chosen split time is before the
+        earliest timestamp in the data.
+
+        Args:
+            data: Interaction matrix to split. Must have timestamps.
+
+        Raises:
+            TimestampAttributeMissingError: If `data` has no timestamp attribute.
+        """
+        if not data.has_timestamps:
+            raise TimestampAttributeMissingError()
+        if data.min_timestamp > self.t:
+            warn(
+                f"Splitting at time {self.t} is before the first timestamp"
+                " in the data. No data will be in the training set."
+            )
+
+        self._background_data, _ = self._background_splitter.split(data)
+        past_interaction, future_interaction = self._splitter.split(data)
+        self._unlabeled_data, self._ground_truth_data = self.prediction_data_processor.process(
+            past_interaction=past_interaction,
+            future_interaction=future_interaction,
+            top_K=self.top_K,
+        )
+
+        if len(self._background_data) == 0:
+            logger.info("Background data is empty after splitting at time %s", self.t)
+        if len(self._unlabeled_data) == 0:
+            logger.info("Unlabeled data is empty after splitting at time %s", self.t)
+        if len(self._ground_truth_data) == 0:
+            logger.info("Ground truth data is empty after splitting at time %s", self.t)
+
+        logger.info("Finished splitting data at time %s", self.t)
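
For orientation, here is a minimal usage sketch of `SingleTimePointSetting`. It assumes that the base `Setting` class (not shown in this diff) exposes a public `split(data)` entry point delegating to the `_split` method above, and that the resulting splits are reachable through `background_data`, `unlabeled_data`, and `ground_truth_data` properties, by analogy with the attributes documented on `SlidingWindowSetting` below; `interactions` is a placeholder for an `InteractionMatrix` loaded elsewhere.

```python
from recnexteval.matrix import InteractionMatrix
from recnexteval.settings.single_time_point_setting import SingleTimePointSetting

# Placeholder: an InteractionMatrix with timestamps, e.g. built by one of the
# recnexteval.datasets loaders.
interactions: InteractionMatrix = ...

setting = SingleTimePointSetting(
    background_t=1_600_000_000,  # hypothetical split timestamp (seconds since epoch)
    n_seq_data=5,                # last 5 interactions per user as model input
    top_K=1,                     # one held-out interaction per user for evaluation
)
setting.split(interactions)  # assumed public wrapper around _split()

background = setting.background_data      # interactions in [0, background_t)
unlabeled = setting.unlabeled_data        # inputs handed to the model
ground_truth = setting.ground_truth_data  # held-out targets for the evaluators
```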

recnexteval/settings/sliding_window_setting.py
@@ -0,0 +1,153 @@
+import logging
+from typing import Optional
+from warnings import warn
+
+import numpy as np
+from tqdm import tqdm
+
+from recnexteval.matrix import InteractionMatrix, TimestampAttributeMissingError
+from .base import Setting
+from .splitters import NLastInteractionTimestampSplitter, TimestampSplitter
+
+
+logger = logging.getLogger(__name__)
+
+
+class SlidingWindowSetting(Setting):
+    """Sliding window setting for splitting data.
+
+    The data is split into a background set and an evaluation set. The evaluation set is defined by a sliding window
+    that moves over the data. The window size is defined by the :data:`window_size` parameter. The evaluation set consists of the
+    unlabeled data and ground truth data, stored in lists. The unlabeled data contains the last :data:`n_seq_data` interactions
+    of the users/items before the split point, along with masked interactions after the split point. The number of
+    interactions per user/item is limited to :data:`top_K`.
+    The ground truth data contains the interactions after the split point and spans :data:`window_size` seconds.
+
+    Core attributes
+    ====================
+    - :attr:`background_data`: Data used for training the model. Interval is `[0, background_t)`.
+    - :attr:`unlabeled_data`: List of unlabeled data. Each element is a :class:`InteractionMatrix` object of interval `[0, t)`.
+    - :attr:`ground_truth_data`: List of ground truth data. Each element is a :class:`InteractionMatrix` object
+        of interval `[t, t + window_size)`.
+    - :attr:`data_timestamp_limit`: List of split-point timestamps that the splitter slides over.
+    - :attr:`incremental_data`: List of data that is used to incrementally update the model. Each element is
+        a :class:`InteractionMatrix` object of interval `[t, t + window_size)`.
+
+    :param background_t: Time point to split the data into background and evaluation data. The background split covers `[0, t)`.
+    :type background_t: int
+    :param window_size: Size of the window in seconds to slide over the data.
+        Affects the incremental data being released to the model. If
+        `t_ground_truth_window` is not provided, the ground truth data will also
+        use this window size.
+    :type window_size: int, optional
+    :param n_seq_data: Number of last sequential interactions to provide as
+        input for the model to make predictions. Defaults to 0.
+    :type n_seq_data: int, optional
+    :param top_K: Number of interactions per user that should be selected for evaluation purposes.
+    :type top_K: int, optional
+    :param t_upper: Upper bound on the timestamp of interactions.
+        Defaults to maximal integer value (acting as infinity).
+    :type t_upper: int, optional
+    :param t_ground_truth_window: Size of the window in seconds to slide over the data for ground truth data.
+        If not provided, defaults to window_size during computation.
+    :type t_ground_truth_window: int, optional
+    :param seed: Seed for random number generator.
+    :type seed: int, optional
+    """
+
+    IS_BASE: bool = False
+
+    def __init__(
+        self,
+        background_t: int,
+        window_size: int = np.iinfo(np.int32).max,  # in seconds
+        n_seq_data: int = 0,
+        top_K: int = 10,
+        t_upper: int = np.iinfo(np.int32).max,
+        t_ground_truth_window: None | int = None,
+        seed: int = 42,
+    ) -> None:
+        super().__init__(seed=seed)
+        self._sliding_window_setting = True
+        self.t = background_t
+        self.window_size = window_size
+        """Window size in seconds for splitter to slide over the data."""
+        self.n_seq_data = n_seq_data
+        self.top_K = top_K
+        self.t_upper = t_upper
+        """Upper bound on the timestamp of interactions. Defaults to maximal integer value (acting as infinity)."""
+
+        if t_upper and t_upper < background_t:
+            raise ValueError("t_upper must be greater than background_t")
+
+        if t_ground_truth_window is None:
+            t_ground_truth_window = window_size
+
+        self.t_ground_truth_window = t_ground_truth_window
+
+        self._background_splitter = TimestampSplitter(background_t, None, None)
+        self._window_splitter = NLastInteractionTimestampSplitter(
+            background_t,
+            t_ground_truth_window,
+            n_seq_data,
+        )
+
+    def _split(self, data: InteractionMatrix) -> None:
+        if not data.has_timestamps:
+            raise TimestampAttributeMissingError()
+        if data.min_timestamp > self.t:
+            warn(
+                f"Splitting at time {self.t} is before the first "
+                "timestamp in the data. No data will be in the background (training) set."
+            )
+        if self.t_upper:
+            data = data.timestamps_lt(self.t_upper)
+
+        self._background_data, _ = self._background_splitter.split(data)
+        self._ground_truth_data, self._unlabeled_data, self._t_window, self._incremental_data = (
+            [],
+            [],
+            [],
+            [],
+        )
+
+        # sub_time is the current split point that the splitter slides over the data
+        sub_time = self.t
+        max_timestamp = data.max_timestamp
+
+        pbar = tqdm(total=int((max_timestamp - sub_time) / self.window_size))
+        while sub_time <= max_timestamp:
+            self._t_window.append(sub_time)
+            # the set used for evaluation always has timestamps greater than the
+            # data released so far, so that it is unknown to the model
+            self._window_splitter.update_split_point(sub_time)
+            past_interaction, future_interaction = self._window_splitter.split(data)
+
+            # if past_interaction or future_interaction is empty, log an info message
+            if len(past_interaction) == 0:
+                logger.info(
+                    "Split at time %s resulted in empty unlabeled testing samples.", sub_time
+                )
+            if len(future_interaction) == 0:
+                logger.info("Split at time %s resulted in empty incremental data.", sub_time)
+
+            unlabeled_set, ground_truth = self.prediction_data_processor.process(
+                past_interaction=past_interaction,
+                future_interaction=future_interaction,
+                top_K=self.top_K,
+            )
+            self._unlabeled_data.append(unlabeled_set)
+            self._ground_truth_data.append(ground_truth)
+
+            self._incremental_data.append(future_interaction)
+
+            sub_time += self.window_size
+            pbar.update(1)
+        pbar.close()
+
+        self._num_split_set = len(self._unlabeled_data)
+        logger.info(
+            "Finished split with window size %s seconds. Number of splits: %s in total.",
+            self.window_size,
+            self._num_split_set,
+        )
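
A corresponding usage sketch for `SlidingWindowSetting`, under the same assumption of a public `split(data)` wrapper around `_split`; the list-valued attributes iterated below (`data_timestamp_limit`, `unlabeled_data`, `ground_truth_data`, `incremental_data`) are the ones named in the class docstring, and `interactions` again stands in for an `InteractionMatrix` loaded elsewhere.

```python
from recnexteval.settings.sliding_window_setting import SlidingWindowSetting

interactions = ...  # placeholder: an InteractionMatrix with timestamps

setting = SlidingWindowSetting(
    background_t=1_600_000_000,  # hypothetical initial split point
    window_size=7 * 24 * 3600,   # advance the split point one week at a time
    n_seq_data=10,
    top_K=10,
)
setting.split(interactions)  # assumed public wrapper around _split()

# One evaluation round per window, mirroring the while-loop in _split above.
for t, unlabeled, ground_truth, increment in zip(
    setting.data_timestamp_limit,  # split points visited by the sliding window
    setting.unlabeled_data,        # model inputs at each split point
    setting.ground_truth_data,     # targets in [t, t + t_ground_truth_window)
    setting.incremental_data,      # data released to incrementally update the model
):
    print(t, len(unlabeled), len(ground_truth), len(increment))
```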

recnexteval/settings/splitters/__init__.py
@@ -0,0 +1,14 @@
+
+
+from .base import Splitter
+from .n_last import NLastInteractionSplitter
+from .n_last_timestamp import NLastInteractionTimestampSplitter
+from .timestamp import TimestampSplitter
+
+
+__all__ = [
+    "Splitter",
+    "TimestampSplitter",
+    "NLastInteractionTimestampSplitter",
+    "NLastInteractionSplitter",
+]

recnexteval/settings/splitters/base.py
@@ -0,0 +1,57 @@
+import logging
+from abc import ABC, abstractmethod
+
+from recnexteval.matrix import InteractionMatrix
+
+
+logger = logging.getLogger(__name__)
+
+
+class Splitter(ABC):
+    """Abstract base class for dataset splitters.
+
+    Implementations should split an :class:`InteractionMatrix` into two
+    parts according to a splitting condition (for example, by timestamp).
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    @property
+    def name(self) -> str:
+        """Return the class name of the splitter.
+
+        Returns:
+            The splitter class name.
+        """
+        return self.__class__.__name__
+
+    @property
+    def identifier(self) -> str:
+        """Return a string identifier including the splitter's parameters.
+
+        The identifier includes the class name and a comma-separated list of
+        attribute name/value pairs from `self.__dict__`.
+
+        Returns:
+            Identifier string like `Name(k1=v1,k2=v2)`.
+        """
+
+        paramstring = ",".join((f"{k}={v}" for k, v in self.__dict__.items()))
+        return self.name + f"({paramstring})"
+
+    @abstractmethod
+    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
+        """Split an interaction matrix into two parts.
+
+        Args:
+            data (InteractionMatrix): The interaction dataset to split.
+
+        Returns:
+            A pair of `InteractionMatrix` objects representing the two parts.
+
+        Raises:
+            NotImplementedError: If the concrete splitter does not implement this method.
+        """
+
+        raise NotImplementedError(f"{self.name} must implement the split method.")
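
To illustrate the contract, here is a hypothetical subclass that implements `split` using the `timestamps_lt`/`timestamps_gte` helpers that the built-in splitters in this diff call on `InteractionMatrix`; it is essentially a stripped-down `TimestampSplitter` and is shown only as a sketch.

```python
from recnexteval.matrix import InteractionMatrix
from recnexteval.settings.splitters import Splitter


class FixedTimestampSplitter(Splitter):
    """Hypothetical example: everything before `t` vs. everything from `t` onwards."""

    def __init__(self, t: int) -> None:
        super().__init__()
        self.t = t

    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
        # timestamps_lt / timestamps_gte are the same filtering helpers the
        # built-in splitters below rely on.
        return data.timestamps_lt(self.t), data.timestamps_gte(self.t)


splitter = FixedTimestampSplitter(t=1_600_000_000)
print(splitter.identifier)  # e.g. "FixedTimestampSplitter(t=1600000000)"
```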

recnexteval/settings/splitters/n_last.py
@@ -0,0 +1,39 @@
+import logging
+
+from recnexteval.matrix import InteractionMatrix
+from .base import Splitter
+
+
+logger = logging.getLogger(__name__)
+
+
+class NLastInteractionSplitter(Splitter):
+    """Splits the n most recent interactions of a user into the second return value,
+    and earlier interactions into the first.
+
+    Args:
+        n (int): Number of most recent interactions to assign to the second return value.
+        n_seq_data (int, optional): Number of last interactions to provide as unlabeled data
+            for the model to make predictions. Defaults to 1.
+
+    Raises:
+        ValueError: If n is less than 1, as this would cause the ground truth data to be empty.
+    """
+
+    def __init__(self, n: int, n_seq_data: int = 1) -> None:
+        super().__init__()
+        if n < 1:
+            raise ValueError(
+                f"n must be greater than 0, got {n}. "
+                "Values for n < 1 will cause the ground truth data to be empty."
+            )
+        self.n = n
+        self.n_seq_data = n_seq_data
+
+    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
+        future_interaction = data.get_users_n_last_interaction(self.n)
+        past_interaction = data - future_interaction
+        past_interaction = past_interaction.get_users_n_last_interaction(self.n_seq_data)
+        logger.debug(f"{self.identifier} has completed the split")
+
+        return past_interaction, future_interaction
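
A small leave-n-out style sketch using `NLastInteractionSplitter` as defined above; `interactions` is a placeholder for an `InteractionMatrix` loaded elsewhere.

```python
from recnexteval.settings.splitters import NLastInteractionSplitter

interactions = ...  # placeholder: an InteractionMatrix loaded elsewhere

# Hold out each user's single most recent interaction as the second return
# value and keep their 5 preceding interactions as the input sequence.
splitter = NLastInteractionSplitter(n=1, n_seq_data=5)
past_interaction, future_interaction = splitter.split(interactions)
```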

recnexteval/settings/splitters/n_last_timestamp.py
@@ -0,0 +1,76 @@
+import logging
+
+from recnexteval.matrix import InteractionMatrix
+from .timestamp import TimestampSplitter
+
+
+logger = logging.getLogger(__name__)
+
+
+class NLastInteractionTimestampSplitter(TimestampSplitter):
+    """Splits with the n last interactions based on a timestamp.
+
+    Splits the data into unlabeled and ground truth data based on a timestamp.
+    The historical data contains the last `n_seq_data` interactions before the timestamp `t`,
+    and the future data contains the interactions after the timestamp `t`.
+
+
+    Attributes:
+        past_interaction: Unlabeled input data. Interval is `[0, t)`.
+        future_interaction: Data used for training the model.
+            Interval is `[t, t + t_upper)` or `[t, inf)`.
+        n_seq_data: Number of last interactions to provide as input for the model to make predictions.
+            These interactions are past interactions from before the timestamp `t`.
+
+    Args:
+        t: Timestamp to split on, in seconds since the epoch.
+        t_upper: Seconds past `t`; upper bound on the timestamp
+            of interactions. Defaults to None (infinity).
+        n_seq_data: Number of last interactions to provide as input
+            for the model to make predictions. Defaults to 1.
+        include_all_past_data: If True, include all past data in the past_interaction.
+            Defaults to False.
+    """
+
+    def __init__(
+        self,
+        t: int,
+        t_upper: None | int = None,
+        n_seq_data: int = 1,
+        include_all_past_data: bool = False,
+    ) -> None:
+        super().__init__(t=t, t_lower=None, t_upper=t_upper)
+        self.n_seq_data = n_seq_data
+        self.include_all_past_data = include_all_past_data
+
+    def update_split_point(self, t: int) -> None:
+        logger.debug(f"{self.identifier} - Updating split point to t={t}")
+        self.t = t
+
+    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
+        """Splits data such that the following definitions hold:
+
+        - past_interaction: Unlabeled input data. Interval is `[0, t)`.
+        - future_interaction: Data used for training the model.
+            Interval is `[t, t + t_upper)` or `[t, inf)`.
+
+        Args:
+            data: Interaction matrix to be split. Must contain timestamps.
+
+        Returns:
+            A 2-tuple containing the `past_interaction` and `future_interaction` matrices.
+        """
+        if self.t_upper is None:
+            future_interaction = data.timestamps_gte(self.t)
+        else:
+            future_interaction = data.timestamps_lt(self.t + self.t_upper).timestamps_gte(self.t)
+
+        if self.include_all_past_data:
+            past_interaction = data.timestamps_lt(self.t)
+        else:
+            past_interaction = data.get_users_n_last_interaction(
+                self.n_seq_data, self.t, future_interaction.user_ids
+            )
+
+        logger.debug(f"{self.identifier} has completed the split")
+        return past_interaction, future_interaction
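
A sketch of how `NLastInteractionTimestampSplitter` can be reused across several split points via `update_split_point`, mirroring the loop in `SlidingWindowSetting._split` above; the concrete timestamps are hypothetical and `interactions` stands in for an `InteractionMatrix` with timestamps.

```python
from recnexteval.settings.splitters import NLastInteractionTimestampSplitter

interactions = ...  # placeholder: an InteractionMatrix with timestamps

splitter = NLastInteractionTimestampSplitter(
    t=1_600_000_000,    # hypothetical initial split point
    t_upper=24 * 3600,  # future window of one day past t
    n_seq_data=10,      # last 10 interactions per user before t
)

# Re-use the same splitter at successive split points, one day apart.
for t in (1_600_000_000, 1_600_086_400, 1_600_172_800):
    splitter.update_split_point(t)
    past_interaction, future_interaction = splitter.split(interactions)
```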

recnexteval/settings/splitters/timestamp.py
@@ -0,0 +1,82 @@
+import logging
+
+from recnexteval.matrix import InteractionMatrix
+from .base import Splitter
+
+
+logger = logging.getLogger(__name__)
+
+
+class TimestampSplitter(Splitter):
+    """Split an interaction dataset by timestamp.
+
+    The splitter divides the data into two parts:
+
+    1. Interactions with timestamps in the interval `[t - t_lower, t)`,
+       representing past interactions.
+    2. Interactions with timestamps in the interval `[t, t + t_upper)`,
+       representing future interactions.
+
+    If `t_lower` or `t_upper` are not provided, they default to infinity,
+    meaning the corresponding interval is unbounded on that side.
+
+    Note that a user can appear in both the past and future interaction sets.
+
+    Attributes:
+        past_interaction (InteractionMatrix): Interactions in the interval
+            `[0, t)`, representing unlabeled data for prediction.
+        future_interaction (InteractionMatrix): Interactions in the interval
+            `[t, t + t_upper)` or `[t, inf)`, used for training the model.
+
+    Args:
+        t: Timestamp to split on, in seconds since the Unix epoch.
+        t_lower: Seconds before `t` to include in
+            the past interactions. If None, the interval is unbounded.
+            Defaults to None.
+        t_upper: Seconds after `t` to include in
+            the future interactions. If None, the interval is unbounded.
+            Defaults to None.
+    """
+
+    def __init__(
+        self,
+        t: int,
+        t_lower: None | int = None,
+        t_upper: None | int = None,
+    ) -> None:
+        super().__init__()
+        self.t = t
+        self.t_lower = t_lower
+        self.t_upper = t_upper
+
+    def split(self, data: InteractionMatrix) -> tuple[InteractionMatrix, InteractionMatrix]:
+        """Split the interaction data by timestamp.
+
+        The method returns the `past_interaction` and `future_interaction`
+        subsets of the input data.
+
+        Args:
+            data: The interaction dataset to split.
+                Must include timestamp information.
+
+        Returns:
+            A pair containing the past interactions and future interactions.
+        """
+
+        if self.t_lower is None:
+            # timestamp < t
+            past_interaction = data.timestamps_lt(self.t)
+        else:
+            # t - t_lower <= timestamp < t
+            past_interaction = data.timestamps_lt(self.t).timestamps_gte(self.t - self.t_lower)
+
+        if self.t_upper is None:
+            # timestamp >= t
+            future_interaction = data.timestamps_gte(self.t)
+        else:
+            # t <= timestamp < t + t_upper
+            future_interaction = data.timestamps_gte(self.t).timestamps_lt(self.t + self.t_upper)
+
+        logger.debug(f"{self.identifier} has completed the split")
+
+        return past_interaction, future_interaction
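
A brief interval sketch for `TimestampSplitter`, with hypothetical window sizes; `interactions` is a placeholder for an `InteractionMatrix` with timestamps.

```python
from recnexteval.settings.splitters import TimestampSplitter

interactions = ...  # placeholder: an InteractionMatrix with timestamps

t = 1_600_000_000  # hypothetical split timestamp (seconds since epoch)
splitter = TimestampSplitter(t=t, t_lower=30 * 24 * 3600, t_upper=7 * 24 * 3600)

# past: interactions in [t - 30 days, t); future: interactions in [t, t + 7 days)
past_interaction, future_interaction = splitter.split(interactions)
```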

recnexteval/settings/util.py
File without changes

recnexteval/utils/__init__.py
@@ -0,0 +1,115 @@
+"""Utility module for RecNextEval library.
+
+This module provides a set of general utility functions used throughout RecNextEval.
+It includes utilities for file handling, configuration, matrix operations, and logging.
+
+## Utility Functions
+
+General-purpose utility functions that support library operations:
+
+- `create_config_yaml`: Create configuration YAML file
+- `safe_dir`: Safely manage directory operations
+- `add_columns_to_csr_matrix`: Add columns to sparse matrix
+- `add_rows_to_csr_matrix`: Add rows to sparse matrix
+- `arg_to_str`: Convert arguments to string representation
+- `df_to_sparse`: Convert DataFrame to sparse matrix
+- `to_binary`: Convert data to binary format
+- `to_tuple`: Convert data to tuple format
+- `ProgressBar`: Progress bar utility for tracking operations
+
+## Path Utilities
+
+Directory and path management functions:
+
+- `get_cache_dir`: Get cache directory path
+- `get_data_dir`: Get data directory path
+- `get_logs_dir`: Get logs directory path
+- `get_repo_root`: Get repository root directory
+- `safe_dir`: Safely create and manage directories
+
+## Logging Control
+
+Functions to control logging level and warning suppression:
+
+- `log_level`: Get current logging level
+- `log_level_by_name`: Set logging level by name (DEBUG, INFO, WARNING, ERROR)
+- `prepare_logger`: Initialize logger for RecNextEval
+- `suppress_warnings`: Suppress all Python warnings
+- `suppress_specific_warnings`: Suppress specific warning types
+
+## Logging Example
+
+```python
+import logging
+import warnings
+import recnexteval
+
+# Set log level to INFO
+recnexteval.log_level_by_name("INFO")
+
+# Suppress all warnings
+recnexteval.suppress_warnings(suppress=True)
+
+# Log information
+logger = logging.getLogger("recnexteval")
+logger.info("This is an informational message.")
+
+# Warnings will be suppressed
+warnings.warn("This warning will not appear.")
+```
+
+## Configuration
+
+- `create_config_yaml`: Generate configuration YAML file for RecNextEval
+"""
+
+from .logging_tools import (
+    log_level,
+    log_level_by_name,
+    prepare_logger,
+    suppress_specific_warnings,
+    suppress_warnings,
+)
+from .path import (
+    get_cache_dir,
+    get_data_dir,
+    get_logs_dir,
+    get_repo_root,
+    safe_dir,
+)
+from .util import (
+    ProgressBar,
+    add_columns_to_csr_matrix,
+    add_rows_to_csr_matrix,
+    arg_to_str,
+    df_to_sparse,
+    invert,
+    to_binary,
+    to_tuple,
+)
+from .uuid_util import generate_algorithm_uuid
+from .yaml_tool import create_config_yaml
+
+
+__all__ = [
+    "create_config_yaml",
+    "safe_dir",
+    "add_columns_to_csr_matrix",
+    "add_rows_to_csr_matrix",
+    "arg_to_str",
+    "df_to_sparse",
+    "prepare_logger",
+    "to_binary",
+    "to_tuple",
+    "ProgressBar",
+    "log_level",
+    "log_level_by_name",
+    "suppress_warnings",
+    "suppress_specific_warnings",
+    "get_cache_dir",
+    "get_data_dir",
+    "get_logs_dir",
+    "get_repo_root",
+    "invert",
+    "generate_algorithm_uuid",
+]