recnexteval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,151 @@
1
+ import logging
2
+ from typing import ClassVar
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from ..config import (
8
+ AmazonBookDatasetConfig,
9
+ AmazonDatasetConfig,
10
+ AmazonMovieDatasetConfig,
11
+ AmazonMusicDatasetConfig,
12
+ AmazonSubscriptionBoxesDatasetConfig,
13
+ )
14
+ from ..metadata.amazon import (
15
+ AmazonBookItemMetadata,
16
+ AmazonMovieItemMetadata,
17
+ AmazonMusicItemMetadata,
18
+ AmazonSubscriptionBoxesItemMetadata,
19
+ )
20
+ from .base import Dataset
21
+
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class AmazonDataset(Dataset):
    """Base class for Amazon datasets.

    Other Amazon datasets should inherit from this class, supplying their own
    ``config`` and ``_fetch_dataset_metadata`` implementation.
    """

    ITEM_METADATA = None
    IS_BASE: bool = True

    config: ClassVar[AmazonDatasetConfig] = AmazonDatasetConfig()

    def _download_dataset(self) -> None:
        """Download the raw dataset file.

        Fetches ``config.dataset_url`` and stores the result at ``self.file_path``.

        Raises:
            ValueError: If the config does not specify a dataset URL.
        """
        if not self.config.dataset_url:
            raise ValueError(f"{self.name} does not have URL specified in config.")

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")
        self._fetch_remote(
            self.config.dataset_url,
            self.file_path,
        )

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        The JSON-lines file at ``self.file_path`` is read in chunks, reduced to
        the item, user, timestamp, rating and helpful-vote columns named in
        ``config``, and the timestamp column is floor-divided by 1e9.
        User and item ids are read as strings.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        # Read JSONL in chunks and show progress per chunk. We import tqdm
        # locally to avoid global pandas monkeypatching (`tqdm.pandas()`).
        from tqdm.auto import tqdm

        chunksize = 100_000
        # The chunked reader is a context manager; using `with` guarantees the
        # underlying file handle is released even if reading is interrupted.
        with pd.read_json(
            self.file_path,
            dtype={
                self.config.item_ix: str,
                self.config.user_ix: str,
                self.config.timestamp_ix: np.int64,
                self.config.rating_ix: np.float32,
                self.config.helpful_vote_ix: np.int64,
            },
            lines=True,
            chunksize=chunksize,
        ) as chunks:
            df = pd.concat(
                list(tqdm(chunks, desc="Reading JSONL", unit="chunk")),
                ignore_index=True,
            )

        df = df[
            [
                self.config.item_ix,
                self.config.user_ix,
                self.config.timestamp_ix,
                self.config.rating_ix,
                self.config.helpful_vote_ix,
            ]
        ]

        # Convert nanosecond timestamps to seconds
        # NOTE(review): assumes the parsed timestamp column is in nanoseconds — confirm.
        df[self.config.timestamp_ix] = df[self.config.timestamp_ix] // 1_000_000_000

        logger.debug(f"Loaded {len(df)} interactions")
        return df
98
+
99
+
100
class AmazonMusicDataset(AmazonDataset):
    """Amazon Music variant of :class:`AmazonDataset`."""

    IS_BASE: bool = False

    config: ClassVar[AmazonMusicDatasetConfig] = AmazonMusicDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        ``user_id_mapping`` is part of the shared signature but is not used here.
        """
        metadata_loader = AmazonMusicItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_loader.load()
111
+
112
+
113
class AmazonMovieDataset(AmazonDataset):
    """Amazon Movie variant of :class:`AmazonDataset`."""

    IS_BASE: bool = False

    config: ClassVar[AmazonMovieDatasetConfig] = AmazonMovieDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        ``user_id_mapping`` is part of the shared signature but is not used here.
        """
        metadata_loader = AmazonMovieItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_loader.load()
124
+
125
+
126
class AmazonSubscriptionBoxesDataset(AmazonDataset):
    """Amazon Subscription Boxes variant of :class:`AmazonDataset`."""

    IS_BASE: bool = False

    config: ClassVar[AmazonSubscriptionBoxesDatasetConfig] = AmazonSubscriptionBoxesDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        ``user_id_mapping`` is part of the shared signature but is not used here.
        """
        metadata_loader = AmazonSubscriptionBoxesItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_loader.load()
139
+
140
+
141
class AmazonBookDataset(AmazonDataset):
    """Amazon Book variant of :class:`AmazonDataset`."""

    IS_BASE: bool = False

    config: ClassVar[AmazonBookDatasetConfig] = AmazonBookDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        ``user_id_mapping`` is part of the shared signature but is not used here.
        """
        metadata_loader = AmazonBookItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_loader.load()
@@ -0,0 +1,250 @@
1
+ import logging
2
+ import os
3
+ import time
4
+ from abc import abstractmethod
5
+ from datetime import datetime
6
+ from typing import ClassVar
7
+
8
+ import pandas as pd
9
+
10
+ from recnexteval.matrix import InteractionMatrix
11
+ from recnexteval.preprocessing.filter import Filter, MinItemsPerUser, MinUsersPerItem
12
+ from recnexteval.preprocessing.preprocessor import DataFramePreprocessor
13
+ from recnexteval.utils.path import safe_dir
14
+ from ..base import DataFetcher
15
+ from ..config import DatasetConfig
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class Dataset(DataFetcher):
    """Represents a collaborative filtering dataset.

    Dataset must minimally contain user, item and timestamp columns for the
    other modules to work. The column names, the default filename and the base
    data directory are all read from the class-level ``config``.

    Assumption
    ===========
    User/item ID increments in the order of time. This is an assumption that will
    be made for the purposes of splitting the dataset and eventually passing
    the dataset to the model. The ID incrementing in the order of time allows us
    to set the shape of the currently known user and item matrix allowing easier
    manipulation of the data by the evaluator.

    :param use_default_filters: If True, the default filters are registered on the
        preprocessor and applied when the dataset is loaded. Defaults to False.
    :type use_default_filters: bool, optional
    :param fetch_metadata: If True, :meth:`load` additionally calls
        :meth:`_fetch_dataset_metadata` with the preprocessor's id mappings.
        Defaults to False.
    :type fetch_metadata: bool, optional
    :raises AttributeError: If ``config`` lacks ``user_ix``, ``item_ix`` or ``timestamp_ix``.
    :raises ValueError: If ``config.default_filename`` is not set.
    """

    config: ClassVar[DatasetConfig] = DatasetConfig()
    """Configuration for the dataset."""

    def __init__(
        self,
        use_default_filters: bool = False,  # noqa: FBT001, FBT002
        fetch_metadata: bool = False,  # noqa: FBT001, FBT002
    ) -> None:
        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
            raise AttributeError("user_ix, item_ix or timestamp_ix not set in config.")

        logger.debug(
            f"{self.name} being initialized with '{self.config.default_base_path}' as the base path."
        )

        if not self.config.default_filename:
            raise ValueError("No filename specified, and no default known.")

        self.fetch_metadata = fetch_metadata
        self.preprocessor = DataFramePreprocessor(
            self.config.item_ix, self.config.user_ix, self.config.timestamp_ix
        )
        # Populated by load(); the public properties raise until then.
        self._timestamp_min: int | None = None
        self._timestamp_max: int | None = None

        if use_default_filters:
            for f in self._default_filters:
                self.add_filter(f)

        safe_dir(self.config.default_base_path)
        logger.debug(f"{self.name} is initialized.")

    @property
    def _default_filters(self) -> list[Filter]:
        """The default filters for all datasets

        Concrete classes can override this property to add more filters.

        Returns:
            List of filters to be applied to the dataset.

        Raises:
            AttributeError: If ``config.user_ix`` or ``config.item_ix`` is not set.
        """
        if not self.config.user_ix or not self.config.item_ix:
            raise AttributeError("config.user_ix or config.item_ix not set.")

        filters: list[Filter] = []
        filters.append(
            MinItemsPerUser(
                min_items_per_user=3,
                item_ix=self.config.item_ix,
                user_ix=self.config.user_ix,
            )
        )
        filters.append(
            MinUsersPerItem(
                min_users_per_item=3,
                item_ix=self.config.item_ix,
                user_ix=self.config.user_ix,
            )
        )
        return filters

    @property
    def timestamp_min(self) -> int:
        """Minimum timestamp in the dataset.

        Returns:
            Minimum timestamp in the dataset.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if self._timestamp_min is None:
            raise RuntimeError("timestamp_min can only be accessed after load() has been called.")
        return self._timestamp_min

    @property
    def timestamp_max(self) -> int:
        """Maximum timestamp in the dataset.

        Returns:
            Maximum timestamp in the dataset.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if self._timestamp_max is None:
            raise RuntimeError("timestamp_max can only be accessed after load() has been called.")
        return self._timestamp_max

    def get_timestamp_range_in_epoch(self) -> tuple[int, int]:
        """Get the minimum and maximum timestamps in the dataset.

        Returns:
            A tuple of (min_timestamp, max_timestamp).

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        return self.timestamp_min, self.timestamp_max

    def get_timestamp_range_in_datetime(self) -> tuple[datetime, datetime]:
        """Get the minimum and maximum timestamps in the dataset as datetimes.

        The values are converted with :meth:`datetime.fromtimestamp` without a
        ``tz`` argument, so the results are naive datetimes in local time.

        Returns:
            A tuple of (min_timestamp, max_timestamp).

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        min_dt = datetime.fromtimestamp(self.timestamp_min)
        max_dt = datetime.fromtimestamp(self.timestamp_max)
        return min_dt, max_dt

    def add_filter(self, filter_: Filter) -> None:
        """Add a filter to be applied when loading the data.

        Utilize :class:`DataFramePreprocessor` class to add filters to the
        dataset to load. The filter will be applied when the data is loaded into
        an :class:`InteractionMatrix` object when :meth:`load` is called.

        :param filter_: Filter to be applied to the loaded DataFrame
            processing to interaction matrix.
        :type filter_: Filter
        """
        self.preprocessor.add_filter(filter_)

    def _load_dataframe_from_cache(self) -> pd.DataFrame:
        """Read the cached parquet file at ``self.processed_cache_path``.

        Returns:
            The cached DataFrame.

        Raises:
            FileNotFoundError: If the cache file does not exist.
        """
        if not os.path.exists(self.processed_cache_path):
            raise FileNotFoundError("Processed cache file not found.")
        logger.info(f"Loading from cache: {self.processed_cache_path}")
        df = pd.read_parquet(self.processed_cache_path)
        return df

    def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
        """Loads data into an InteractionMatrix object.

        Data is loaded into a DataFrame using the :func:`_load_dataframe` function,
        or from the processed cache when ``use_cache`` is True and the cache exists.
        Resulting DataFrame is parsed into an :class:`InteractionMatrix` object. If
        :data:`apply_filters` is set to True, the filters set will be applied to the
        dataset and mapping of user and item ids will be done. This is advised
        even if there is no filter set, as it will ensure that the user and item
        ids are incrementing in the order of time.

        ``timestamp_min`` and ``timestamp_max`` are computed from the loaded
        DataFrame, and metadata is fetched when ``fetch_metadata`` was requested
        at construction time.

        Args:
            apply_filters: To apply the filters set and preprocessing,
                defaults to True
            use_cache: Whether to use cached processed data, defaults to True

        Returns:
            Resulting interaction matrix.
        """
        logger.info(f"{self.name} is loading dataset...")
        start = time.time()
        if use_cache:
            # Only a missing cache triggers the fallback; a FileNotFoundError from
            # the raw loader must propagate instead of being mislabelled and retried.
            try:
                df = self._load_dataframe_from_cache()
            except FileNotFoundError:
                logger.warning("Processed cache not found, loading raw dataframe.")
                df = self._load_dataframe()
                self._cache_processed_dataframe(df)
        else:
            df = self._load_dataframe()

        if apply_filters:
            logger.debug(f"{self.name} applying filters set.")
            im = self.preprocessor.process(df)
        else:
            im = self._dataframe_to_matrix(df)
            logger.warning(
                "No filters applied, user and item ids may not be incrementing in the order of time. "
                "Classes that use this dataset may not work as expected."
            )
        self._timestamp_min = int(df[self.config.timestamp_ix].min())
        self._timestamp_max = int(df[self.config.timestamp_ix].max())

        if self.fetch_metadata:
            user_id_mapping, item_id_mapping = (
                self.preprocessor.user_id_mapping,
                self.preprocessor.item_id_mapping,
            )
            self._fetch_dataset_metadata(
                user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping
            )

        end = time.time()
        # Fixed-point format: a bare ".3" switches to scientific notation for
        # durations of 100 s or more (e.g. "1.23e+02s").
        logger.info(f"{self.name} dataset loaded - Took {end - start:.3f}s")
        return im

    def _dataframe_to_matrix(self, df: pd.DataFrame) -> InteractionMatrix:
        """Converts a DataFrame to an InteractionMatrix.

        Args:
            df: DataFrame to convert

        Returns:
            InteractionMatrix object.

        Raises:
            AttributeError: If any of the user, item or timestamp column names
                is not set in ``config``.
        """
        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
            raise AttributeError("config.user_ix, config.item_ix or config.timestamp_ix not set.")
        return InteractionMatrix(
            df,
            user_ix=self.config.user_ix,
            item_ix=self.config.item_ix,
            timestamp_ix=self.config.timestamp_ix,
        )

    @abstractmethod
    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Fetch metadata for the dataset.

        Fetch metadata for the dataset, if available.

        Args:
            user_id_mapping: User id mapping taken from the preprocessor.
            item_id_mapping: Item id mapping taken from the preprocessor.
        """
        raise NotImplementedError("Needs to be implemented")
@@ -0,0 +1,121 @@
1
+ import logging
2
+ import os
3
+ import zipfile
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from tqdm import tqdm
8
+ from typing_extensions import ClassVar
9
+
10
+ from ..config import LastFMDatasetConfig
11
+ from ..metadata.lastfm import LastFMItemMetadata, LastFMTagMetadata, LastFMUserMetadata
12
+ from .base import Dataset
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+ tqdm.pandas()
17
+
18
+
19
class LastFMDataset(Dataset):
    """
    Last FM dataset.

    The Last FM dataset contains user interactions with artists. The tags in this
    dataset are not used in this implementation. The file that will be used
    is the user_taggedartists-timestamps.dat file, which contains
    the following columns: [user, artist, tags, timestamp].

    The dataset is downloaded from the GroupLens website :cite:`Cantador_RecSys2011`.
    """
    IS_BASE: bool = False

    config: ClassVar[LastFMDatasetConfig] = LastFMDatasetConfig()

    ITEM_METADATA = None
    USER_METADATA = None
    TAG_METADATA = None

    def fetch_dataset(self) -> None:
        """Make sure the interaction file is available locally.

        Overrides the base implementation to cover the case where the zipfile
        has already been downloaded but the interaction file has not been
        extracted from it yet.
        """
        zip_path = os.path.join(
            self.config.default_base_path, f"{self.config.remote_zipname}.zip"
        )

        # No archive yet: download it (the download step also extracts).
        if not os.path.exists(zip_path):
            logger.debug(f"{self.name} dataset zipfile not found in {zip_path}.")
            self._download_dataset()
            return

        # Archive and extracted file both present: nothing to do.
        if os.path.exists(self.file_path):
            logger.debug("Data zipfile is in memory and in dir specified.")
            return

        # Archive present but the interaction file is missing: extract it.
        logger.debug(
            f"{self.name} dataset file not found, but zipfile already downloaded. "
            f"Extracting file from zipfile."
        )
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extract(self.config.remote_filename, self.config.default_base_path)

    def _download_dataset(self) -> None:
        """Download the zipfile and extract the interaction file from it.

        The archive is saved into ``config.default_base_path`` and
        ``config.remote_filename`` is extracted into that same directory.
        """
        archive_name = f"{self.config.remote_zipname}.zip"
        zip_path = os.path.join(self.config.default_base_path, archive_name)

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")

        # Fetch the archive into the data directory.
        remote_url = f"{self.config.dataset_url}/{self.config.remote_zipname}.zip"
        self._fetch_remote(remote_url, zip_path)

        # Only the interaction file is needed from the archive.
        logger.debug(f"Extracting {self.config.remote_filename} from zip")
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extract(self.config.remote_filename, self.config.default_base_path)

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        The tab-separated file is read with the user, item, tag and timestamp
        column names taken from ``config`` (replacing the file's header row),
        and the timestamp column is floor-divided by 1000.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        user_col = self.config.user_ix
        item_col = self.config.item_ix
        tag_col = self.config.tag_ix
        time_col = self.config.timestamp_ix

        column_dtypes = {
            item_col: np.int32,
            user_col: np.int32,
            tag_col: np.int32,
            time_col: np.int64,
        }
        df = pd.read_csv(
            self.file_path,
            dtype=column_dtypes,
            sep="\t",
            names=[user_col, item_col, tag_col, time_col],
            header=0,
        )

        # Convert from milliseconds to seconds
        timestamps_in_seconds = df[time_col] // 1_000
        df[time_col] = timestamps_in_seconds

        logger.debug(f"Loaded {len(df)} interactions")
        return df

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user, item and tag metadata onto the corresponding attributes."""
        user_loader = LastFMUserMetadata(user_id_mapping=user_id_mapping)
        self.USER_METADATA = user_loader.load()

        item_loader = LastFMItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = item_loader.load()

        tag_loader = LastFMTagMetadata()
        self.TAG_METADATA = tag_loader.load()
@@ -0,0 +1,93 @@
1
+ import logging
2
+ import os
3
+ import zipfile
4
+ from typing import ClassVar
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from tqdm import tqdm
9
+
10
+ from ..config import MovieLens100KDatasetConfig, MovieLensDatasetConfig
11
+ from ..metadata.movielens import MovieLens100kItemMetadata, MovieLens100kUserMetadata
12
+ from .base import Dataset
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class MovieLensDataset(Dataset):
    """Base class for Movielens datasets.

    Other Movielens datasets should inherit from this class.

    This code is adapted from RecPack :cite:`recpack`
    """

    IS_BASE: bool = True
    config: ClassVar[MovieLensDatasetConfig] = MovieLensDatasetConfig()

    def _download_dataset(self) -> None:
        """Download the zip archive, extract the ratings file and move it into place.

        The archive is stored in ``config.default_base_path``. The ratings file
        is extracted from the archive's ``<remote_zipname>/`` folder and then
        renamed to ``self.file_path``.
        """
        base_dir = self.config.default_base_path
        zip_name = self.config.remote_zipname

        # Download the zip into the data directory
        zip_file_path = os.path.join(base_dir, f"{zip_name}.zip")
        self._fetch_remote(
            url=f"{self.config.dataset_url}/{zip_name}.zip",
            filename=zip_file_path,
        )

        # Extract the ratings file which we will use
        member_name = f"{zip_name}/{self.config.remote_filename}"
        with zipfile.ZipFile(zip_file_path, "r") as archive:
            archive.extract(member_name, base_dir)

        # Rename the ratings file to the specified filename
        extracted_path = os.path.join(base_dir, member_name)
        os.rename(extracted_path, self.file_path)
54
+
55
+
56
class MovieLens100K(MovieLensDataset):
    """MovieLens 100K dataset."""

    ITEM_METADATA = None
    USER_METADATA = None
    IS_BASE: bool = False

    config: ClassVar[MovieLens100KDatasetConfig] = MovieLens100KDatasetConfig()

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw ratings file and return it as a pandas DataFrame.

        The tab-separated file at ``self.file_path`` is read in chunks, with
        the user, item, rating and timestamp column names taken from ``config``.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()
        # The chunked reader is a context manager; using `with` guarantees the
        # underlying file handle is released even if reading is interrupted.
        with pd.read_table(
            self.file_path,
            dtype={
                self.config.user_ix: np.int64,
                self.config.item_ix: np.int64,
                self.config.rating_ix: np.float64,
                self.config.timestamp_ix: np.int64,
            },
            sep="\t",
            names=[
                self.config.user_ix,
                self.config.item_ix,
                self.config.rating_ix,
                self.config.timestamp_ix,
            ],
            chunksize=100_000,
        ) as chunks:
            df = pd.concat(
                list(tqdm(chunks, desc="Reading table", unit="chunk")),
                ignore_index=True,
            )
        return df

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user and item metadata onto ``USER_METADATA`` and ``ITEM_METADATA``."""
        self.USER_METADATA = MovieLens100kUserMetadata(user_id_mapping=user_id_mapping).load()
        self.ITEM_METADATA = MovieLens100kItemMetadata(item_id_mapping=item_id_mapping).load()
@@ -0,0 +1,46 @@
1
+ import pandas as pd
2
+
3
+ from .base import Dataset
4
+
5
+
6
class TestDataset(Dataset):
    """
    Test dataset.

    A dummy dataset used for testing purposes. Its interactions are built in
    memory; nothing is downloaded or read from disk.

    NOTE(review): the column-name constants below are used only by
    ``_load_dataframe`` in this class. Confirm they agree with the ``config``
    values that the base class relies on.
    """

    USER_IX = "user_id"
    """Name of the column in the DataFrame that contains user identifiers."""
    ITEM_IX = "item_id"
    """Name of the column in the DataFrame that contains item identifiers."""
    TIMESTAMP_IX = "timestamp"
    """Name of the column in the DataFrame that contains time of interaction in seconds since epoch."""
    RATING_IX = "rating"
    """Name of the column in the DataFrame that contains the rating a user gave to the item."""
    DEFAULT_FILENAME = "dummy_input.csv"

    def _download_dataset(self) -> None:
        """No-op: the dummy data is generated in memory."""
        return None

    def _load_dataframe(self) -> pd.DataFrame:
        """Build the fixed dummy interactions and return them as a DataFrame.

        Warning:
            This does not apply any preprocessing, and returns the raw dataset.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        users = [1, 2, 3, 1, 2, 2, 4, 3, 3, 4, 5, 5, 5]
        items = [1, 1, 2, 3, 2, 3, 2, 1, 3, 3, 1, 2, 3]
        timestamps = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 10]
        ratings = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3]

        return pd.DataFrame(
            {
                self.USER_IX: users,
                self.ITEM_IX: items,
                self.TIMESTAMP_IX: timestamps,
                self.RATING_IX: ratings,
            }
        )

    def _fetch_dataset_metadata(self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame) -> None:
        """No-op: the dummy dataset has no metadata."""
        return None