recnexteval-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
recnexteval/algorithms/utils.py
@@ -0,0 +1,51 @@
+ import numpy as np
+ from scipy.sparse import csr_matrix
+
+
+ def get_top_K_ranks(X: csr_matrix, K: None | int = None) -> csr_matrix:
+     """Returns a matrix of ranks assigned to the largest K values in X.
+
+     Selects the K largest values in every row of X and assigns a rank to each.
+
+     :param X: Matrix from which we will select K values in every row.
+     :type X: csr_matrix
+     :param K: Number of values to select. If None, all values are ranked.
+     :type K: int, optional
+     :return: Matrix with the rank of each selected value per row.
+     :rtype: csr_matrix
+     """
+     U, I, V = [], [], []
+     for row_ix, (le, ri) in enumerate(zip(X.indptr[:-1], X.indptr[1:])):
+         K_row_pick = min(K, ri - le) if K is not None else ri - le
+
+         if K_row_pick != 0:
+             top_k_row = X.indices[le + np.argpartition(X.data[le:ri], list(range(-K_row_pick, 0)))[-K_row_pick:]]
+
+             for rank, col_ix in enumerate(reversed(top_k_row)):
+                 U.append(row_ix)
+                 I.append(col_ix)
+                 V.append(rank + 1)
+     # data, (row, col) = (V, (U, I))
+     X_top_K = csr_matrix((V, (U, I)), shape=X.shape)
+
+     return X_top_K
+
+
+ def get_top_K_values(X: csr_matrix, K: None | int = None) -> csr_matrix:
+     """Returns a matrix with only the K largest values in every row of X.
+
+     Selects the top-K items for every user (equivalent to the K nearest
+     neighbours). In case of a tie for the last position, the tied item with
+     the largest index is kept.
+
+     :param X: Matrix from which we will select K values in every row.
+     :type X: csr_matrix
+     :param K: Number of values to select. If None, all values are kept.
+     :type K: int, optional
+     :return: Matrix with at most K values per row.
+     :rtype: csr_matrix
+     """
+     top_K_ranks = get_top_K_ranks(X, K)
+     # Convert the ranks into binary values (1 if in top K, 0 otherwise)
+     top_K_ranks[top_K_ranks > 0] = 1
+     # Element-wise multiplication with the original matrix recovers the values
+     return top_K_ranks.multiply(X)
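A minimal usage sketch of the two helpers above, which appear to live in `recnexteval/algorithms/utils.py` (the only file in the listing matching this hunk's +51 line count). The toy matrix and expected outputs are illustrative, not taken from the package's tests:

```python
import numpy as np
from scipy.sparse import csr_matrix

# Import path assumed from the file listing above.
from recnexteval.algorithms.utils import get_top_K_ranks, get_top_K_values

scores = csr_matrix(np.array([
    [0.1, 0.9, 0.0, 0.5],
    [0.0, 0.2, 0.8, 0.4],
]))

# Ranks: 1 = largest value in the row, 2 = second largest, ...
print(get_top_K_ranks(scores, K=2).toarray())
# [[0 1 0 2]
#  [0 0 1 2]]

# Values: everything outside the row-wise top 2 is dropped to zero.
print(get_top_K_values(scores, K=2).toarray())
# [[0.  0.9 0.  0.5]
#  [0.  0.  0.8 0.4]]
```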
recnexteval/datasets/__init__.py
@@ -0,0 +1,109 @@
+ """Dataset module for public datasets in streaming experiments.
+
+ This module provides easy access to publicly available datasets for use in streaming
+ experiments. Dataset classes are built on top of the `Dataset` base class, allowing
+ for easy extension and customization.
+
+ ## Dataset Overview
+
+ Multiple public datasets are available from various sources. Additionally, a
+ lightweight test dataset is provided for testing algorithm functionality.
+
+ ### Data Chunking Note
+
+ The MovieLens 100K dataset is available but is not chunked into "blocks". Setting
+ a global timeline to split the data could therefore cause a chunk of data to be
+ lost, so the other publicly available datasets are recommended.
+
+ ## Available Datasets
+
+ - `AmazonBookDataset`: Amazon Books reviews
+ - `AmazonMovieDataset`: Amazon Movies reviews
+ - `AmazonMusicDataset`: Amazon Music reviews
+ - `AmazonSubscriptionBoxesDataset`: Amazon Subscription Boxes reviews
+ - `LastFMDataset`: Last.FM music listening history
+ - `MovieLens100K`: MovieLens 100K rating dataset
+ - `YelpDataset`: Yelp business reviews
+ - `TestDataset`: Lightweight dataset for testing algorithms
+
+ ## Loading Datasets
+
+ Basic loading:
+
+ ```python
+ from recnexteval.datasets import AmazonMusicDataset
+
+ dataset = AmazonMusicDataset()
+ data = dataset.load()
+ ```
+
+ If the file does not exist, it will be downloaded and written to disk. Subsequent
+ loads will read the file from disk without downloading it again.
+
+ ### Using Default Filters
+
+ ```python
+ from recnexteval.datasets import AmazonMusicDataset
+
+ dataset = AmazonMusicDataset(use_default_filters=True)
+ data = dataset.load()
+ ```
+
+ Each dataset can be loaded with its default filters applied. Default filters ensure
+ that user and item IDs increment in the order of time. **This is the recommended
+ loading approach.**
+
+ ## Extending the Framework
+
+ To add custom datasets, inherit from `Dataset` and implement all abstract methods.
+ Refer to the base class documentation for implementation details.
+
+ ## Related Modules
+
+ - `recnexteval.preprocessing`: Data preprocessing and filtering utilities
+ """
+
+ from .datasets import (
+     AmazonBookDataset,
+     AmazonMovieDataset,
+     AmazonMusicDataset,
+     AmazonSubscriptionBoxesDataset,
+     Dataset,
+     LastFMDataset,
+     MovieLens100K,
+     TestDataset,
+     YelpDataset,
+ )
+ from .metadata import (
+     AmazonBookItemMetadata,
+     AmazonMovieItemMetadata,
+     AmazonMusicItemMetadata,
+     LastFMItemMetadata,
+     LastFMTagMetadata,
+     LastFMUserMetadata,
+     Metadata,
+     MovieLens100kItemMetadata,
+     MovieLens100kUserMetadata,
+ )
+
+
+ __all__ = [
+     "AmazonBookDataset",
+     "AmazonMovieDataset",
+     "AmazonMusicDataset",
+     "AmazonSubscriptionBoxesDataset",
+     "LastFMDataset",
+     "MovieLens100K",
+     "YelpDataset",
+     "TestDataset",
+     "Dataset",
+     "Metadata",
+     "MovieLens100kUserMetadata",
+     "MovieLens100kItemMetadata",
+     "AmazonBookItemMetadata",
+     "AmazonMovieItemMetadata",
+     "AmazonMusicItemMetadata",
+     "LastFMUserMetadata",
+     "LastFMItemMetadata",
+     "LastFMTagMetadata",
+ ]
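Beyond the default filters shown in the docstring above, custom filters can be attached before loading. A minimal sketch: `MinItemsPerUser` and its keyword arguments are taken from `recnexteval.preprocessing.filter` as used elsewhere in this diff, while the threshold of 5 is an arbitrary illustration:

```python
from recnexteval.datasets import MovieLens100K
from recnexteval.preprocessing.filter import MinItemsPerUser

dataset = MovieLens100K()
# Keyword arguments mirror the defaults in Dataset._default_filters;
# the column names come from the dataset's own config.
dataset.add_filter(
    MinItemsPerUser(
        min_items_per_user=5,
        item_ix=dataset.config.item_ix,
        user_ix=dataset.config.user_ix,
    )
)
data = dataset.load()  # apply_filters defaults to True, so the filter runs
```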
recnexteval/datasets/base.py
@@ -0,0 +1,316 @@
+ import logging
+ import os
+ import time
+ from abc import abstractmethod
+ from typing import ClassVar
+
+ import httpx
+ import pandas as pd
+
+ from ..matrix import InteractionMatrix
+ from ..models import BaseModel
+ from ..preprocessing.filter import Filter, MinItemsPerUser, MinUsersPerItem
+ from ..preprocessing.preprocessor import DataFramePreprocessor
+ from ..utils.path import safe_dir
+ from .config import DatasetConfig
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class DataFetcher(BaseModel):
+     """Abstract base class used by Dataset and Metadata subclasses."""
+
+     config: ClassVar[DatasetConfig] = DatasetConfig()
+     """Configuration for the dataset."""
+
+     @property
+     def file_path(self) -> str:
+         """File path of the dataset."""
+         return os.path.join(self.config.default_base_path, self.config.default_filename)
+
+     @property
+     def processed_cache_path(self) -> str:
+         """Path for cached processed data."""
+         return os.path.join(
+             self.config.default_base_path, f"{self.config.default_filename}.processed.parquet"
+         )
+
+     def fetch_dataset(self) -> None:
+         """Check if the dataset is present; if not, download it."""
+         if os.path.exists(self.file_path):
+             logger.debug("Data file already exists at the specified path.")
+             return
+         logger.debug(f"{self.name} dataset not found in {self.file_path}.")
+         self._download_dataset()
+
+     def fetch_dataset_force(self) -> None:
+         """Force re-download of the dataset."""
+         logger.debug(f"{self.name} force re-download of dataset.")
+         self._download_dataset()
+
+     def _fetch_remote(self, url: str, filename: str) -> str:
+         """Fetch data from a remote URL and save it locally (synchronous fallback).
+
+         This method keeps the previous synchronous behaviour but uses
+         `httpx.Client` to stream the response and write it to disk. If you
+         want async behaviour, use :meth:`_fetch_remote_async` instead.
+
+         Args:
+             url: URL to fetch data from
+             filename: Path to save the file to
+
+         Returns:
+             The filename where the data was saved
+         """
+         logger.debug(f"{self.name} will fetch dataset from remote url at {url}.")
+
+         with httpx.Client(timeout=httpx.Timeout(60.0)) as client, client.stream("GET", url) as resp:
+             resp.raise_for_status()
+             with open(filename, "wb") as fd:
+                 for chunk in resp.iter_bytes():
+                     if chunk:
+                         fd.write(chunk)
+
+         return filename
+
+     async def _fetch_remote_async(self, url: str, filename: str) -> str:
+         """Asynchronously fetch data from a remote URL and save it locally.
+
+         Uses `httpx.AsyncClient` and streams the response to disk. Callers
+         running inside an event loop should use this coroutine instead of
+         the synchronous `_fetch_remote`.
+
+         Args:
+             url: URL to fetch data from
+             filename: Path to save the file to
+
+         Returns:
+             The filename where the data was saved
+         """
+         logger.debug(f"{self.name} will asynchronously fetch dataset from {url}.")
+
+         async with (
+             httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client,
+             client.stream("GET", url) as resp,
+         ):
+             resp.raise_for_status()
+             # Write bytes as they arrive
+             with open(filename, "wb") as fd:
+                 async for chunk in resp.aiter_bytes():
+                     if chunk:
+                         fd.write(chunk)
+
+         return filename
+
+     def _cache_processed_dataframe(self, df: pd.DataFrame) -> None:
+         """Cache the processed DataFrame to disk.
+
+         :param df: DataFrame to cache
+         :type df: pd.DataFrame
+         """
+         logger.debug(f"Caching processed DataFrame to {self.processed_cache_path}")
+         df.to_parquet(self.processed_cache_path)
+         logger.debug("Processed DataFrame cached successfully.")
+
+     @abstractmethod
+     def _load_dataframe(self) -> pd.DataFrame:
+         """Load the raw dataset from file and return it as a pandas DataFrame.
+
+         Warning:
+             This does not apply any preprocessing; it returns the raw dataset.
+
+         Returns:
+             Interactions with minimal columns of {user, item, timestamp}.
+         """
+         raise NotImplementedError("Needs to be implemented")
+
+     @abstractmethod
+     def _download_dataset(self) -> None:
+         """Downloads the dataset.
+
+         Downloads the csv file from the dataset URL and saves it to the file path.
+         """
+         raise NotImplementedError("Needs to be implemented")
+
+
+ class Dataset(DataFetcher):
+     """Represents a collaborative filtering dataset.
+
+     A dataset must minimally contain user, item and timestamp columns for the
+     other modules to work.
+
+     Assumption
+     ==========
+     User/item IDs increment in the order of time. This assumption is made for
+     the purposes of splitting the dataset and eventually passing it to the
+     model. IDs incrementing in the order of time allow us to set the shape of
+     the currently known user and item matrix, making it easier for the
+     evaluator to manipulate the data.
+
+     :param filename: Name of the file. If no name is provided, the dataset default
+         will be used if known. If the dataset does not have a default filename,
+         a ValueError will be raised.
+     :type filename: str, optional
+     :param base_path: The base path to the data directory.
+         Defaults to `data`.
+     :type base_path: str, optional
+     :param use_default_filters: If True, the default filters will be applied to the dataset.
+         Defaults to False.
+     :type use_default_filters: bool, optional
+     """
+
+     config: ClassVar[DatasetConfig] = DatasetConfig()
+     """Configuration for the dataset."""
+
+     def __init__(
+         self,
+         use_default_filters: bool = False,  # noqa: FBT001, FBT002
+         fetch_metadata: bool = False,  # noqa: FBT001, FBT002
+     ) -> None:
+         if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
+             raise AttributeError("user_ix, item_ix or timestamp_ix not set in config.")
+
+         logger.debug(
+             f"{self.name} being initialized with '{self.config.default_base_path}' as the base path."
+         )
+
+         if not self.config.default_filename:
+             raise ValueError("No filename specified, and no default known.")
+
+         self.fetch_metadata = fetch_metadata
+         self.preprocessor = DataFramePreprocessor(
+             self.config.item_ix, self.config.user_ix, self.config.timestamp_ix
+         )
+
+         if use_default_filters:
+             for f in self._default_filters:
+                 self.add_filter(f)
+
+         safe_dir(self.config.default_base_path)
+         logger.debug(f"{self.name} is initialized.")
+
+     @property
+     def _default_filters(self) -> list[Filter]:
+         """The default filters for all datasets.
+
+         Concrete classes can override this property to add more filters.
+
+         Returns:
+             List of filters to be applied to the dataset.
+         """
+         if not self.config.user_ix or not self.config.item_ix:
+             raise AttributeError("config.user_ix or config.item_ix not set.")
+
+         filters: list[Filter] = []
+         filters.append(
+             MinItemsPerUser(
+                 min_items_per_user=3,
+                 item_ix=self.config.item_ix,
+                 user_ix=self.config.user_ix,
+             )
+         )
+         filters.append(
+             MinUsersPerItem(
+                 min_users_per_item=3,
+                 item_ix=self.config.item_ix,
+                 user_ix=self.config.user_ix,
+             )
+         )
+         return filters
+
+     def add_filter(self, filter_: Filter) -> None:
+         """Add a filter to be applied when loading the data.
+
+         Uses the :class:`DataFramePreprocessor` class to add filters to the
+         dataset to load. The filter will be applied when the data is loaded into
+         an :class:`InteractionMatrix` object when :meth:`load` is called.
+
+         :param filter_: Filter to be applied to the loaded DataFrame
+             during processing into an interaction matrix.
+         :type filter_: Filter
+         """
+         self.preprocessor.add_filter(filter_)
+
+     def _load_dataframe_from_cache(self) -> pd.DataFrame:
+         if not os.path.exists(self.processed_cache_path):
+             raise FileNotFoundError("Processed cache file not found.")
+         logger.info(f"Loading from cache: {self.processed_cache_path}")
+         df = pd.read_parquet(self.processed_cache_path)
+         return df
+
+     def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
+         """Loads data into an InteractionMatrix object.
+
+         Data is loaded into a DataFrame using the :func:`_load_dataframe` function.
+         The resulting DataFrame is parsed into an :class:`InteractionMatrix` object.
+         If :data:`apply_filters` is set to True, the configured filters will be
+         applied to the dataset and user and item IDs will be mapped. This is
+         advised even if no filter is set, as it ensures that the user and item
+         IDs increment in the order of time.
+
+         Args:
+             apply_filters: Whether to apply the configured filters and preprocessing.
+                 Defaults to True.
+             use_cache: Whether to use cached processed data. Defaults to True.
+
+         Returns:
+             Resulting interaction matrix.
+         """
+         logger.info(f"{self.name} is loading dataset...")
+         start = time.time()
+         try:
+             df = self._load_dataframe_from_cache() if use_cache else self._load_dataframe()
+         except FileNotFoundError:
+             logger.warning("Processed cache not found, loading raw dataframe.")
+             df = self._load_dataframe()
+             self._cache_processed_dataframe(df)
+         if apply_filters:
+             logger.debug(f"{self.name} applying filters set.")
+             im = self.preprocessor.process(df)
+         else:
+             im = self._dataframe_to_matrix(df)
+             logger.warning(
+                 "No filters applied, user and item ids may not be incrementing in the order of time. "
+                 "Classes that use this dataset may not work as expected."
+             )
+
+         if self.fetch_metadata:
+             user_id_mapping, item_id_mapping = (
+                 self.preprocessor.user_id_mapping,
+                 self.preprocessor.item_id_mapping,
+             )
+             self._fetch_dataset_metadata(
+                 user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping
+             )
+
+         end = time.time()
+         logger.info(f"{self.name} dataset loaded - Took {end - start:.3}s")
+         return im
+
+     def _dataframe_to_matrix(self, df: pd.DataFrame) -> InteractionMatrix:
+         """Converts a DataFrame to an InteractionMatrix.
+
+         Args:
+             df: DataFrame to convert
+
+         Returns:
+             InteractionMatrix object.
+         """
+         if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
+             raise AttributeError("config.user_ix, config.item_ix or config.timestamp_ix not set.")
+         return InteractionMatrix(
+             df,
+             user_ix=self.config.user_ix,
+             item_ix=self.config.item_ix,
+             timestamp_ix=self.config.timestamp_ix,
+         )
+
+     @abstractmethod
+     def _fetch_dataset_metadata(
+         self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
+     ) -> None:
+         """Fetch metadata for the dataset, if available."""
+         raise NotImplementedError("Needs to be implemented")
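The abstract hooks above (`_load_dataframe`, `_download_dataset`, `_fetch_dataset_metadata`) are all a concrete subclass needs to implement. A hypothetical local-CSV subclass, sketched under the assumption that `DatasetConfig` accepts these field names as keyword arguments (the names mirror how `DataFetcher` and `Dataset` read the config; the class, file names, and column names are placeholders):

```python
from typing import ClassVar

import pandas as pd

from recnexteval.datasets import Dataset
from recnexteval.datasets.config import DatasetConfig


class MyCSVDataset(Dataset):
    """Hypothetical dataset backed by a local CSV file (illustrative only)."""

    # Assumption: DatasetConfig accepts these fields as keyword arguments.
    config: ClassVar[DatasetConfig] = DatasetConfig(
        default_base_path="data",
        default_filename="my_interactions.csv",
        user_ix="user_id",
        item_ix="item_id",
        timestamp_ix="timestamp",
    )

    def _download_dataset(self) -> None:
        # The file is local in this sketch, so there is nothing to fetch;
        # a remote dataset would call self._fetch_remote(url, self.file_path).
        raise FileNotFoundError(f"Place the CSV at {self.file_path} manually.")

    def _load_dataframe(self) -> pd.DataFrame:
        self.fetch_dataset()  # ensure the raw file exists before reading
        return pd.read_csv(self.file_path)

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        # No metadata available for this toy dataset.
        pass
```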
recnexteval/datasets/config/__init__.py
@@ -0,0 +1,113 @@
+ """Dataset configuration module.
+
+ This module provides configuration classes for dataset loading and metadata
+ handling. Configurations define dataset properties such as paths, URLs, and
+ processing parameters.
+
+ ## Available Configurations
+
+ ### Base Classes
+
+ - `DatasetConfig`: Base class for dataset configurations
+ - `MetadataConfig`: Base class for metadata configurations
+
+ ### Dataset Configurations
+
+ - `MovieLensDatasetConfig`: Base configuration for MovieLens datasets
+ - `MovieLens100KDatasetConfig`: Configuration for MovieLens 100K dataset
+ - `AmazonDatasetConfig`: Base configuration for Amazon datasets
+ - `AmazonMusicDatasetConfig`: Configuration for Amazon Music dataset
+ - `AmazonMovieDatasetConfig`: Configuration for Amazon Movies dataset
+ - `AmazonBookDatasetConfig`: Configuration for Amazon Books dataset
+ - `AmazonSubscriptionBoxesDatasetConfig`: Configuration for Amazon Subscription Boxes dataset
+ - `LastFMDatasetConfig`: Configuration for Last.FM dataset
+ - `YelpDatasetConfig`: Configuration for Yelp dataset
+
+ ### Metadata Configurations
+
+ - `MovieLens100kItemMetadataConfig`: Item metadata configuration for MovieLens 100K
+ - `MovieLens100kUserMetadataConfig`: User metadata configuration for MovieLens 100K
+ - `AmazonItemMetadataConfig`: Base Amazon item metadata configuration
+ - `AmazonBooksItemMetadataConfig`: Item metadata for Amazon Books
+ - `AmazonDigitalMusicItemMetadataConfig`: Item metadata for Amazon Digital Music
+ - `AmazonMoviesAndTVItemMetadataConfig`: Item metadata for Amazon Movies and TV
+ - `AmazonSubscriptionBoxesItemMetadataConfig`: Item metadata for Amazon Subscription Boxes
+ - `LastFMUserMetadataConfig`: User metadata configuration for Last.FM
+ - `LastFMItemMetadataConfig`: Item metadata configuration for Last.FM
+ - `LastFMTagMetadataConfig`: Tag metadata configuration for Last.FM
+
+ ## Usage
+
+ A typical usage pattern is to import a dataset config, optionally override fields,
+ and pass it to dataset-loading utilities or custom convenience wrappers:
+
+ ```python
+ from recnexteval.datasets.config import AmazonMusicDatasetConfig
+
+ # Create config instance using defaults
+ cfg = AmazonMusicDatasetConfig()
+
+ # Inspect config values
+ print(cfg.name)
+ print(cfg.local_path)
+ print(cfg.source_url)
+
+ # Optionally override defaults at runtime
+ custom_cfg = AmazonMusicDatasetConfig(
+     min_user_interactions=5,
+     min_item_interactions=10
+ )
+ ```
+ """
+
+ from .amazon import (
+     AmazonBookDatasetConfig,
+     AmazonBooksItemMetadataConfig,
+     AmazonDatasetConfig,
+     AmazonDigitalMusicItemMetadataConfig,
+     AmazonItemMetadataConfig,
+     AmazonMovieDatasetConfig,
+     AmazonMoviesAndTVItemMetadataConfig,
+     AmazonMusicDatasetConfig,
+     AmazonSubscriptionBoxesDatasetConfig,
+     AmazonSubscriptionBoxesItemMetadataConfig,
+ )
+ from .base import DatasetConfig, MetadataConfig
+ from .lastfm import (
+     LastFMDatasetConfig,
+     LastFMItemMetadataConfig,
+     LastFMTagMetadataConfig,
+     LastFMUserMetadataConfig,
+ )
+ from .movielens import (
+     MovieLens100KDatasetConfig,
+     MovieLens100kItemMetadataConfig,
+     MovieLens100kUserMetadataConfig,
+     MovieLensDatasetConfig,
+ )
+ from .yelp import YelpDatasetConfig
+
+
+ __all__ = [
+     "AmazonDatasetConfig",
+     "AmazonMusicDatasetConfig",
+     "AmazonMovieDatasetConfig",
+     "AmazonBookDatasetConfig",
+     "AmazonSubscriptionBoxesDatasetConfig",
+     "LastFMDatasetConfig",
+     "YelpDatasetConfig",
+     "DatasetConfig",
+     "MetadataConfig",
+     "MovieLensDatasetConfig",
+     "MovieLens100KDatasetConfig",
+     "MovieLens100kItemMetadataConfig",
+     "MovieLens100kUserMetadataConfig",
+     "AmazonBooksItemMetadataConfig",
+     "AmazonDigitalMusicItemMetadataConfig",
+     "AmazonItemMetadataConfig",
+     "AmazonMoviesAndTVItemMetadataConfig",
+     "AmazonSubscriptionBoxesItemMetadataConfig",
+     "LastFMItemMetadataConfig",
+     "LastFMTagMetadataConfig",
+     "LastFMUserMetadataConfig",
+ ]
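Following the pattern of the concrete configs listed above (e.g. `AmazonDatasetConfig` specialized by `AmazonMusicDatasetConfig`), a new dataset would typically get its own config subclass. A hypothetical sketch, assuming `DatasetConfig` is a pydantic-style model whose fields can be overridden with class-level defaults; the field names are taken from how the `Dataset` base class reads its config, and all values are placeholders:

```python
from recnexteval.datasets.config import DatasetConfig


class MyServiceDatasetConfig(DatasetConfig):
    """Hypothetical config for a new review dataset (illustrative only)."""

    # Placeholder values, not part of the released package.
    default_base_path: str = "data"
    default_filename: str = "my_service_reviews.csv"
    user_ix: str = "reviewer_id"
    item_ix: str = "product_id"
    timestamp_ix: str = "unix_time"
```

A dataset class would then point its `config` ClassVar at an instance of this subclass, as in the `Dataset` sketch earlier.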