recnexteval-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
recnexteval/datasets/datasets/yelp.py
@@ -0,0 +1,103 @@
+ import logging
+ from typing import ClassVar, NoReturn
+
+ import numpy as np
+ import pandas as pd
+
+ from ..config.yelp import YelpDatasetConfig
+ from .base import Dataset
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class YelpDataset(Dataset):
+     """Yelp dataset.
+
+     The Yelp dataset contains user reviews of businesses. The main columns
+     used are:
+
+     - user_id: The user identifier
+     - business_id: The business identifier
+     - stars: The rating given by the user to the business
+     - date: The date of the review
+
+     The dataset can be downloaded from https://www.yelp.com/dataset/download.
+     It is distributed as a zip file containing JSON; publicly available scripts
+     can help convert the JSON file to a CSV file. For the purposes of this class,
+     it is assumed that the dataset has already been converted to a CSV file
+     named `yelp_academic_dataset_review.csv`.
+
+     See the converter script from the official Yelp repository:
+     https://github.com/Yelp/dataset-examples/blob/master/json_to_csv_converter.py
+
+     You can use the following command to convert the JSON file to a CSV file:
+
+     .. code-block:: shell
+
+         python json_to_csv_converter.py yelp_academic_dataset_review.json
+     """
+
+     IS_BASE: bool = False
+     config: ClassVar[YelpDatasetConfig] = YelpDatasetConfig()
+
+     def _download_dataset(self) -> NoReturn:
+         raise ValueError(
+             "Yelp dataset has not been downloaded. Please head over "
+             f"to {self.config.dataset_url} to download the dataset. "
+             "As there is a license agreement, we cannot download it for you. "
+             "Place the unzipped dataset under the data directory when done. "
+             f"Expected filename: {self.config.default_filename}"
+         )
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         """Load the raw dataset from file, and return it as a pandas DataFrame.
+
+         Transforms the downloaded dataset to use integer user and item ids,
+         as required for representation in the interaction matrix.
+
+         Returns:
+             The interaction data as a DataFrame with a row per interaction.
+         """
+         self.fetch_dataset()
+
+         df = pd.read_csv(
+             self.file_path,
+             dtype={
+                 self.config.item_ix: str,
+                 self.config.user_ix: str,
+                 self.config.rating_ix: np.float32,
+                 self.config.timestamp_ix: str,
+             },
+             usecols=[
+                 self.config.item_ix,
+                 self.config.user_ix,
+                 self.config.rating_ix,
+                 self.config.timestamp_ix,
+             ],
+             parse_dates=[self.config.timestamp_ix],
+             date_format="%Y-%m-%d %H:%M:%S",
+             header=0,
+             sep=",",
+             encoding="utf-8",
+         )
+
+         # Strip the b'...' byte-literal markers from the string columns
+         str_df = df.select_dtypes(["object"])
+         str_df = str_df.stack().str[2:-1].unstack()
+         for col in str_df:
+             df[col] = str_df[col]
+
+         # Parse the timestamps, drop unparseable rows, and convert to epoch seconds
+         df[self.config.timestamp_ix] = pd.to_datetime(
+             df[self.config.timestamp_ix], format="%Y-%m-%d %H:%M:%S", errors="coerce"
+         )
+         df.dropna(inplace=True)
+         df[self.config.timestamp_ix] = df[self.config.timestamp_ix].astype(np.int64) // 10**9
+
+         return df
+
+     def _fetch_dataset_metadata(
+         self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
+     ) -> None:
+         pass
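The class above assumes the Yelp review dump has already been converted from JSON Lines to CSV. As a rough, pandas-only alternative to the referenced Yelp converter script, a conversion sketch might look like the following; the paths and the column subset are assumptions, and the exact CSV layout `YelpDataset` expects is governed by `YelpDatasetConfig`, not by this snippet.

```python
import pandas as pd

# Hypothetical locations; adjust to where the Yelp dump was unzipped and to
# the data directory expected by YelpDataset.
src = "yelp_academic_dataset_review.json"
dst = "yelp_academic_dataset_review.csv"

# The review dump is JSON Lines; stream it in chunks to keep memory bounded.
first = True
for chunk in pd.read_json(src, lines=True, chunksize=100_000):
    chunk = chunk[["user_id", "business_id", "stars", "date"]]
    chunk.to_csv(dst, mode="w" if first else "a", header=first, index=False)
    first = False
```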
recnexteval/datasets/metadata/__init__.py
@@ -0,0 +1,58 @@
+ """Metadata module for dataset information.
+
+ This module allows users to include metadata corresponding to datasets.
+ Metadata classes are built on top of the `Metadata` base class, allowing for easy
+ extension and customization.
+
+ ## Important Notes
+
+ User and item IDs in the metadata module are mapped according to RecNextEval's
+ internal mapping, not the original IDs. Developers should not load metadata from
+ source separately. Instead, implement the metadata class and load the metadata
+ while loading the dataset.
+
+ ## Available Metadata
+
+ - `Metadata`: Abstract base class for metadata implementations
+ - `MovieLens100kUserMetadata`: User metadata from the MovieLens 100K dataset
+ - `MovieLens100kItemMetadata`: Item metadata from the MovieLens 100K dataset
+ - `AmazonBookItemMetadata`: Item metadata from the Amazon Books dataset
+ - `AmazonMovieItemMetadata`: Item metadata from the Amazon Movies dataset
+ - `AmazonMusicItemMetadata`: Item metadata from the Amazon Music dataset
+ - `LastFMUserMetadata`: User metadata from the Last.FM dataset
+ - `LastFMItemMetadata`: Item metadata from the Last.FM dataset
+ - `LastFMTagMetadata`: Tag metadata from the Last.FM dataset
+
+ ## Example
+
+ Load the MovieLens 100K dataset; its metadata is loaded alongside the dataset:
+
+ ```python
+ from recnexteval.datasets.movielens import MovieLens100K
+
+ dataset = MovieLens100K(fetch_dataset=True)
+ data = dataset.load()
+ ```
+ """
+
+ from .amazon import (
+     AmazonBookItemMetadata,
+     AmazonMovieItemMetadata,
+     AmazonMusicItemMetadata,
+ )
+ from .base import Metadata
+ from .lastfm import LastFMItemMetadata, LastFMTagMetadata, LastFMUserMetadata
+ from .movielens import MovieLens100kItemMetadata, MovieLens100kUserMetadata
+
+
+ __all__ = [
+     "Metadata",
+     "MovieLens100kUserMetadata",
+     "MovieLens100kItemMetadata",
+     "AmazonBookItemMetadata",
+     "AmazonMovieItemMetadata",
+     "AmazonMusicItemMetadata",
+     "LastFMUserMetadata",
+     "LastFMItemMetadata",
+     "LastFMTagMetadata",
+ ]
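Per the note above that metadata IDs follow RecNextEval's internal mapping, the concrete metadata classes are constructed from an ID-mapping DataFrame and remap raw IDs while loading. A minimal sketch, assuming the mapping frame holds the original ID column plus an `iid` column; the column name `movie_id` and the mapping values are invented for illustration, while the constructor signature and `load()` call match the metadata classes in this package.

```python
import pandas as pd

from recnexteval.datasets.metadata import MovieLens100kItemMetadata

# Assumed shape of the internal id mapping: the original item ids next to the
# internal ids ("iid") assigned by the dataset loader.
item_id_mapping = pd.DataFrame({"movie_id": [1, 2, 3], "iid": [0, 1, 2]})

# The metadata class remaps the original ids to internal iids while loading.
item_metadata = MovieLens100kItemMetadata(item_id_mapping=item_id_mapping)
metadata_df = item_metadata.load()
```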
recnexteval/datasets/metadata/amazon.py
@@ -0,0 +1,68 @@
+ import logging
+ import os
+ from typing import ClassVar
+
+ import pandas as pd
+
+ from recnexteval.datasets.config import (
+     AmazonBooksItemMetadataConfig,
+     AmazonDigitalMusicItemMetadataConfig,
+     AmazonItemMetadataConfig,
+     AmazonMoviesAndTVItemMetadataConfig,
+     AmazonSubscriptionBoxesItemMetadataConfig,
+ )
+ from .base import Metadata
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class AmazonItemMetadata(Metadata):
+     config: ClassVar[AmazonItemMetadataConfig] = AmazonItemMetadataConfig()
+
+     def __init__(self, item_id_mapping: pd.DataFrame) -> None:
+         super().__init__()
+         self.item_id_mapping = item_id_mapping
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_json(
+             self.file_path,  # file_path must point to the JSONL metadata file
+             dtype=self.config.dtype_dict,
+             lines=True,  # required for the JSONL format
+         )
+
+         item_id_to_iid = dict(zip(self.item_id_mapping[self.config.item_ix], self.item_id_mapping["iid"]))
+
+         # Map the raw item IDs to internal iids, keeping unmapped IDs unchanged
+         df[self.config.item_ix] = df[self.config.item_ix].map(lambda x: item_id_to_iid.get(x, x))
+
+         return df
+
+     def _download_dataset(self) -> None:
+         """Downloads the metadata for the dataset.
+
+         Downloads the metadata file into the data directory.
+         """
+         if not self.config.dataset_url:
+             raise ValueError(f"{self.name} does not have a URL specified.")
+
+         self._fetch_remote(
+             self.config.dataset_url, os.path.join(self.base_path, f"{self.config.remote_filename}")
+         )
+
+
+ class AmazonMusicItemMetadata(AmazonItemMetadata):
+     config: ClassVar[AmazonDigitalMusicItemMetadataConfig] = AmazonDigitalMusicItemMetadataConfig()
+
+
+ class AmazonMovieItemMetadata(AmazonItemMetadata):
+     config: ClassVar[AmazonMoviesAndTVItemMetadataConfig] = AmazonMoviesAndTVItemMetadataConfig()
+
+
+ class AmazonSubscriptionBoxesItemMetadata(AmazonItemMetadata):
+     config: ClassVar[AmazonSubscriptionBoxesItemMetadataConfig] = AmazonSubscriptionBoxesItemMetadataConfig()
+
+
+ class AmazonBookItemMetadata(AmazonItemMetadata):
+     config: ClassVar[AmazonBooksItemMetadataConfig] = AmazonBooksItemMetadataConfig()
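`AmazonItemMetadata._load_dataframe` above remaps raw Amazon item IDs to internal iids with a plain dict lookup, falling back to the raw ID when no mapping exists. A self-contained pandas illustration of that pattern; all column names and values here are invented for the example.

```python
import pandas as pd

# Invented raw metadata and id mapping, just to show the remapping step.
metadata = pd.DataFrame({"asin": ["A1", "A2", "A3"], "title": ["x", "y", "z"]})
mapping = pd.DataFrame({"asin": ["A1", "A2"], "iid": [0, 1]})

# Build a raw-id -> internal-id lookup and apply it, keeping unmapped ids as-is.
raw_to_iid = dict(zip(mapping["asin"], mapping["iid"]))
metadata["asin"] = metadata["asin"].map(lambda x: raw_to_iid.get(x, x))

print(metadata)  # "A3" keeps its raw id because it has no internal mapping
```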
recnexteval/datasets/metadata/base.py
@@ -0,0 +1,38 @@
+ import logging
+ from typing import ClassVar, Optional
+
+ import pandas as pd
+
+ from recnexteval.datasets.base import DataFetcher
+ from recnexteval.datasets.config import MetadataConfig
+ from recnexteval.utils.path import safe_dir
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Metadata(DataFetcher):
+     config: ClassVar[MetadataConfig] = MetadataConfig()
+
+     def __init__(
+         self,
+         filename: Optional[str] = None,
+         base_path: Optional[str] = None,
+     ) -> None:
+         self.base_path = base_path if base_path else self.config.default_base_path
+         logger.debug(f"{self.name} being initialized with '{self.base_path}' as the base path.")
+
+         self.filename = filename if filename else self.config.default_filename
+         if not self.filename:
+             raise ValueError("No filename specified, and no default known.")
+
+         safe_dir(self.base_path)
+         logger.debug(f"{self.name} is initialized.")
+
+     def load(self) -> pd.DataFrame:
+         """Load the metadata from file and return it as a DataFrame.
+
+         Returns:
+             DataFrame containing the metadata.
+         """
+         return self._load_dataframe()
recnexteval/datasets/metadata/lastfm.py
@@ -0,0 +1,110 @@
+ import logging
+ import os
+ import zipfile
+ from typing import ClassVar
+
+ import pandas as pd
+
+ from recnexteval.datasets.config import (
+     LastFMItemMetadataConfig,
+     LastFMTagMetadataConfig,
+     LastFMUserMetadataConfig,
+ )
+ from .base import Metadata
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class LastFMMetadata(Metadata):
+     config: ClassVar[LastFMUserMetadataConfig] = LastFMUserMetadataConfig()  # type: ignore
+
+     def _download_dataset(self) -> None:
+         """Downloads the metadata for the dataset.
+
+         Downloads the zip archive and extracts the metadata file into the data directory.
+         """
+         # Download the zip into the data directory
+         self._fetch_remote(
+             f"{self.config.dataset_url}/{self.config.remote_filename}.zip",
+             os.path.join(self.base_path, f"{self.config.remote_filename}.zip"),
+         )
+
+         # Extract the metadata file which we will use
+         with zipfile.ZipFile(
+             os.path.join(self.base_path, f"{self.config.remote_filename}.zip"), "r"
+         ) as zip_ref:
+             zip_ref.extract(f"{self.config.remote_filename}", self.base_path)
+
+
+ class LastFMUserMetadata(Metadata):
+     config: ClassVar[LastFMUserMetadataConfig] = LastFMUserMetadataConfig()  # type: ignore
+
+     def __init__(self, user_id_mapping: pd.DataFrame) -> None:
+         super().__init__()
+         self.user_id_mapping = user_id_mapping
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_csv(
+             self.file_path,
+             sep=self.config.sep,
+             names=self.config.column_names,
+             converters={
+                 self.config.user_ix: self._map_user_id,
+                 self.config.friend_ix: self._map_user_id,
+             },
+             header=0,
+         )
+         return df
+
+     def _map_user_id(self, user_id):
+         user_id_to_uid = dict(
+             zip(self.user_id_mapping[self.config.user_ix], self.user_id_mapping["uid"])
+         )
+         return user_id_to_uid.get(int(user_id), user_id)
+
+
+ class LastFMItemMetadata(Metadata):
+     config: ClassVar[LastFMItemMetadataConfig] = LastFMItemMetadataConfig()  # type: ignore
+
+     def __init__(self, item_id_mapping: pd.DataFrame) -> None:
+         super().__init__()
+         self.item_id_mapping = item_id_mapping
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_csv(
+             self.file_path,
+             dtype=self.config.dtype_dict,
+             sep=self.config.sep,
+             names=self.config.column_names,
+             converters={
+                 self.config.item_ix: self._map_item_id,
+             },
+             header=0,
+         )
+         return df
+
+     def _map_item_id(self, item_id):
+         item_id_to_iid = dict(zip(self.item_id_mapping["artistID"], self.item_id_mapping["iid"]))
+         return item_id_to_iid.get(int(item_id), item_id)
+
+
+ class LastFMTagMetadata(Metadata):
+     config: ClassVar[LastFMTagMetadataConfig] = LastFMTagMetadataConfig()  # type: ignore
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_csv(
+             self.file_path,
+             dtype=self.config.dtype_dict,
+             sep=self.config.sep,
+             names=self.config.column_names,
+             encoding="ISO-8859-1",
+             header=0,
+         )
+         return df
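The Last.fm classes above perform the same ID remapping through `pd.read_csv` converters, which call the mapping function once per cell and receive the cell value as a string. A small standalone sketch of that mechanism; the inline CSV content, column names, and mapping values are invented.

```python
import io

import pandas as pd

# Invented mapping from raw Last.fm artist ids to internal iids.
artist_to_iid = {10: 0, 20: 1}


def map_artist(raw_id: str):
    # Converters receive each cell as a string; fall back to the raw value.
    return artist_to_iid.get(int(raw_id), raw_id)


csv_data = io.StringIO("artistID\tname\n10\tQueen\n30\tABBA\n")
df = pd.read_csv(csv_data, sep="\t", converters={"artistID": map_artist}, header=0)
print(df)  # artist 10 becomes 0; artist 30 stays as the raw string "30"
```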
recnexteval/datasets/metadata/movielens.py
@@ -0,0 +1,87 @@
+ import os
+ import zipfile
+ from typing import ClassVar
+
+ import pandas as pd
+
+ from recnexteval.datasets.config import (
+     MovieLens100kItemMetadataConfig,
+     MovieLens100kUserMetadataConfig,
+ )
+ from .base import Metadata
+
+
+ class MovieLens100kMetadata(Metadata):
+     def _download_dataset(self) -> None:
+         # Download the zip into the data directory
+         self._fetch_remote(
+             f"{self.config.dataset_url}/{self.config.remote_zipname}.zip",
+             os.path.join(self.base_path, f"{self.config.remote_zipname}.zip"),
+         )
+
+         # Extract the metadata file which we will use
+         with zipfile.ZipFile(
+             os.path.join(self.base_path, f"{self.config.remote_zipname}.zip"), "r"
+         ) as zip_ref:
+             zip_ref.extract(
+                 f"{self.config.remote_zipname}/{self.config.remote_filename}", self.base_path
+             )
+
+         # Rename the extracted file to the configured filename
+         os.rename(
+             os.path.join(
+                 self.base_path, f"{self.config.remote_zipname}/{self.config.remote_filename}"
+             ),
+             self.file_path,
+         )
+
+
+ class MovieLens100kUserMetadata(MovieLens100kMetadata):
+     config: ClassVar[MovieLens100kUserMetadataConfig] = MovieLens100kUserMetadataConfig()  # type: ignore
+
+     def __init__(self, user_id_mapping: pd.DataFrame) -> None:
+         super().__init__()
+         self.user_id_mapping = user_id_mapping
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_table(
+             self.file_path,
+             dtype=self.config.dtype_dict,
+             sep=self.config.sep,
+             names=self.config.column_names,
+             converters={self.config.user_ix: self._map_user_id},
+         )
+         return df
+
+     def _map_user_id(self, user_id):
+         user_id_to_uid = dict(
+             zip(self.user_id_mapping[self.config.user_ix], self.user_id_mapping["uid"])
+         )
+         return user_id_to_uid.get(int(user_id), user_id)
+
+
+ class MovieLens100kItemMetadata(MovieLens100kMetadata):
+     config: ClassVar[MovieLens100kItemMetadataConfig] = MovieLens100kItemMetadataConfig()  # type: ignore
+
+     def __init__(self, item_id_mapping: pd.DataFrame) -> None:
+         super().__init__()
+         self.item_id_mapping = item_id_mapping
+
+     def _load_dataframe(self) -> pd.DataFrame:
+         self.fetch_dataset()
+         df = pd.read_table(
+             self.file_path,
+             dtype=self.config.dtype_dict,
+             sep=self.config.sep,
+             names=self.config.column_names,
+             converters={self.config.item_ix: self._map_item_id},
+             encoding=self.config.encoding,
+         )
+         return df
+
+     def _map_item_id(self, item_id):
+         item_id_to_iid = dict(
+             zip(self.item_id_mapping[self.config.item_ix], self.item_id_mapping["iid"])
+         )
+         return item_id_to_iid.get(int(item_id), item_id)
recnexteval/evaluators/__init__.py
@@ -0,0 +1,189 @@
+ """Evaluator module for algorithm evaluation in streaming settings.
+
+ This module provides classes and utilities for evaluating recommendation algorithms
+ in streaming environments. It supports both batch pipeline evaluation and interactive
+ streaming evaluation with comprehensive metric computation and result analysis.
+
+ Evaluator Builder
+ =================
+
+ The evaluator module contains builder classes for constructing evaluator objects.
+ The builders provide a fluent API for configuring evaluators with proper validation
+ and error checking.
+
+ For detailed information about the builder classes and usage examples,
+ see the `recnexteval.evaluators.builder` module.
+
+ Available Builders:
+ - `Builder`: Abstract base class for all builder implementations
+ - `EvaluatorPipelineBuilder`: Builder for pipeline evaluators
+ - `EvaluatorStreamerBuilder`: Builder for streaming evaluators
+
+ Evaluator Classes
+ =================
+
+ The core evaluator classes handle the evaluation of recommendation algorithms
+ on streaming data. The evaluators manage data splitting, algorithm training,
+ prediction generation, and metric computation.
+
+ EvaluatorPipeline
+ -----------------
+
+ For batch evaluation of multiple algorithms on static or sliding window settings.
+ This evaluator runs algorithms through a complete pipeline including training,
+ prediction, and evaluation phases.
+
+ EvaluatorStreamer
+ -----------------
+
+ For interactive streaming evaluation where algorithms can be registered and
+ evaluated in real time as data streams in. This allows for more flexible
+ evaluation scenarios where algorithms can request data and submit predictions
+ asynchronously.
+
+ Both evaluators inherit from `EvaluatorBase`, which provides common functionality
+ for metric computation, data masking, and result aggregation.
+
+ Key Features
+ ------------
+
+ - **Multi-algorithm evaluation**: Evaluate multiple algorithms simultaneously
+ - **Streaming support**: Handle temporal data streams with sliding windows
+ - **Metric aggregation**: Compute metrics at different levels (user, window, macro, micro)
+ - **Data masking**: Properly handle unknown users and items during evaluation
+ - **Result analysis**: Rich DataFrame outputs for metric analysis and comparison
+
+ Basic Usage
+ -----------
+
+ Pipeline Evaluation:
+
+ ```python
+ from recnexteval.evaluators import EvaluatorPipeline
+ from recnexteval.evaluators.builder import EvaluatorPipelineBuilder
+ from recnexteval.settings import Setting
+ from recnexteval.datasets import AmazonMusicDataset
+
+ # Load data and create setting
+ dataset = AmazonMusicDataset()
+ data = dataset.load()
+ setting = Setting(data=data, top_K=10)
+ setting.split()
+
+ # Build evaluator
+ builder = EvaluatorPipelineBuilder()
+ builder.add_setting(setting)
+ builder.set_metric_K(10)
+ builder.add_metric("PrecisionK")
+ builder.add_algorithm("MostPopular")
+ evaluator = builder.build()
+
+ # Run evaluation
+ evaluator.run()
+
+ # Get results
+ results = evaluator.metric_results(level="macro")
+ ```
+
+ Streaming Evaluation:
+
+ ```python
+ from recnexteval.evaluators import EvaluatorStreamer
+ from recnexteval.evaluators.builder import EvaluatorStreamerBuilder
+ from recnexteval.algorithms import MostPopular
+
+ # Build streaming evaluator
+ builder = EvaluatorStreamerBuilder()
+ builder.add_setting(setting)  # the setting prepared in the pipeline example
+ builder.set_metric_K(10)
+ builder.add_metric("HitK")
+ evaluator = builder.build()
+
+ # Start streaming
+ evaluator.start_stream()
+
+ algorithm = MostPopular()  # the algorithm to register
+ algo_id = evaluator.register_algorithm(algorithm)
+
+ # Stream evaluation loop
+ while True:
+     try:
+         # Get training data
+         training_data = evaluator.get_training_data(algo_id)
+
+         # Get unlabeled data
+         unlabeled_data = evaluator.get_unlabeled_data(algo_id)
+
+         # Algorithm makes predictions
+         predictions = algorithm.predict(unlabeled_data)
+
+         # Submit predictions
+         evaluator.submit_prediction(algo_id, predictions)
+
+     except EOWSettingError:
+         break
+ ```
+
+ Metric Levels
+ -------------
+
+ Evaluators support computing metrics at different aggregation levels:
+
+ - **User level**: Metrics computed per user across all timestamps
+ - **Window level**: Metrics computed per time window, averaging user scores within each window
+ - **Macro level**: Overall metrics averaging across all windows equally
+ - **Micro level**: Overall metrics averaging across all user-timestamp combinations equally
+
+ Available Evaluator Classes:
+ - `EvaluatorBase`: Base class providing common evaluation functionality
+ - `EvaluatorPipeline`: Pipeline-based batch evaluator
+ - `EvaluatorStreamer`: Interactive streaming evaluator
+
+ Accumulator
+ ===========
+
+ The `MetricAccumulator` class accumulates and stores metric results during
+ evaluation. The accumulator maintains a collection of metric objects organized
+ by algorithm name and provides methods for retrieving results in various formats.
+
+ Features:
+ - Storing metrics for multiple algorithms
+ - Computing aggregated results at different levels
+ - Exporting results to pandas DataFrames
+ - Filtering results by algorithm, timestamp, or metric type
+
+ Utility Classes
+ ===============
+
+ Utility classes that support the evaluation process:
+
+ - `MetricLevelEnum`: Enumeration for different metric aggregation levels
+ - `UserItemBaseStatus`: Tracks known and unknown users/items during evaluation
+
+ These utilities handle the complex state management required for streaming
+ evaluation scenarios.
+ """
+
+ from .accumulator import MetricAccumulator
+ from .base import EvaluatorBase
+ from .builder import (
+     Builder,
+     EvaluatorPipelineBuilder,
+     EvaluatorStreamerBuilder,
+ )
+ from .evaluator_pipeline import EvaluatorPipeline
+ from .evaluator_stream import EvaluatorStreamer
+ from .util import MetricLevelEnum, UserItemBaseStatus
+
+
+ __all__ = [
+     "Builder",
+     "EvaluatorPipelineBuilder",
+     "EvaluatorStreamerBuilder",
+     "EvaluatorBase",
+     "EvaluatorPipeline",
+     "EvaluatorStreamer",
+     "MetricAccumulator",
+     "MetricLevelEnum",
+     "UserItemBaseStatus",
+ ]
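As a follow-up to the Metric Levels list in the docstring above, results at the different aggregation levels could be pulled out as sketched below. Only `metric_results(level="macro")` appears in the docstring; the other level names are assumed to mirror `MetricLevelEnum` and may differ, and `evaluator` is the pipeline evaluator built and run in the earlier example.

```python
# Assumes `evaluator` was built and run as in the Pipeline Evaluation example.
# "macro" is documented above; the remaining level names are assumptions.
for level in ("user", "window", "macro", "micro"):
    results = evaluator.metric_results(level=level)
    print(f"--- {level} ---")
    print(results)
```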