recnexteval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import ClassVar, NoReturn
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from ..config.yelp import YelpDatasetConfig
|
|
8
|
+
from .base import Dataset
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class YelpDataset(Dataset):
    """Yelp dataset.

    The Yelp dataset contains user reviews of businesses. The main columns that
    will be used are:

    - user_id: The user identifier
    - business_id: The business identifier
    - stars: The rating given by the user to the business
    - date: The date of the review

    The dataset can be downloaded from https://www.yelp.com/dataset/download.
    It is distributed as a zip archive of JSON files, and scripts are available
    online to convert them to CSV. This class assumes that the review file has
    already been converted to a CSV file named
    `yelp_academic_dataset_review.csv`.

    Reference is made to the following code from the official Yelp repo:
    https://github.com/Yelp/dataset-examples/blob/master/json_to_csv_converter.py

    You can use the following command to convert the json file to a csv file:

    .. code-block:: shell

        python json_to_csv_converter.py yelp_academic_dataset_review.json
    """

    IS_BASE: bool = False
    config: ClassVar[YelpDatasetConfig] = YelpDatasetConfig()

    def _download_dataset(self) -> NoReturn:
        """Refuse to download and explain how to obtain the data manually.

        Raises:
            ValueError: Always. The message names the download URL and the
                expected filename, both taken from ``self.config``.
        """
        # Each fragment ends with a space: adjacent string literals are joined
        # verbatim, so without it the sentences run together.
        raise ValueError(
            "Yelp dataset has not been downloaded. Please head over "
            f"to {self.config.dataset_url} to download the dataset. "
            "As there is a license agreement, we cannot download it for you. "
            "Place the unzipped dataset under the data directory when done. "
            f"Expected filename: {self.config.default_filename}"
        )

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        Only the item, user, rating and timestamp columns named in
        ``self.config`` are read. Rows with a missing value or an unparseable
        timestamp are dropped, and the timestamp column is converted to whole
        seconds since the Unix epoch.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        timestamp_col = self.config.timestamp_ix
        df = pd.read_csv(
            self.file_path,
            dtype={
                self.config.item_ix: str,
                self.config.user_ix: str,
                self.config.rating_ix: np.float32,
                self.config.timestamp_ix: str,
            },
            usecols=[
                self.config.item_ix,
                self.config.user_ix,
                self.config.rating_ix,
                self.config.timestamp_ix,
            ],
            parse_dates=[self.config.timestamp_ix],
            date_format="%Y-%m-%d %H:%M:%S",
            header=0,
            sep=",",
            encoding="utf-8",
        )

        # Remove the byte-literal wrapper (b'...' or b"...") from the string
        # columns. Only values that actually carry the wrapper are rewritten,
        # so a CSV that was written with plain strings is left untouched
        # instead of losing its first two and last characters.
        bytes_literal = r"""^b(['"])(.*)\1$"""
        for col in df.select_dtypes(["object"]).columns:
            df[col] = df[col].str.replace(bytes_literal, r"\2", regex=True)

        # Parse the timestamp; anything that does not match the format
        # becomes NaT and is removed by the dropna below.
        df[timestamp_col] = pd.to_datetime(
            df[timestamp_col], format="%Y-%m-%d %H:%M:%S", errors="coerce"
        )
        df.dropna(inplace=True)

        # Convert to epoch seconds by dividing the offset from the epoch by one
        # second. Unlike casting to int64 and dividing by 10**9, this does not
        # depend on the datetime column having nanosecond resolution.
        df[timestamp_col] = (df[timestamp_col] - pd.Timestamp("1970-01-01")) // pd.Timedelta(
            seconds=1
        )

        return df

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Do nothing: this class loads no metadata for the Yelp dataset."""
        pass
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Metadata module for dataset information.
|
|
2
|
+
|
|
3
|
+
This module allows users to include metadata information corresponding to datasets.
|
|
4
|
+
Metadata classes are built on top of the `Metadata` base class, allowing for easy
|
|
5
|
+
extension and customization.
|
|
6
|
+
|
|
7
|
+
## Important Notes
|
|
8
|
+
|
|
9
|
+
User and item IDs in the metadata module are mapped according to RecNextEval's
|
|
10
|
+
internal mapping, not the original IDs. Developers should not load metadata from
|
|
11
|
+
source separately. Instead, implement the metadata class and load metadata while
|
|
12
|
+
loading the dataset.
|
|
13
|
+
|
|
14
|
+
## Available Metadata
|
|
15
|
+
|
|
16
|
+
- `Metadata`: Abstract base class for metadata implementations
|
|
17
|
+
- `MovieLens100kUserMetadata`: User metadata from MovieLens 100K dataset
|
|
18
|
+
- `MovieLens100kItemMetadata`: Item metadata from MovieLens 100K dataset
|
|
19
|
+
- `AmazonBookItemMetadata`: Item metadata from Amazon Books dataset
|
|
20
|
+
- `AmazonMovieItemMetadata`: Item metadata from Amazon Movies dataset
|
|
21
|
+
- `AmazonMusicItemMetadata`: Item metadata from Amazon Music dataset
|
|
22
|
+
- `LastFMUserMetadata`: User metadata from Last.FM dataset
|
|
23
|
+
- `LastFMItemMetadata`: Item metadata from Last.FM dataset
|
|
24
|
+
- `LastFMTagMetadata`: Tag metadata from Last.FM dataset
|
|
25
|
+
|
|
26
|
+
## Example
|
|
27
|
+
|
|
28
|
+
Load metadata from the MovieLens 100K dataset:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from recnexteval.datasets import MovieLens100K
|
|
32
|
+
|
|
33
|
+
dataset = MovieLens100K(fetch_dataset=True)
|
|
34
|
+
data = dataset.load()
|
|
35
|
+
```
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from .amazon import (
|
|
39
|
+
AmazonBookItemMetadata,
|
|
40
|
+
AmazonMovieItemMetadata,
|
|
41
|
+
AmazonMusicItemMetadata,
|
|
42
|
+
)
|
|
43
|
+
from .base import Metadata
|
|
44
|
+
from .lastfm import LastFMItemMetadata, LastFMTagMetadata, LastFMUserMetadata
|
|
45
|
+
from .movielens import MovieLens100kItemMetadata, MovieLens100kUserMetadata
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
__all__ = [
|
|
49
|
+
"Metadata",
|
|
50
|
+
"MovieLens100kUserMetadata",
|
|
51
|
+
"MovieLens100kItemMetadata",
|
|
52
|
+
"AmazonBookItemMetadata",
|
|
53
|
+
"AmazonMovieItemMetadata",
|
|
54
|
+
"AmazonMusicItemMetadata",
|
|
55
|
+
"LastFMUserMetadata",
|
|
56
|
+
"LastFMItemMetadata",
|
|
57
|
+
"LastFMTagMetadata",
|
|
58
|
+
]
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from recnexteval.datasets.config import (
|
|
8
|
+
AmazonBooksItemMetadataConfig,
|
|
9
|
+
AmazonDigitalMusicItemMetadataConfig,
|
|
10
|
+
AmazonItemMetadataConfig,
|
|
11
|
+
AmazonMoviesAndTVItemMetadataConfig,
|
|
12
|
+
AmazonSubscriptionBoxesItemMetadataConfig,
|
|
13
|
+
)
|
|
14
|
+
from .base import Metadata
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AmazonItemMetadata(Metadata):
    """Item metadata of an Amazon dataset, read from a JSON Lines file.

    The item identifier column is rewritten using the supplied
    ``item_id_mapping``, whose ``"iid"`` column holds the replacement ids.
    """

    config: ClassVar[AmazonItemMetadataConfig] = AmazonItemMetadataConfig()

    def __init__(self, item_id_mapping: pd.DataFrame) -> None:
        """Store the mapping used to translate item ids.

        Args:
            item_id_mapping: DataFrame with a column named
                ``config.item_ix`` and a column ``"iid"``.
        """
        super().__init__()
        self.item_id_mapping = item_id_mapping

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the metadata file and translate its item ids.

        Ids that have no entry in the mapping are kept unchanged.

        Returns:
            The metadata as a DataFrame.
        """
        self.fetch_dataset()

        # lines=True: the file is parsed as one JSON object per line.
        frame = pd.read_json(self.file_path, dtype=self.config.dtype_dict, lines=True)

        source_ids = self.item_id_mapping[self.config.item_ix]
        target_ids = self.item_id_mapping["iid"]
        lookup = dict(zip(source_ids, target_ids))

        def translate(raw_id):
            # Fall back to the raw id when the mapping does not know it.
            return lookup.get(raw_id, raw_id)

        frame[self.config.item_ix] = frame[self.config.item_ix].map(translate)
        return frame

    def _download_dataset(self) -> None:
        """Fetch the metadata file from ``config.dataset_url``.

        The target path is ``config.remote_filename`` inside ``self.base_path``.

        Raises:
            ValueError: If the config has no dataset URL.
        """
        if not self.config.dataset_url:
            raise ValueError(f"{self.name} does not have URL specified.")

        destination = os.path.join(self.base_path, f"{self.config.remote_filename}")
        self._fetch_remote(self.config.dataset_url, destination)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class AmazonMusicItemMetadata(AmazonItemMetadata):
    """Amazon item metadata configured by `AmazonDigitalMusicItemMetadataConfig`."""

    config: ClassVar[AmazonDigitalMusicItemMetadataConfig] = AmazonDigitalMusicItemMetadataConfig()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class AmazonMovieItemMetadata(AmazonItemMetadata):
    """Amazon item metadata configured by `AmazonMoviesAndTVItemMetadataConfig`."""

    config: ClassVar[AmazonMoviesAndTVItemMetadataConfig] = AmazonMoviesAndTVItemMetadataConfig()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class AmazonSubscriptionBoxesItemMetadata(AmazonItemMetadata):
    """Amazon item metadata configured by `AmazonSubscriptionBoxesItemMetadataConfig`."""

    config: ClassVar[AmazonSubscriptionBoxesItemMetadataConfig] = (
        AmazonSubscriptionBoxesItemMetadataConfig()
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class AmazonBookItemMetadata(AmazonItemMetadata):
    """Amazon item metadata configured by `AmazonBooksItemMetadataConfig`."""

    config: ClassVar[AmazonBooksItemMetadataConfig] = AmazonBooksItemMetadataConfig()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import ClassVar, Optional
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from recnexteval.datasets.base import DataFetcher
|
|
7
|
+
from recnexteval.datasets.config import MetadataConfig
|
|
8
|
+
from recnexteval.utils.path import safe_dir
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Metadata(DataFetcher):
    """Base class for metadata loaders.

    Resolves the base path and filename (falling back to the defaults in
    ``config``) and exposes `load`, which delegates to ``_load_dataframe``.
    """

    config: ClassVar[MetadataConfig] = MetadataConfig()

    def __init__(
        self,
        filename: Optional[str] = None,
        base_path: Optional[str] = None,
    ) -> None:
        """Resolve where the metadata file lives.

        Args:
            filename: Name of the metadata file. A falsy value selects
                ``config.default_filename``.
            base_path: Directory holding the file. A falsy value selects
                ``config.default_base_path``.

        Raises:
            ValueError: If neither ``filename`` nor the config default
                provides a filename.
        """
        self.base_path = base_path or self.config.default_base_path
        logger.debug(f"{self.name} being initialized with '{self.base_path}' as the base path.")

        self.filename = filename or self.config.default_filename
        if not self.filename:
            raise ValueError("No filename specified, and no default known.")

        # NOTE(review): safe_dir presumably creates the directory if it is
        # missing; confirm in recnexteval.utils.path.
        safe_dir(self.base_path)
        logger.debug(f"{self.name} is initialized.")

    def load(self) -> pd.DataFrame:
        """Load the metadata from file and return it as a DataFrame.

        Returns:
            DataFrame containing the metadata.
        """
        metadata_frame = self._load_dataframe()
        return metadata_frame
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import zipfile
|
|
4
|
+
from typing import ClassVar
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from recnexteval.datasets.config import (
|
|
9
|
+
LastFMItemMetadataConfig,
|
|
10
|
+
LastFMTagMetadataConfig,
|
|
11
|
+
LastFMUserMetadataConfig,
|
|
12
|
+
)
|
|
13
|
+
from .base import Metadata
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LastFMMetadata(Metadata):
    """Download helper for Last.FM metadata files.

    NOTE(review): the other Last.FM metadata classes in this module derive
    from `Metadata` directly rather than from this class, so they do not
    inherit `_download_dataset` from here. Confirm whether that is intended.
    """

    config: ClassVar[LastFMUserMetadataConfig] = LastFMUserMetadataConfig()  # type: ignore

    def _download_dataset(self) -> None:
        """Download the zip archive and extract the metadata file.

        The archive ``<remote_filename>.zip`` is fetched from
        ``config.dataset_url`` into ``self.base_path``, and the member named
        ``config.remote_filename`` is extracted into the same directory.
        """
        archive_name = f"{self.config.remote_filename}.zip"
        archive_path = os.path.join(self.base_path, archive_name)

        # Download the zip into the data directory.
        self._fetch_remote(f"{self.config.dataset_url}/{archive_name}", archive_path)

        # Extract only the file this class reads.
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extract(f"{self.config.remote_filename}", self.base_path)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LastFMUserMetadata(Metadata):
    """User metadata of the Last.FM dataset.

    The user and friend columns are rewritten using ``user_id_mapping``,
    whose ``"uid"`` column holds the replacement ids.
    """

    config: ClassVar[LastFMUserMetadataConfig] = LastFMUserMetadataConfig()  # type: ignore

    def __init__(self, user_id_mapping: pd.DataFrame) -> None:
        """Store the mapping used to translate user ids.

        Args:
            user_id_mapping: DataFrame with a column named
                ``config.user_ix`` and a column ``"uid"``.
        """
        super().__init__()
        self.user_id_mapping = user_id_mapping

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the metadata file and translate user and friend ids.

        Returns:
            The metadata as a DataFrame.
        """
        self.fetch_dataset()

        # Build the lookup once per load. The converter below runs for every
        # value of the converted columns, so rebuilding the dict inside it
        # (as `_map_user_id` does) costs a full pass over the mapping per cell.
        lookup = self._user_id_lookup()

        def to_internal(raw_id):
            return lookup.get(int(raw_id), raw_id)

        df = pd.read_csv(
            self.file_path,
            sep=self.config.sep,
            names=self.config.column_names,
            converters={
                self.config.user_ix: to_internal,
                self.config.friend_ix: to_internal,
            },
            header=0,
        )
        return df

    def _user_id_lookup(self) -> dict:
        """Return a dict from the mapping's ``config.user_ix`` values to ``"uid"`` values."""
        return dict(zip(self.user_id_mapping[self.config.user_ix], self.user_id_mapping["uid"]))

    def _map_user_id(self, user_id):
        """Translate a single user id.

        Args:
            user_id: Value convertible with ``int``.

        Returns:
            The mapped ``"uid"`` value, or ``user_id`` unchanged when the
            mapping has no entry for it.

        Raises:
            ValueError: If ``user_id`` cannot be converted to ``int``.
        """
        return self._user_id_lookup().get(int(user_id), user_id)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class LastFMItemMetadata(Metadata):
    """Item metadata of the Last.FM dataset.

    The item column is rewritten using ``item_id_mapping``, whose
    ``"artistID"`` column holds the source ids and ``"iid"`` column the
    replacement ids.
    """

    config: ClassVar[LastFMItemMetadataConfig] = LastFMItemMetadataConfig()  # type: ignore

    def __init__(self, item_id_mapping: pd.DataFrame) -> None:
        """Store the mapping used to translate item ids.

        Args:
            item_id_mapping: DataFrame with columns ``"artistID"`` and ``"iid"``.
        """
        super().__init__()
        self.item_id_mapping = item_id_mapping

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the metadata file and translate its item ids.

        Returns:
            The metadata as a DataFrame.
        """
        self.fetch_dataset()

        # Build the lookup once per load. The converter below runs for every
        # value of the item column, so rebuilding the dict inside it (as
        # `_map_item_id` does) costs a full pass over the mapping per cell.
        lookup = self._item_id_lookup()

        def to_internal(raw_id):
            return lookup.get(int(raw_id), raw_id)

        df = pd.read_csv(
            self.file_path,
            dtype=self.config.dtype_dict,
            sep=self.config.sep,
            names=self.config.column_names,
            converters={
                self.config.item_ix: to_internal,
            },
            header=0,
        )
        return df

    def _item_id_lookup(self) -> dict:
        """Return a dict from the mapping's ``"artistID"`` values to ``"iid"`` values."""
        return dict(zip(self.item_id_mapping["artistID"], self.item_id_mapping["iid"]))

    def _map_item_id(self, item_id):
        """Translate a single item id.

        Args:
            item_id: Value convertible with ``int``.

        Returns:
            The mapped ``"iid"`` value, or ``item_id`` unchanged when the
            mapping has no entry for it.

        Raises:
            ValueError: If ``item_id`` cannot be converted to ``int``.
        """
        return self._item_id_lookup().get(int(item_id), item_id)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class LastFMTagMetadata(Metadata):
    """Tag metadata of the Last.FM dataset; no id translation is applied."""

    config: ClassVar[LastFMTagMetadataConfig] = LastFMTagMetadataConfig()  # type: ignore

    def __init__(self) -> None:
        """Initialize with the default filename and base path from the config."""
        super().__init__()

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the tag file, decoded as ISO-8859-1.

        Returns:
            The tag metadata as a DataFrame.
        """
        self.fetch_dataset()

        read_options = {
            "dtype": self.config.dtype_dict,
            "sep": self.config.sep,
            "names": self.config.column_names,
            "encoding": "ISO-8859-1",
            # The file's own header row is replaced by `names`.
            "header": 0,
        }
        return pd.read_csv(self.file_path, **read_options)
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import zipfile
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from recnexteval.datasets.config import (
|
|
8
|
+
MovieLens100kItemMetadataConfig,
|
|
9
|
+
MovieLens100kUserMetadataConfig,
|
|
10
|
+
)
|
|
11
|
+
from .base import Metadata
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MovieLens100kMetadata(Metadata):
    """Shared download logic for the MovieLens 100K metadata files."""

    def _download_dataset(self) -> None:
        """Download the zip archive and place the metadata file at ``self.file_path``.

        The archive ``<remote_zipname>.zip`` is fetched from
        ``config.dataset_url`` into ``self.base_path``. The member
        ``<remote_zipname>/<remote_filename>`` is extracted there and then
        renamed to ``self.file_path``.
        """
        archive_name = f"{self.config.remote_zipname}.zip"
        archive_path = os.path.join(self.base_path, archive_name)

        # Download the zip into the data directory.
        self._fetch_remote(f"{self.config.dataset_url}/{archive_name}", archive_path)

        # Extract only the file this class reads.
        member = f"{self.config.remote_zipname}/{self.config.remote_filename}"
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extract(member, self.base_path)

        # Move the extracted file to the configured filename.
        extracted_path = os.path.join(self.base_path, member)
        os.rename(extracted_path, self.file_path)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class MovieLens100kUserMetadata(MovieLens100kMetadata):
    """User metadata of the MovieLens 100K dataset.

    The user column is rewritten using ``user_id_mapping``, whose ``"uid"``
    column holds the replacement ids.
    """

    config: ClassVar[MovieLens100kUserMetadataConfig] = MovieLens100kUserMetadataConfig()  # type: ignore

    def __init__(self, user_id_mapping: pd.DataFrame) -> None:
        """Store the mapping used to translate user ids.

        Args:
            user_id_mapping: DataFrame with a column named
                ``config.user_ix`` and a column ``"uid"``.
        """
        super().__init__()
        self.user_id_mapping = user_id_mapping

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the metadata file and translate its user ids.

        Returns:
            The metadata as a DataFrame.
        """
        self.fetch_dataset()

        # Build the lookup once per load. The converter below runs for every
        # value of the user column, so rebuilding the dict inside it (as
        # `_map_user_id` does) costs a full pass over the mapping per cell.
        lookup = self._user_id_lookup()

        def to_internal(raw_id):
            return lookup.get(int(raw_id), raw_id)

        df = pd.read_table(
            self.file_path,
            dtype=self.config.dtype_dict,
            sep=self.config.sep,
            names=self.config.column_names,
            converters={self.config.user_ix: to_internal},
        )
        return df

    def _user_id_lookup(self) -> dict:
        """Return a dict from the mapping's ``config.user_ix`` values to ``"uid"`` values."""
        return dict(zip(self.user_id_mapping[self.config.user_ix], self.user_id_mapping["uid"]))

    def _map_user_id(self, user_id):
        """Translate a single user id.

        Args:
            user_id: Value convertible with ``int``.

        Returns:
            The mapped ``"uid"`` value, or ``user_id`` unchanged when the
            mapping has no entry for it.

        Raises:
            ValueError: If ``user_id`` cannot be converted to ``int``.
        """
        return self._user_id_lookup().get(int(user_id), user_id)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class MovieLens100kItemMetadata(MovieLens100kMetadata):
    """Item metadata of the MovieLens 100K dataset.

    The item column is rewritten using ``item_id_mapping``, whose ``"iid"``
    column holds the replacement ids.
    """

    config: ClassVar[MovieLens100kItemMetadataConfig] = MovieLens100kItemMetadataConfig()  # type: ignore

    def __init__(self, item_id_mapping: pd.DataFrame) -> None:
        """Store the mapping used to translate item ids.

        Args:
            item_id_mapping: DataFrame with a column named
                ``config.item_ix`` and a column ``"iid"``.
        """
        super().__init__()
        self.item_id_mapping = item_id_mapping

    def _load_dataframe(self) -> pd.DataFrame:
        """Read the metadata file and translate its item ids.

        The file is decoded with ``config.encoding``.

        Returns:
            The metadata as a DataFrame.
        """
        self.fetch_dataset()

        # Build the lookup once per load. The converter below runs for every
        # value of the item column, so rebuilding the dict inside it (as
        # `_map_item_id` does) costs a full pass over the mapping per cell.
        lookup = self._item_id_lookup()

        def to_internal(raw_id):
            return lookup.get(int(raw_id), raw_id)

        df = pd.read_table(
            self.file_path,
            dtype=self.config.dtype_dict,
            sep=self.config.sep,
            names=self.config.column_names,
            converters={self.config.item_ix: to_internal},
            encoding=self.config.encoding,
        )
        return df

    def _item_id_lookup(self) -> dict:
        """Return a dict from the mapping's ``config.item_ix`` values to ``"iid"`` values."""
        return dict(zip(self.item_id_mapping[self.config.item_ix], self.item_id_mapping["iid"]))

    def _map_item_id(self, item_id):
        """Translate a single item id.

        Args:
            item_id: Value convertible with ``int``.

        Returns:
            The mapped ``"iid"`` value, or ``item_id`` unchanged when the
            mapping has no entry for it.

        Raises:
            ValueError: If ``item_id`` cannot be converted to ``int``.
        """
        return self._item_id_lookup().get(int(item_id), item_id)
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Evaluator module for algorithm evaluation in streaming settings.
|
|
2
|
+
|
|
3
|
+
This module provides classes and utilities for evaluating recommendation algorithms
|
|
4
|
+
in streaming environments. It supports both batch pipeline evaluation and interactive
|
|
5
|
+
streaming evaluation with comprehensive metric computation and result analysis.
|
|
6
|
+
|
|
7
|
+
Evaluator Builder
|
|
8
|
+
=================
|
|
9
|
+
|
|
10
|
+
The evaluator module contains builder classes for constructing evaluator objects.
|
|
11
|
+
The builders provide a fluent API for configuring evaluators with proper validation
|
|
12
|
+
and error checking.
|
|
13
|
+
|
|
14
|
+
For detailed information about the builder classes and usage examples,
|
|
15
|
+
see the `recnexteval.evaluators.builder` module.
|
|
16
|
+
|
|
17
|
+
Available Builders:
|
|
18
|
+
- `Builder`: Abstract base class for all builder implementations
|
|
19
|
+
- `EvaluatorPipelineBuilder`: Builder for pipeline evaluators
|
|
20
|
+
- `EvaluatorStreamerBuilder`: Builder for streaming evaluators
|
|
21
|
+
|
|
22
|
+
Evaluator Classes
|
|
23
|
+
=================
|
|
24
|
+
|
|
25
|
+
The core evaluator classes handle the evaluation of recommendation algorithms
|
|
26
|
+
on streaming data. The evaluators manage data splitting, algorithm training,
|
|
27
|
+
prediction generation, and metric computation.
|
|
28
|
+
|
|
29
|
+
EvaluatorPipeline
|
|
30
|
+
-----------------
|
|
31
|
+
|
|
32
|
+
For batch evaluation of multiple algorithms on static or sliding window settings.
|
|
33
|
+
This evaluator runs algorithms through a complete pipeline including training,
|
|
34
|
+
prediction, and evaluation phases.
|
|
35
|
+
|
|
36
|
+
EvaluatorStreamer
|
|
37
|
+
-----------------
|
|
38
|
+
|
|
39
|
+
For interactive streaming evaluation where algorithms can be registered and
|
|
40
|
+
evaluated in real-time as data streams in. This allows for more flexible
|
|
41
|
+
evaluation scenarios where algorithms can request data and submit predictions
|
|
42
|
+
asynchronously.
|
|
43
|
+
|
|
44
|
+
Both evaluators inherit from `EvaluatorBase` which provides common functionality
|
|
45
|
+
for metric computation, data masking, and result aggregation.
|
|
46
|
+
|
|
47
|
+
Key Features
|
|
48
|
+
------------
|
|
49
|
+
|
|
50
|
+
- **Multi-algorithm evaluation**: Evaluate multiple algorithms simultaneously
|
|
51
|
+
- **Streaming support**: Handle temporal data streams with sliding windows
|
|
52
|
+
- **Metric aggregation**: Compute metrics at different levels (user, window, macro, micro)
|
|
53
|
+
- **Data masking**: Properly handle unknown users and items during evaluation
|
|
54
|
+
- **Result analysis**: Rich DataFrame outputs for metric analysis and comparison
|
|
55
|
+
|
|
56
|
+
Basic Usage
|
|
57
|
+
-----------
|
|
58
|
+
|
|
59
|
+
Pipeline Evaluation:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from recnexteval.evaluators import EvaluatorPipeline
|
|
63
|
+
from recnexteval.evaluators.builder import EvaluatorPipelineBuilder
|
|
64
|
+
from recnexteval.settings import Setting
|
|
65
|
+
from recnexteval.datasets import AmazonMusicDataset
|
|
66
|
+
|
|
67
|
+
# Load data and create setting
|
|
68
|
+
dataset = AmazonMusicDataset()
|
|
69
|
+
data = dataset.load()
|
|
70
|
+
setting = Setting(data=data, top_K=10)
|
|
71
|
+
setting.split()
|
|
72
|
+
|
|
73
|
+
# Build evaluator
|
|
74
|
+
builder = EvaluatorPipelineBuilder()
|
|
75
|
+
builder.add_setting(setting)
|
|
76
|
+
builder.set_metric_K(10)
|
|
77
|
+
builder.add_metric("PrecisionK")
|
|
78
|
+
builder.add_algorithm("MostPopular")
|
|
79
|
+
evaluator = builder.build()
|
|
80
|
+
|
|
81
|
+
# Run evaluation
|
|
82
|
+
evaluator.run()
|
|
83
|
+
|
|
84
|
+
# Get results
|
|
85
|
+
results = evaluator.metric_results(level="macro")
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Streaming Evaluation:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from recnexteval.evaluators import EvaluatorStreamer
|
|
92
|
+
from recnexteval.evaluators.builder import EvaluatorStreamerBuilder
|
|
93
|
+
from recnexteval.algorithms import MostPopular
|
|
94
|
+
|
|
95
|
+
# Build streaming evaluator
|
|
96
|
+
builder = EvaluatorStreamerBuilder()
|
|
97
|
+
builder.add_setting(setting)
|
|
98
|
+
builder.set_metric_K(10)
|
|
99
|
+
builder.add_metric("HitK")
|
|
100
|
+
evaluator = builder.build()
|
|
101
|
+
|
|
102
|
+
# Start streaming
|
|
103
|
+
evaluator.start_stream()
|
|
104
|
+
|
|
105
|
+
# Register algorithm
|
|
106
|
+
algorithm = MostPopular()
algo_id = evaluator.register_algorithm(algorithm)
|
|
107
|
+
|
|
108
|
+
# Stream evaluation loop
|
|
109
|
+
while True:
|
|
110
|
+
try:
|
|
111
|
+
# Get training data
|
|
112
|
+
training_data = evaluator.get_training_data(algo_id)
|
|
113
|
+
|
|
114
|
+
# Get unlabeled data
|
|
115
|
+
unlabeled_data = evaluator.get_unlabeled_data(algo_id)
|
|
116
|
+
|
|
117
|
+
# Algorithm makes predictions
|
|
118
|
+
predictions = algorithm.predict(unlabeled_data)
|
|
119
|
+
|
|
120
|
+
# Submit predictions
|
|
121
|
+
evaluator.submit_prediction(algo_id, predictions)
|
|
122
|
+
|
|
123
|
+
except EOWSettingError:
|
|
124
|
+
break
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Metric Levels
|
|
128
|
+
-------------
|
|
129
|
+
|
|
130
|
+
Evaluators support computing metrics at different aggregation levels:
|
|
131
|
+
|
|
132
|
+
- **User level**: Metrics computed per user across all timestamps
|
|
133
|
+
- **Window level**: Metrics computed per time window, averaging user scores within each window
|
|
134
|
+
- **Macro level**: Overall metrics averaging across all windows equally
|
|
135
|
+
- **Micro level**: Overall metrics averaging across all user-timestamp combinations equally
|
|
136
|
+
|
|
137
|
+
Available Evaluator Classes:
|
|
138
|
+
- `EvaluatorBase`: Base class providing common evaluation functionality
|
|
139
|
+
- `EvaluatorPipeline`: Pipeline-based batch evaluator
|
|
140
|
+
- `EvaluatorStreamer`: Interactive streaming evaluator
|
|
141
|
+
|
|
142
|
+
Accumulator
|
|
143
|
+
===========
|
|
144
|
+
|
|
145
|
+
The `MetricAccumulator` class accumulates and stores metric results during
|
|
146
|
+
evaluation. The accumulator maintains a collection of metric objects organized
|
|
147
|
+
by algorithm name and provides methods for retrieving results in various formats.
|
|
148
|
+
|
|
149
|
+
Features:
|
|
150
|
+
- Storing metrics for multiple algorithms
|
|
151
|
+
- Computing aggregated results at different levels
|
|
152
|
+
- Exporting results to pandas DataFrames
|
|
153
|
+
- Filtering results by algorithm, timestamp, or metric type
|
|
154
|
+
|
|
155
|
+
Utility Classes
|
|
156
|
+
===============
|
|
157
|
+
|
|
158
|
+
Utility classes that support the evaluation process:
|
|
159
|
+
|
|
160
|
+
- `MetricLevelEnum`: Enumeration for different metric aggregation levels
|
|
161
|
+
- `UserItemBaseStatus`: Tracks known and unknown users/items during evaluation
|
|
162
|
+
|
|
163
|
+
These utilities handle the complex state management required for streaming
|
|
164
|
+
evaluation scenarios.
|
|
165
|
+
"""
|
|
166
|
+
|
|
167
|
+
from .accumulator import MetricAccumulator
|
|
168
|
+
from .base import EvaluatorBase
|
|
169
|
+
from .builder import (
|
|
170
|
+
Builder,
|
|
171
|
+
EvaluatorPipelineBuilder,
|
|
172
|
+
EvaluatorStreamerBuilder,
|
|
173
|
+
)
|
|
174
|
+
from .evaluator_pipeline import EvaluatorPipeline
|
|
175
|
+
from .evaluator_stream import EvaluatorStreamer
|
|
176
|
+
from .util import MetricLevelEnum, UserItemBaseStatus
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
__all__ = [
|
|
180
|
+
"Builder",
|
|
181
|
+
"EvaluatorPipelineBuilder",
|
|
182
|
+
"EvaluatorStreamerBuilder",
|
|
183
|
+
"EvaluatorBase",
|
|
184
|
+
"EvaluatorPipeline",
|
|
185
|
+
"EvaluatorStreamer",
|
|
186
|
+
"MetricAccumulator",
|
|
187
|
+
"MetricLevelEnum",
|
|
188
|
+
"UserItemBaseStatus",
|
|
189
|
+
]
|