recnexteval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import ClassVar
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from ..config import (
|
|
8
|
+
AmazonBookDatasetConfig,
|
|
9
|
+
AmazonDatasetConfig,
|
|
10
|
+
AmazonMovieDatasetConfig,
|
|
11
|
+
AmazonMusicDatasetConfig,
|
|
12
|
+
AmazonSubscriptionBoxesDatasetConfig,
|
|
13
|
+
)
|
|
14
|
+
from ..metadata.amazon import (
|
|
15
|
+
AmazonBookItemMetadata,
|
|
16
|
+
AmazonMovieItemMetadata,
|
|
17
|
+
AmazonMusicItemMetadata,
|
|
18
|
+
AmazonSubscriptionBoxesItemMetadata,
|
|
19
|
+
)
|
|
20
|
+
from .base import Dataset
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class AmazonDataset(Dataset):
    """Base class for Amazon datasets.

    Other Amazon datasets should inherit from this class and provide their
    own ``config``.
    """

    ITEM_METADATA = None
    IS_BASE: bool = True

    config: ClassVar[AmazonDatasetConfig] = AmazonDatasetConfig()

    def _download_dataset(self) -> None:
        """Download the raw dataset file.

        Fetches ``config.dataset_url`` and stores the result at
        ``self.file_path``.

        Raises:
            ValueError: If the config does not specify a dataset URL.
        """
        if not self.config.dataset_url:
            raise ValueError(f"{self.name} does not have URL specified in config.")

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")
        self._fetch_remote(
            self.config.dataset_url,
            self.file_path,
        )

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        The file at ``self.file_path`` is read as JSON lines in chunks of
        100,000 records. Only the item, user, timestamp, rating and
        helpful-vote columns named in ``config`` are kept, and the timestamp
        column is floor-divided by 1e9.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        # Read JSONL in chunks and show progress per chunk. We import tqdm
        # locally to avoid global pandas monkeypatching (`tqdm.pandas()`).
        from tqdm.auto import tqdm

        columns = [
            self.config.item_ix,
            self.config.user_ix,
            self.config.timestamp_ix,
            self.config.rating_ix,
            self.config.helpful_vote_ix,
        ]

        # The chunked reader is a context manager; leaving the block closes
        # the underlying file handle deterministically.
        with pd.read_json(
            self.file_path,
            dtype={
                self.config.item_ix: str,
                self.config.user_ix: str,
                self.config.timestamp_ix: np.int64,
                self.config.rating_ix: np.float32,
                self.config.helpful_vote_ix: np.int64,
            },
            lines=True,
            chunksize=100_000,
        ) as reader:
            # Drop the columns that are discarded anyway while each chunk is
            # still small, so they are never held in memory for the whole file.
            # The boolean mask keeps whichever wanted columns a chunk has, so a
            # column that is absent from one chunk is still NaN-filled by
            # concat exactly as before.
            trimmed = [
                chunk.loc[:, chunk.columns.isin(columns)]
                for chunk in tqdm(reader, desc="Reading JSONL", unit="chunk")
            ]

        df = pd.concat(trimmed, ignore_index=True)

        # Fix the column order; raises KeyError if a column is missing everywhere.
        df = df[columns]

        # Convert nanosecond timestamps to seconds
        df[self.config.timestamp_ix] = df[self.config.timestamp_ix] // 1_000_000_000

        logger.debug(f"Loaded {len(df)} interactions")
        return df
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class AmazonMusicDataset(AmazonDataset):
    """Amazon Music dataset.

    Concrete (non-base) Amazon dataset configured through
    :class:`AmazonMusicDatasetConfig`.
    """

    IS_BASE: bool = False

    config: ClassVar[AmazonMusicDatasetConfig] = AmazonMusicDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        Args:
            user_id_mapping: Not used by this dataset.
            item_id_mapping: Forwarded to :class:`AmazonMusicItemMetadata`.
        """
        metadata_source = AmazonMusicItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_source.load()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class AmazonMovieDataset(AmazonDataset):
    """Amazon Movie dataset.

    Concrete (non-base) Amazon dataset configured through
    :class:`AmazonMovieDatasetConfig`.
    """

    IS_BASE: bool = False

    config: ClassVar[AmazonMovieDatasetConfig] = AmazonMovieDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        Args:
            user_id_mapping: Not used by this dataset.
            item_id_mapping: Forwarded to :class:`AmazonMovieItemMetadata`.
        """
        metadata_source = AmazonMovieItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_source.load()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class AmazonSubscriptionBoxesDataset(AmazonDataset):
    """Amazon Subscription Boxes dataset.

    Concrete (non-base) Amazon dataset configured through
    :class:`AmazonSubscriptionBoxesDatasetConfig`.
    """

    IS_BASE: bool = False

    config: ClassVar[AmazonSubscriptionBoxesDatasetConfig] = AmazonSubscriptionBoxesDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        Args:
            user_id_mapping: Not used by this dataset.
            item_id_mapping: Forwarded to
                :class:`AmazonSubscriptionBoxesItemMetadata`.
        """
        metadata_source = AmazonSubscriptionBoxesItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_source.load()
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class AmazonBookDataset(AmazonDataset):
    """Amazon Book dataset.

    Concrete (non-base) Amazon dataset configured through
    :class:`AmazonBookDatasetConfig`.
    """

    IS_BASE: bool = False

    config: ClassVar[AmazonBookDatasetConfig] = AmazonBookDatasetConfig()

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load the item metadata and store it on ``self.ITEM_METADATA``.

        Args:
            user_id_mapping: Not used by this dataset.
            item_id_mapping: Forwarded to :class:`AmazonBookItemMetadata`.
        """
        metadata_source = AmazonBookItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = metadata_source.load()
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import time
|
|
4
|
+
from abc import abstractmethod
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import ClassVar
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from recnexteval.matrix import InteractionMatrix
|
|
11
|
+
from recnexteval.preprocessing.filter import Filter, MinItemsPerUser, MinUsersPerItem
|
|
12
|
+
from recnexteval.preprocessing.preprocessor import DataFramePreprocessor
|
|
13
|
+
from recnexteval.utils.path import safe_dir
|
|
14
|
+
from ..base import DataFetcher
|
|
15
|
+
from ..config import DatasetConfig
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Dataset(DataFetcher):
    """Represents a collaborative filtering dataset.

    Dataset must minimally contain user, item and timestamp columns for the
    other modules to work.

    Assumption
    ===========
    User/item ID increments in the order of time. This is an assumption that will
    be made for the purposes of splitting the dataset and eventually passing
    the dataset to the model. The ID incrementing in the order of time allows us
    to set the shape of the currently known user and item matrix allowing easier
    manipulation of the data by the evaluator.

    The file name and base path are taken from :attr:`config`
    (``default_filename`` and ``default_base_path``).

    :param use_default_filters: If True, the default filters will be applied to the dataset.
        Defaults to False.
    :type use_default_filters: bool, optional
    :param fetch_metadata: If True, :meth:`load` also calls
        :meth:`_fetch_dataset_metadata`. Defaults to False.
    :type fetch_metadata: bool, optional
    :raises AttributeError: If ``user_ix``, ``item_ix`` or ``timestamp_ix`` is
        not set in the config.
    :raises ValueError: If the config has no ``default_filename``.
    """

    config: ClassVar[DatasetConfig] = DatasetConfig()
    """Configuration for the dataset."""

    def __init__(
        self,
        use_default_filters: bool = False,  # noqa: FBT001, FBT002
        fetch_metadata: bool = False,  # noqa: FBT001, FBT002
    ) -> None:
        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
            raise AttributeError("user_ix, item_ix or timestamp_ix not set in config.")

        logger.debug(
            f"{self.name} being initialized with '{self.config.default_base_path}' as the base path."
        )

        if not self.config.default_filename:
            raise ValueError("No filename specified, and no default known.")

        self.fetch_metadata = fetch_metadata
        self.preprocessor = DataFramePreprocessor(
            self.config.item_ix, self.config.user_ix, self.config.timestamp_ix
        )
        # Populated by load(); the public properties raise until then.
        self._timestamp_min: int | None = None
        self._timestamp_max: int | None = None

        if use_default_filters:
            for f in self._default_filters:
                self.add_filter(f)

        safe_dir(self.config.default_base_path)
        logger.debug(f"{self.name} is initialized.")

    @property
    def _default_filters(self) -> list[Filter]:
        """The default filters for all datasets

        Concrete classes can override this property to add more filters.

        Returns:
            List of filters to be applied to the dataset.

        Raises:
            AttributeError: If ``config.user_ix`` or ``config.item_ix`` is not set.
        """
        if not self.config.user_ix or not self.config.item_ix:
            raise AttributeError("config.user_ix or config.item_ix not set.")

        return [
            MinItemsPerUser(
                min_items_per_user=3,
                item_ix=self.config.item_ix,
                user_ix=self.config.user_ix,
            ),
            MinUsersPerItem(
                min_users_per_item=3,
                item_ix=self.config.item_ix,
                user_ix=self.config.user_ix,
            ),
        ]

    @property
    def timestamp_min(self) -> int:
        """Minimum timestamp in the dataset.

        Returns:
            Minimum timestamp in the dataset.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if self._timestamp_min is None:
            raise RuntimeError("timestamp_min can only be accessed after load() has been called.")
        return self._timestamp_min

    @property
    def timestamp_max(self) -> int:
        """Maximum timestamp in the dataset.

        Returns:
            Maximum timestamp in the dataset.

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        if self._timestamp_max is None:
            raise RuntimeError("timestamp_max can only be accessed after load() has been called.")
        return self._timestamp_max

    def get_timestamp_range_in_epoch(self) -> tuple[int, int]:
        """Get the minimum and maximum timestamps in the dataset.

        Returns:
            A tuple of (min_timestamp, max_timestamp).

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        return self.timestamp_min, self.timestamp_max

    def get_timestamp_range_in_datetime(self) -> tuple[datetime, datetime]:
        """Get the minimum and maximum timestamps in the dataset as datetimes.

        The values are converted with :meth:`datetime.fromtimestamp` without a
        ``tz`` argument, so the results are naive datetimes in local time.

        Returns:
            A tuple of (min_timestamp, max_timestamp).

        Raises:
            RuntimeError: If load() has not been called yet.
        """
        min_dt = datetime.fromtimestamp(self.timestamp_min)
        max_dt = datetime.fromtimestamp(self.timestamp_max)
        return min_dt, max_dt

    def add_filter(self, filter_: Filter) -> None:
        """Add a filter to be applied when loading the data.

        Utilize :class:`DataFramePreprocessor` class to add filters to the
        dataset to load. The filter will be applied when the data is loaded into
        an :class:`InteractionMatrix` object when :meth:`load` is called.

        :param filter_: Filter to be applied to the loaded DataFrame
            processing to interaction matrix.
        :type filter_: Filter
        """
        self.preprocessor.add_filter(filter_)

    def _load_dataframe_from_cache(self) -> pd.DataFrame:
        """Read the processed-cache parquet file.

        Returns:
            The cached DataFrame.

        Raises:
            FileNotFoundError: If ``self.processed_cache_path`` does not exist.
        """
        if not os.path.exists(self.processed_cache_path):
            raise FileNotFoundError("Processed cache file not found.")
        logger.info(f"Loading from cache: {self.processed_cache_path}")
        df = pd.read_parquet(self.processed_cache_path)
        return df

    def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
        """Loads data into an InteractionMatrix object.

        Data is loaded into a DataFrame using the :func:`_load_dataframe` function.
        Resulting DataFrame is parsed into an :class:`InteractionMatrix` object. If
        :data:`apply_filters` is set to True, the filters set will be applied to the
        dataset and mapping of user and item ids will be done. This is advised
        even if there is no filter set, as it will ensure that the user and item
        ids are incrementing in the order of time.

        When :data:`use_cache` is True the processed cache is tried first; on a
        cache miss the raw data is loaded and written to the cache. When it is
        False the raw data is loaded directly and the cache is not touched.

        Args:
            apply_filters: To apply the filters set and preprocessing,
                defaults to True
            use_cache: Whether to use cached processed data, defaults to True

        Returns:
            Resulting interaction matrix.
        """
        logger.info(f"{self.name} is loading dataset...")
        start = time.time()

        # Only the cache read is guarded: a FileNotFoundError raised by the raw
        # loader must not be misreported as a cache miss and silently retried.
        df: pd.DataFrame | None = None
        if use_cache:
            try:
                df = self._load_dataframe_from_cache()
            except FileNotFoundError:
                logger.warning("Processed cache not found, loading raw dataframe.")
        if df is None:
            df = self._load_dataframe()
            if use_cache:
                self._cache_processed_dataframe(df)

        if apply_filters:
            logger.debug(f"{self.name} applying filters set.")
            im = self.preprocessor.process(df)
        else:
            im = self._dataframe_to_matrix(df)
            logger.warning(
                "No filters applied, user and item ids may not be incrementing in the order of time. "
                "Classes that use this dataset may not work as expected."
            )

        # The range is taken from the loaded frame, i.e. before any filtering.
        self._timestamp_min = int(df[self.config.timestamp_ix].min())
        self._timestamp_max = int(df[self.config.timestamp_ix].max())

        if self.fetch_metadata:
            # NOTE(review): the preprocessor is only run when apply_filters is
            # True - confirm the mappings are available otherwise.
            user_id_mapping, item_id_mapping = (
                self.preprocessor.user_id_mapping,
                self.preprocessor.item_id_mapping,
            )
            self._fetch_dataset_metadata(
                user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping
            )

        end = time.time()
        # ".3f" = three decimals. A bare ".3" means three significant digits and
        # falls back to scientific notation for long loads.
        logger.info(f"{self.name} dataset loaded - Took {end - start:.3f}s")
        return im

    def _dataframe_to_matrix(self, df: pd.DataFrame) -> InteractionMatrix:
        """Converts a DataFrame to an InteractionMatrix.

        Args:
            df: DataFrame to convert

        Returns:
            InteractionMatrix object.

        Raises:
            AttributeError: If ``config.user_ix``, ``config.item_ix`` or
                ``config.timestamp_ix`` is not set.
        """
        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
            raise AttributeError("config.user_ix, config.item_ix or config.timestamp_ix not set.")
        return InteractionMatrix(
            df,
            user_ix=self.config.user_ix,
            item_ix=self.config.item_ix,
            timestamp_ix=self.config.timestamp_ix,
        )

    @abstractmethod
    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Fetch metadata for the dataset.

        Fetch metadata for the dataset, if available. Called by :meth:`load`
        when ``fetch_metadata`` is True.

        Args:
            user_id_mapping: User id mapping taken from the preprocessor.
            item_id_mapping: Item id mapping taken from the preprocessor.
        """
        raise NotImplementedError("Needs to be implemented")
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import zipfile
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from tqdm import tqdm
|
|
8
|
+
from typing_extensions import ClassVar
|
|
9
|
+
|
|
10
|
+
from ..config import LastFMDatasetConfig
|
|
11
|
+
from ..metadata.lastfm import LastFMItemMetadata, LastFMTagMetadata, LastFMUserMetadata
|
|
12
|
+
from .base import Dataset
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
tqdm.pandas()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LastFMDataset(Dataset):
    """
    Last FM dataset.

    The Last FM dataset contains user interactions with artists. The tags in
    this dataset are not used in this implementation. The file that is used is
    user_taggedartists-timestamps.dat, which has the columns
    [user, artist, tags, timestamp].

    The dataset is downloaded from the GroupLens website :cite:`Cantador_RecSys2011`.
    """
    IS_BASE: bool = False

    config: ClassVar[LastFMDatasetConfig] = LastFMDatasetConfig()

    ITEM_METADATA = None
    USER_METADATA = None
    TAG_METADATA = None

    def _extract_interaction_file(self, zip_path: str) -> None:
        """Extract ``config.remote_filename`` from the zip into the base path."""
        with zipfile.ZipFile(zip_path, "r") as archive:
            archive.extract(self.config.remote_filename, self.config.default_base_path)

    def fetch_dataset(self) -> None:
        """Make sure the interaction file is available locally.

        Overrides the base class because the zip archive may already be on
        disk while the extracted interaction file is not. A missing archive
        triggers a download; an existing archive without the extracted file
        only triggers an extraction.
        """
        zip_path = os.path.join(
            self.config.default_base_path, f"{self.config.remote_zipname}.zip"
        )

        # No archive yet: download (which also extracts).
        if not os.path.exists(zip_path):
            logger.debug(f"{self.name} dataset zipfile not found in {zip_path}.")
            self._download_dataset()
            return

        # Archive and extracted file are both present: nothing to do.
        if os.path.exists(self.file_path):
            logger.debug("Data zipfile is in memory and in dir specified.")
            return

        # Archive present, extracted file missing: extract only.
        logger.debug(
            f"{self.name} dataset file not found, but zipfile already downloaded. "
            f"Extracting file from zipfile."
        )
        self._extract_interaction_file(zip_path)

    def _download_dataset(self) -> None:
        """Download the zip archive and extract the interaction file from it.

        The archive is stored in ``config.default_base_path`` and the
        interaction file is extracted into that same directory.
        """
        archive_name = f"{self.config.remote_zipname}.zip"
        zip_path = os.path.join(self.config.default_base_path, archive_name)

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")

        # Store the archive in the data directory.
        self._fetch_remote(
            f"{self.config.dataset_url}/{archive_name}",
            zip_path,
        )

        # Only the interaction file is pulled out of the archive.
        logger.debug(f"Extracting {self.config.remote_filename} from zip")
        self._extract_interaction_file(zip_path)

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        The tab-separated file is read with its first row treated as a header
        and replaced by the configured column names. The timestamp column is
        then floor-divided by 1,000.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        column_types = {
            self.config.item_ix: np.int32,
            self.config.user_ix: np.int32,
            self.config.tag_ix: np.int32,
            self.config.timestamp_ix: np.int64,
        }
        column_order = [
            self.config.user_ix,
            self.config.item_ix,
            self.config.tag_ix,
            self.config.timestamp_ix,
        ]
        interactions = pd.read_csv(
            self.file_path,
            dtype=column_types,
            sep="\t",
            names=column_order,
            header=0,
        )

        # Convert from milliseconds to seconds
        timestamp_column = self.config.timestamp_ix
        interactions[timestamp_column] = interactions[timestamp_column] // 1_000

        logger.debug(f"Loaded {len(interactions)} interactions")
        return interactions

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user, item and tag metadata onto the instance.

        Args:
            user_id_mapping: Forwarded to :class:`LastFMUserMetadata`.
            item_id_mapping: Forwarded to :class:`LastFMItemMetadata`.
        """
        user_source = LastFMUserMetadata(user_id_mapping=user_id_mapping)
        self.USER_METADATA = user_source.load()

        item_source = LastFMItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = item_source.load()

        tag_source = LastFMTagMetadata()
        self.TAG_METADATA = tag_source.load()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import zipfile
|
|
4
|
+
from typing import ClassVar
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from ..config import MovieLens100KDatasetConfig, MovieLensDatasetConfig
|
|
11
|
+
from ..metadata.movielens import MovieLens100kItemMetadata, MovieLens100kUserMetadata
|
|
12
|
+
from .base import Dataset
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MovieLensDataset(Dataset):
    """Base class for Movielens datasets.

    Other Movielens datasets should inherit from this class.

    This code is adapted from RecPack :cite:`recpack`
    """

    IS_BASE: bool = True
    config: ClassVar[MovieLensDatasetConfig] = MovieLensDatasetConfig()

    def _download_dataset(self) -> None:
        """Download the zip archive and move the ratings file into place.

        The archive is stored in ``config.default_base_path``. The ratings
        file is extracted from it and then moved to ``self.file_path``.
        """
        base_path = self.config.default_base_path
        archive_name = f"{self.config.remote_zipname}.zip"
        # Path of the ratings file inside the archive.
        member = f"{self.config.remote_zipname}/{self.config.remote_filename}"

        # Download the zip into the data directory
        zip_file_path = os.path.join(base_path, archive_name)
        self._fetch_remote(
            url=f"{self.config.dataset_url}/{archive_name}",
            filename=zip_file_path,
        )

        # Extract the ratings file which we will use
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extract(member, base_path)

        # Move the ratings file to the specified filename. os.replace
        # overwrites an existing destination on every platform, whereas
        # os.rename raises on Windows when the destination already exists.
        os.replace(os.path.join(base_path, member), self.file_path)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class MovieLens100K(MovieLensDataset):
    """MovieLens 100K dataset."""

    ITEM_METADATA = None
    USER_METADATA = None
    IS_BASE: bool = False

    config: ClassVar[MovieLens100KDatasetConfig] = MovieLens100KDatasetConfig()

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw ratings file and return it as a pandas DataFrame.

        The tab-separated file is read in chunks of 100,000 rows with a
        progress bar, using the configured user, item, rating and timestamp
        column names.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()

        column_order = [
            self.config.user_ix,
            self.config.item_ix,
            self.config.rating_ix,
            self.config.timestamp_ix,
        ]
        column_types = dict(zip(column_order, (np.int64, np.int64, np.float64, np.int64)))

        reader = pd.read_table(
            self.file_path,
            dtype=column_types,
            sep="\t",
            names=column_order,
            chunksize=100_000,
        )
        progress = tqdm(reader, desc="Reading table", unit="chunk")
        return pd.concat(list(progress), ignore_index=True)

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user and item metadata onto the instance.

        Args:
            user_id_mapping: Forwarded to :class:`MovieLens100kUserMetadata`.
            item_id_mapping: Forwarded to :class:`MovieLens100kItemMetadata`.
        """
        user_source = MovieLens100kUserMetadata(user_id_mapping=user_id_mapping)
        self.USER_METADATA = user_source.load()

        item_source = MovieLens100kItemMetadata(item_id_mapping=item_id_mapping)
        self.ITEM_METADATA = item_source.load()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
from .base import Dataset
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestDataset(Dataset):
    """
    Test dataset.

    A dummy dataset for testing purposes: the interactions are a fixed,
    hard-coded table and nothing is downloaded.
    """

    # NOTE(review): the base-class initializer reads its column names from
    # `config`, not from these constants - confirm they are still picked up.

    # Name of the column in the DataFrame that contains user identifiers.
    USER_IX = "user_id"
    # Name of the column in the DataFrame that contains item identifiers.
    ITEM_IX = "item_id"
    # Name of the column in the DataFrame that contains time of interaction
    # in seconds since epoch.
    TIMESTAMP_IX = "timestamp"
    # Name of the column in the DataFrame that contains the rating a user
    # gave to the item.
    RATING_IX = "rating"
    DEFAULT_FILENAME = "dummy_input.csv"

    def _download_dataset(self) -> None:
        """Do nothing; the test data is generated in memory."""

    def _load_dataframe(self) -> pd.DataFrame:
        """Build the hard-coded interaction table.

        Warning:
            This does not apply any preprocessing, and returns the raw dataset.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        users = [1, 2, 3, 1, 2, 2, 4, 3, 3, 4, 5, 5, 5]
        items = [1, 1, 2, 3, 2, 3, 2, 1, 3, 3, 1, 2, 3]
        timestamps = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10, 10]
        ratings = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3]

        return pd.DataFrame(
            {
                self.USER_IX: users,
                self.ITEM_IX: items,
                self.TIMESTAMP_IX: timestamps,
                self.RATING_IX: ratings,
            }
        )

    def _fetch_dataset_metadata(self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame) -> None:
        """Do nothing; the test dataset has no metadata."""
|