recnexteval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
recnexteval/algorithms/utils.py
@@ -0,0 +1,51 @@
+import numpy as np
+from scipy.sparse import csr_matrix
+
+
+def get_top_K_ranks(X: csr_matrix, K: None | int = None) -> csr_matrix:
+    """Returns a matrix of ranks assigned to the largest K values in X.
+
+    Selects the K largest values for every row in X and assigns a rank to each.
+
+    :param X: Matrix from which we will select K values in every row.
+    :type X: csr_matrix
+    :param K: Number of values to select.
+    :type K: int, optional
+    :return: Matrix with K values per row.
+    :rtype: csr_matrix
+    """
+    U, I, V = [], [], []
+    for row_ix, (le, ri) in enumerate(zip(X.indptr[:-1], X.indptr[1:])):
+        K_row_pick = min(K, ri - le) if K is not None else ri - le
+
+        if K_row_pick != 0:
+            top_k_row = X.indices[le + np.argpartition(X.data[le:ri], list(range(-K_row_pick, 0)))[-K_row_pick:]]
+
+            for rank, col_ix in enumerate(reversed(top_k_row)):
+                U.append(row_ix)
+                I.append(col_ix)
+                V.append(rank + 1)
+    # data, (row, col) = (V, (U, I))
+    X_top_K = csr_matrix((V, (U, I)), shape=X.shape)
+
+    return X_top_K
+
+
+def get_top_K_values(X: csr_matrix, K: None | int = None) -> csr_matrix:
+    """Returns a matrix of only the K largest values for every row in X.
+
+    Selects the top-K items for every user (equivalent to the K nearest neighbours).
+    In case of a tie for the last position, the tied item with the largest index is used.
+
+    :param X: Matrix from which we will select K values in every row.
+    :type X: csr_matrix
+    :param K: Number of values to select.
+    :type K: int, optional
+    :return: Matrix with K values per row.
+    :rtype: csr_matrix
+    """
+    top_K_ranks = get_top_K_ranks(X, K)
+    # Convert the ranks into binary values (1 if in top K, 0 otherwise)
+    top_K_ranks[top_K_ranks > 0] = 1
+    # Elementwise multiplication with the original matrix to recover the values
+    return top_K_ranks.multiply(X)
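The `argpartition` trick in `get_top_K_ranks` is easiest to verify on a tiny matrix. Below is a minimal sketch (not part of the package) exercising both helpers; the import path follows the file location above, and the matrix values are made up for illustration.

```python
import numpy as np
from scipy.sparse import csr_matrix

from recnexteval.algorithms.utils import get_top_K_ranks, get_top_K_values

# Toy score matrix: two users, four items.
X = csr_matrix(np.array([
    [0.1, 0.9, 0.0, 0.5],
    [0.0, 0.0, 0.3, 0.2],
]))

# Ranks of the 2 largest values per row:
# row 0 -> 0.9 gets rank 1, 0.5 gets rank 2; row 1 -> 0.3 gets 1, 0.2 gets 2.
print(get_top_K_ranks(X, K=2).toarray())

# Same positions, but keeping the original scores instead of ranks.
print(get_top_K_values(X, K=2).toarray())
```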
recnexteval/datasets/__init__.py
@@ -0,0 +1,109 @@
+"""Dataset module for public datasets in streaming experiments.
+
+This module provides easy access to publicly available datasets for use in streaming
+experiments. Dataset classes are built on top of the `Dataset` base class, allowing
+for easy extension and customization.
+
+## Dataset Overview
+
+Multiple public datasets are available from various sources. Additionally, a
+lightweight test dataset is provided for testing algorithm functionality.
+
+### Data Chunking Note
+
+The MovieLens 100K dataset is available but not chunked into "blocks". Setting
+a global timeline to split the data could cause a chunk of data to be lost.
+The other publicly available datasets are recommended instead.
+
+## Available Datasets
+
+- `AmazonBookDataset`: Amazon Books reviews
+- `AmazonMovieDataset`: Amazon Movies reviews
+- `AmazonMusicDataset`: Amazon Music reviews
+- `AmazonSubscriptionBoxesDataset`: Amazon Subscription Boxes reviews
+- `LastFMDataset`: Last.FM music listening history
+- `MovieLens100K`: MovieLens 100K rating dataset
+- `YelpDataset`: Yelp business reviews
+- `TestDataset`: Lightweight dataset for testing algorithms
+
+## Loading Datasets
+
+Basic loading:
+
+```python
+from recnexteval.datasets import AmazonMusicDataset
+
+dataset = AmazonMusicDataset()
+data = dataset.load()
+```
+
+If the file does not exist, it will be downloaded and written to disk. Subsequent
+loads will read the file from disk without downloading it again.
+
+### Using Default Filters
+
+```python
+from recnexteval.datasets import AmazonMusicDataset
+
+dataset = AmazonMusicDataset(use_default_filters=True)
+data = dataset.load()
+```
+
+Each dataset can be loaded with default filters applied. Default filters ensure
+that user and item IDs increment in the order of time. **This is the recommended
+loading approach.**
+
+## Extending the Framework
+
+To add custom datasets, inherit from `Dataset` and implement all abstract methods.
+Refer to the base class documentation for implementation details; a minimal sketch
+follows the `datasets/base.py` hunk below.
+
+## Related Modules
+
+- `recnexteval.preprocessing`: Data preprocessing and filtering utilities
+"""
+
+from .datasets import (
+    AmazonBookDataset,
+    AmazonMovieDataset,
+    AmazonMusicDataset,
+    AmazonSubscriptionBoxesDataset,
+    Dataset,
+    LastFMDataset,
+    MovieLens100K,
+    TestDataset,
+    YelpDataset,
+)
+from .metadata import (
+    AmazonBookItemMetadata,
+    AmazonMovieItemMetadata,
+    AmazonMusicItemMetadata,
+    LastFMItemMetadata,
+    LastFMTagMetadata,
+    LastFMUserMetadata,
+    Metadata,
+    MovieLens100kItemMetadata,
+    MovieLens100kUserMetadata,
+)
+
+
+__all__ = [
+    "AmazonBookDataset",
+    "AmazonMovieDataset",
+    "AmazonMusicDataset",
+    "AmazonSubscriptionBoxesDataset",
+    "LastFMDataset",
+    "MovieLens100K",
+    "YelpDataset",
+    "TestDataset",
+    "Dataset",
+    "Metadata",
+    "MovieLens100kUserMetadata",
+    "MovieLens100kItemMetadata",
+    "AmazonBookItemMetadata",
+    "AmazonMovieItemMetadata",
+    "AmazonMusicItemMetadata",
+    "LastFMUserMetadata",
+    "LastFMItemMetadata",
+    "LastFMTagMetadata",
+]
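The default filters mentioned in the docstring are `MinItemsPerUser` and `MinUsersPerItem` with a threshold of 3 (see `_default_filters` in the `datasets/base.py` hunk below). A minimal sketch, assuming the exported datasets follow the `Dataset` API shown in that hunk, of stacking a stricter filter on top of the defaults via `add_filter`:

```python
from recnexteval.datasets import MovieLens100K
from recnexteval.preprocessing.filter import MinItemsPerUser

# Start from the default filters (min 3 interactions per user and per item)...
dataset = MovieLens100K(use_default_filters=True)

# ...and stack a stricter user filter on top; keyword arguments mirror
# the usage in _default_filters (datasets/base.py).
dataset.add_filter(
    MinItemsPerUser(
        min_items_per_user=5,
        item_ix=dataset.config.item_ix,
        user_ix=dataset.config.user_ix,
    )
)

# Filters are applied while loading into an InteractionMatrix.
im = dataset.load()
```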
recnexteval/datasets/base.py
@@ -0,0 +1,316 @@
+import logging
+import os
+import time
+from abc import abstractmethod
+from typing import ClassVar
+
+import httpx
+import pandas as pd
+
+from ..matrix import InteractionMatrix
+from ..models import BaseModel
+from ..preprocessing.filter import Filter, MinItemsPerUser, MinUsersPerItem
+from ..preprocessing.preprocessor import DataFramePreprocessor
+from ..utils.path import safe_dir
+from .config import DatasetConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class DataFetcher(BaseModel):
+    """Abstract class to be used by Dataset or Metadata subclasses."""
+
+    config: ClassVar[DatasetConfig] = DatasetConfig()
+    """Configuration for the dataset."""
+
+    @property
+    def file_path(self) -> str:
+        """File path of the dataset."""
+        return os.path.join(self.config.default_base_path, self.config.default_filename)
+
+    @property
+    def processed_cache_path(self) -> str:
+        """Path for cached processed data."""
+        return os.path.join(
+            self.config.default_base_path, f"{self.config.default_filename}.processed.parquet"
+        )
+
+    def fetch_dataset(self) -> None:
+        """Check if the dataset is present; if not, download it."""
+        if os.path.exists(self.file_path):
+            logger.debug("Data file already present in the specified directory.")
+            return
+        logger.debug(f"{self.name} dataset not found in {self.file_path}.")
+        self._download_dataset()
+
+    def fetch_dataset_force(self) -> None:
+        """Force re-download of the dataset."""
+        logger.debug(f"{self.name} force re-download of dataset.")
+        self._download_dataset()
+
+    def _fetch_remote(self, url: str, filename: str) -> str:
+        """Fetch data from a remote URL and save it locally (synchronous fallback).
+
+        This function keeps the previous synchronous behaviour but uses
+        `httpx.Client` to stream the response and write it to disk. If you
+        want async behaviour, use :meth:`_fetch_remote_async` instead.
+
+        Args:
+            url: URL to fetch data from
+            filename: Path to save the file to
+
+        Returns:
+            The filename where the data was saved
+        """
+        logger.debug(f"{self.name} will fetch dataset from remote url at {url}.")
+
+        with httpx.Client(timeout=httpx.Timeout(60.0)) as client, client.stream("GET", url) as resp:
+            resp.raise_for_status()
+            with open(filename, "wb") as fd:
+                for chunk in resp.iter_bytes():
+                    if chunk:
+                        fd.write(chunk)
+
+        return filename
+
+    async def _fetch_remote_async(self, url: str, filename: str) -> str:
+        """Asynchronously fetch data from a remote URL and save it locally.
+
+        Uses `httpx.AsyncClient` and streams the response to disk. Callers
+        running inside an event loop should use this coroutine instead of
+        the synchronous `_fetch_remote`.
+
+        Args:
+            url: URL to fetch data from
+            filename: Path to save the file to
+
+        Returns:
+            The filename where the data was saved
+        """
+        logger.debug(f"{self.name} will asynchronously fetch dataset from {url}.")
+
+        async with (
+            httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client,
+            client.stream("GET", url) as resp,
+        ):
+            resp.raise_for_status()
+            # Write bytes as they arrive
+            with open(filename, "wb") as fd:
+                async for chunk in resp.aiter_bytes():
+                    if chunk:
+                        fd.write(chunk)
+
+        return filename
+
+    def _cache_processed_dataframe(self, df: pd.DataFrame) -> None:
+        """Cache the processed DataFrame to disk.
+
+        :param df: DataFrame to cache
+        :type df: pd.DataFrame
+        """
+        logger.debug(f"Caching processed DataFrame to {self.processed_cache_path}")
+        df.to_parquet(self.processed_cache_path)
+        logger.debug("Processed DataFrame cached successfully.")
+
+    @abstractmethod
+    def _load_dataframe(self) -> pd.DataFrame:
+        """Load the raw dataset from file and return it as a pandas DataFrame.
+
+        Warning:
+            This does not apply any preprocessing and returns the raw dataset.
+
+        Returns:
+            Interactions with the minimal columns {user, item, timestamp}.
+        """
+        raise NotImplementedError("Needs to be implemented")
+
+    @abstractmethod
+    def _download_dataset(self) -> None:
+        """Downloads the dataset.
+
+        Downloads the CSV file from the dataset URL and saves it to the file path.
+        """
+        raise NotImplementedError("Needs to be implemented")
+
+
+class Dataset(DataFetcher):
+    """Represents a collaborative filtering dataset.
+
+    A dataset must minimally contain user, item and timestamp columns for the
+    other modules to work.
+
+    Assumption
+    ==========
+    User and item IDs increment in the order of time. This assumption is made
+    for the purposes of splitting the dataset and eventually passing it to the
+    model. IDs that increment in time order let us fix the shape of the
+    currently known user-item matrix, making it easier for the evaluator to
+    manipulate the data.
+
+    The filename and base path are taken from the class's :data:`config`; if
+    the config does not define a default filename, a ValueError is raised.
+
+    :param use_default_filters: If True, the default filters will be applied to the dataset.
+        Defaults to False.
+    :type use_default_filters: bool, optional
+    :param fetch_metadata: If True, metadata for the dataset is fetched when loading.
+        Defaults to False.
+    :type fetch_metadata: bool, optional
+    """
+
+    config: ClassVar[DatasetConfig] = DatasetConfig()
+    """Configuration for the dataset."""
+
+    def __init__(
+        self,
+        use_default_filters: bool = False,  # noqa: FBT001, FBT002
+        fetch_metadata: bool = False,  # noqa: FBT001, FBT002
+    ) -> None:
+        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
+            raise AttributeError("user_ix, item_ix or timestamp_ix not set in config.")
+
+        logger.debug(
+            f"{self.name} being initialized with '{self.config.default_base_path}' as the base path."
+        )
+
+        if not self.config.default_filename:
+            raise ValueError("No filename specified, and no default known.")
+
+        self.fetch_metadata = fetch_metadata
+        self.preprocessor = DataFramePreprocessor(
+            self.config.item_ix, self.config.user_ix, self.config.timestamp_ix
+        )
+
+        if use_default_filters:
+            for f in self._default_filters:
+                self.add_filter(f)
+
+        safe_dir(self.config.default_base_path)
+        logger.debug(f"{self.name} is initialized.")
+
+    @property
+    def _default_filters(self) -> list[Filter]:
+        """The default filters for all datasets.
+
+        Concrete classes can override this property to add more filters.
+
+        Returns:
+            List of filters to be applied to the dataset.
+        """
+        if not self.config.user_ix or not self.config.item_ix:
+            raise AttributeError("config.user_ix or config.item_ix not set.")
+
+        filters: list[Filter] = []
+        filters.append(
+            MinItemsPerUser(
+                min_items_per_user=3,
+                item_ix=self.config.item_ix,
+                user_ix=self.config.user_ix,
+            )
+        )
+        filters.append(
+            MinUsersPerItem(
+                min_users_per_item=3,
+                item_ix=self.config.item_ix,
+                user_ix=self.config.user_ix,
+            )
+        )
+        return filters
+
+    def add_filter(self, filter_: Filter) -> None:
+        """Add a filter to be applied when loading the data.
+
+        Uses the :class:`DataFramePreprocessor` to register filters on the
+        dataset. The filter will be applied when the data is loaded into an
+        :class:`InteractionMatrix` object when :meth:`load` is called.
+
+        :param filter_: Filter to be applied to the loaded DataFrame before
+            processing into an interaction matrix.
+        :type filter_: Filter
+        """
+        self.preprocessor.add_filter(filter_)
+
+    def _load_dataframe_from_cache(self) -> pd.DataFrame:
+        if not os.path.exists(self.processed_cache_path):
+            raise FileNotFoundError("Processed cache file not found.")
+        logger.info(f"Loading from cache: {self.processed_cache_path}")
+        df = pd.read_parquet(self.processed_cache_path)
+        return df
+
+    def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
+        """Loads data into an InteractionMatrix object.
+
+        Data is loaded into a DataFrame using :meth:`_load_dataframe` (or from
+        the processed cache). The resulting DataFrame is parsed into an
+        :class:`InteractionMatrix` object. If ``apply_filters`` is set to True,
+        the filters set will be applied and user and item IDs will be remapped.
+        This is advised even if no filter is set, as it ensures that user and
+        item IDs increment in the order of time.
+
+        Args:
+            apply_filters: Whether to apply the filters set and preprocessing,
+                defaults to True
+            use_cache: Whether to use cached processed data, defaults to True
+
+        Returns:
+            Resulting interaction matrix.
+        """
+        logger.info(f"{self.name} is loading dataset...")
+        start = time.time()
+        try:
+            df = self._load_dataframe_from_cache() if use_cache else self._load_dataframe()
+        except FileNotFoundError:
+            logger.warning("Processed cache not found, loading raw dataframe.")
+            df = self._load_dataframe()
+            self._cache_processed_dataframe(df)
+        if apply_filters:
+            logger.debug(f"{self.name} applying filters set.")
+            im = self.preprocessor.process(df)
+        else:
+            im = self._dataframe_to_matrix(df)
+            logger.warning(
+                "No filters applied, user and item ids may not be incrementing in the order of time. "
+                "Classes that use this dataset may not work as expected."
+            )
+
+        if self.fetch_metadata:
+            user_id_mapping, item_id_mapping = (
+                self.preprocessor.user_id_mapping,
+                self.preprocessor.item_id_mapping,
+            )
+            self._fetch_dataset_metadata(
+                user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping
+            )
+
+        end = time.time()
+        logger.info(f"{self.name} dataset loaded - Took {end - start:.3}s")
+        return im
+
+    def _dataframe_to_matrix(self, df: pd.DataFrame) -> InteractionMatrix:
+        """Converts a DataFrame to an InteractionMatrix.
+
+        Args:
+            df: DataFrame to convert
+
+        Returns:
+            InteractionMatrix object.
+        """
+        if not self.config.user_ix or not self.config.item_ix or not self.config.timestamp_ix:
+            raise AttributeError("config.user_ix, config.item_ix or config.timestamp_ix not set.")
+        return InteractionMatrix(
+            df,
+            user_ix=self.config.user_ix,
+            item_ix=self.config.item_ix,
+            timestamp_ix=self.config.timestamp_ix,
+        )
+
+    @abstractmethod
+    def _fetch_dataset_metadata(
+        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
+    ) -> None:
+        """Fetch metadata for the dataset, if available."""
+        raise NotImplementedError("Needs to be implemented")
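To make the abstract surface concrete, here is a minimal sketch of a custom subclass, per the "Extending the Framework" note in the `datasets/__init__.py` docstring above. Everything here is hypothetical: the CSV file, the column names, and the assumption that `DatasetConfig` fields (defined in `datasets/config/base.py`, not shown in this excerpt) can be overridden as class attributes.

```python
from typing import ClassVar

import pandas as pd

from recnexteval.datasets.base import Dataset
from recnexteval.datasets.config import DatasetConfig


class MyCSVDatasetConfig(DatasetConfig):
    # Assumed: DatasetConfig exposes these fields with overridable defaults
    # (their definitions live in datasets/config/base.py, not shown here).
    default_filename: str = "my_interactions.csv"
    user_ix: str = "user_id"
    item_ix: str = "item_id"
    timestamp_ix: str = "ts"


class MyCSVDataset(Dataset):
    """Hypothetical dataset backed by a local CSV file."""

    config: ClassVar[DatasetConfig] = MyCSVDatasetConfig()

    def _load_dataframe(self) -> pd.DataFrame:
        # Raw load only; filtering and ID remapping happen in load().
        return pd.read_csv(self.file_path)

    def _download_dataset(self) -> None:
        # A real dataset would stream from a URL via _fetch_remote;
        # this toy dataset expects the file to be placed manually.
        raise FileNotFoundError(f"Place the CSV at {self.file_path} manually.")

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        # No side metadata for this toy dataset.
        pass


# im = MyCSVDataset(use_default_filters=True).load()
```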
recnexteval/datasets/config/__init__.py
@@ -0,0 +1,113 @@
+"""Dataset configuration module.
+
+This module provides configuration classes for dataset loading and metadata
+handling. Configurations define dataset properties such as paths, URLs, and
+processing parameters.
+
+## Available Configurations
+
+### Base Classes
+
+- `DatasetConfig`: Base class for dataset configurations
+- `MetadataConfig`: Base class for metadata configurations
+
+### Dataset Configurations
+
+- `MovieLensDatasetConfig`: Base configuration for MovieLens datasets
+- `MovieLens100KDatasetConfig`: Configuration for MovieLens 100K dataset
+- `AmazonDatasetConfig`: Base configuration for Amazon datasets
+- `AmazonMusicDatasetConfig`: Configuration for Amazon Music dataset
+- `AmazonMovieDatasetConfig`: Configuration for Amazon Movies dataset
+- `AmazonBookDatasetConfig`: Configuration for Amazon Books dataset
+- `AmazonSubscriptionBoxesDatasetConfig`: Configuration for Amazon Subscription Boxes dataset
+- `LastFMDatasetConfig`: Configuration for Last.FM dataset
+- `YelpDatasetConfig`: Configuration for Yelp dataset
+
+### Metadata Configurations
+
+- `MovieLens100kItemMetadataConfig`: Item metadata configuration for MovieLens 100K
+- `MovieLens100kUserMetadataConfig`: User metadata configuration for MovieLens 100K
+- `AmazonItemMetadataConfig`: Base Amazon item metadata configuration
+- `AmazonBooksItemMetadataConfig`: Item metadata for Amazon Books
+- `AmazonDigitalMusicItemMetadataConfig`: Item metadata for Amazon Digital Music
+- `AmazonMoviesAndTVItemMetadataConfig`: Item metadata for Amazon Movies and TV
+- `AmazonSubscriptionBoxesItemMetadataConfig`: Item metadata for Amazon Subscription Boxes
+- `LastFMUserMetadataConfig`: User metadata configuration for Last.FM
+- `LastFMItemMetadataConfig`: Item metadata configuration for Last.FM
+- `LastFMTagMetadataConfig`: Tag metadata configuration for Last.FM
+
+## Usage
+
+A typical usage pattern is to import a dataset config, optionally override fields,
+and pass it to dataset-loading utilities or custom convenience wrappers:
+
+```python
+from recnexteval.datasets.config import AmazonMusicDatasetConfig
+
+# Create config instance using defaults
+cfg = AmazonMusicDatasetConfig()
+
+# Inspect config values
+print(cfg.name)
+print(cfg.local_path)
+print(cfg.source_url)
+
+# Optionally override defaults at runtime
+custom_cfg = AmazonMusicDatasetConfig(
+    min_user_interactions=5,
+    min_item_interactions=10
+)
+```
+"""
+
+from .amazon import (
+    AmazonBookDatasetConfig,
+    AmazonBooksItemMetadataConfig,
+    AmazonDatasetConfig,
+    AmazonDigitalMusicItemMetadataConfig,
+    AmazonItemMetadataConfig,
+    AmazonMovieDatasetConfig,
+    AmazonMoviesAndTVItemMetadataConfig,
+    AmazonMusicDatasetConfig,
+    AmazonSubscriptionBoxesDatasetConfig,
+    AmazonSubscriptionBoxesItemMetadataConfig,
+)
+from .base import DatasetConfig, MetadataConfig
+from .lastfm import (
+    LastFMDatasetConfig,
+    LastFMItemMetadataConfig,
+    LastFMTagMetadataConfig,
+    LastFMUserMetadataConfig,
+)
+from .movielens import (
+    MovieLens100KDatasetConfig,
+    MovieLens100kItemMetadataConfig,
+    MovieLens100kUserMetadataConfig,
+    MovieLensDatasetConfig,
+)
+from .yelp import YelpDatasetConfig
+
+
+__all__ = [
+    "AmazonDatasetConfig",
+    "AmazonMusicDatasetConfig",
+    "AmazonMovieDatasetConfig",
+    "AmazonBookDatasetConfig",
+    "AmazonSubscriptionBoxesDatasetConfig",
+    "LastFMDatasetConfig",
+    "YelpDatasetConfig",
+    "DatasetConfig",
+    "MetadataConfig",
+    "MovieLensDatasetConfig",
+    "MovieLens100KDatasetConfig",
+    "MovieLens100kItemMetadataConfig",
+    "MovieLens100kUserMetadataConfig",
+    "AmazonBooksItemMetadataConfig",
+    "AmazonDigitalMusicItemMetadataConfig",
+    "AmazonItemMetadataConfig",
+    "AmazonMoviesAndTVItemMetadataConfig",
+    "AmazonSubscriptionBoxesItemMetadataConfig",
+    "LastFMItemMetadataConfig",
+    "LastFMTagMetadataConfig",
+    "LastFMUserMetadataConfig",
+]
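The fields printed in the usage example above (`name`, `local_path`, `source_url`) are defined in config modules this excerpt does not show. The fields that are visible in this diff are the ones `DataFetcher` in `datasets/base.py` consumes, and they can be inspected the same way. A minimal sketch:

```python
import os

from recnexteval.datasets.config import MovieLens100KDatasetConfig

cfg = MovieLens100KDatasetConfig()

# DataFetcher.file_path (see datasets/base.py above) joins exactly these two fields:
print(os.path.join(cfg.default_base_path, cfg.default_filename))

# Column names handed to DataFramePreprocessor and InteractionMatrix:
print(cfg.user_ix, cfg.item_ix, cfg.timestamp_ix)
```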