recnexteval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recnexteval/__init__.py +20 -0
- recnexteval/algorithms/__init__.py +99 -0
- recnexteval/algorithms/base.py +377 -0
- recnexteval/algorithms/baseline/__init__.py +10 -0
- recnexteval/algorithms/baseline/decay_popularity.py +110 -0
- recnexteval/algorithms/baseline/most_popular.py +72 -0
- recnexteval/algorithms/baseline/random.py +39 -0
- recnexteval/algorithms/baseline/recent_popularity.py +34 -0
- recnexteval/algorithms/itemknn/__init__.py +14 -0
- recnexteval/algorithms/itemknn/itemknn.py +119 -0
- recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
- recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
- recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
- recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
- recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
- recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
- recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
- recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
- recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
- recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
- recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
- recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
- recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
- recnexteval/algorithms/utils.py +51 -0
- recnexteval/datasets/__init__.py +109 -0
- recnexteval/datasets/base.py +316 -0
- recnexteval/datasets/config/__init__.py +113 -0
- recnexteval/datasets/config/amazon.py +188 -0
- recnexteval/datasets/config/base.py +72 -0
- recnexteval/datasets/config/lastfm.py +105 -0
- recnexteval/datasets/config/movielens.py +169 -0
- recnexteval/datasets/config/yelp.py +25 -0
- recnexteval/datasets/datasets/__init__.py +24 -0
- recnexteval/datasets/datasets/amazon.py +151 -0
- recnexteval/datasets/datasets/base.py +250 -0
- recnexteval/datasets/datasets/lastfm.py +121 -0
- recnexteval/datasets/datasets/movielens.py +93 -0
- recnexteval/datasets/datasets/test.py +46 -0
- recnexteval/datasets/datasets/yelp.py +103 -0
- recnexteval/datasets/metadata/__init__.py +58 -0
- recnexteval/datasets/metadata/amazon.py +68 -0
- recnexteval/datasets/metadata/base.py +38 -0
- recnexteval/datasets/metadata/lastfm.py +110 -0
- recnexteval/datasets/metadata/movielens.py +87 -0
- recnexteval/evaluators/__init__.py +189 -0
- recnexteval/evaluators/accumulator.py +167 -0
- recnexteval/evaluators/base.py +216 -0
- recnexteval/evaluators/builder/__init__.py +125 -0
- recnexteval/evaluators/builder/base.py +166 -0
- recnexteval/evaluators/builder/pipeline.py +111 -0
- recnexteval/evaluators/builder/stream.py +54 -0
- recnexteval/evaluators/evaluator_pipeline.py +287 -0
- recnexteval/evaluators/evaluator_stream.py +374 -0
- recnexteval/evaluators/state_management.py +310 -0
- recnexteval/evaluators/strategy.py +32 -0
- recnexteval/evaluators/util.py +124 -0
- recnexteval/matrix/__init__.py +48 -0
- recnexteval/matrix/exception.py +5 -0
- recnexteval/matrix/interaction_matrix.py +784 -0
- recnexteval/matrix/prediction_matrix.py +153 -0
- recnexteval/matrix/util.py +24 -0
- recnexteval/metrics/__init__.py +57 -0
- recnexteval/metrics/binary/__init__.py +4 -0
- recnexteval/metrics/binary/hit.py +49 -0
- recnexteval/metrics/core/__init__.py +10 -0
- recnexteval/metrics/core/base.py +126 -0
- recnexteval/metrics/core/elementwise_top_k.py +75 -0
- recnexteval/metrics/core/listwise_top_k.py +72 -0
- recnexteval/metrics/core/top_k.py +60 -0
- recnexteval/metrics/core/util.py +29 -0
- recnexteval/metrics/ranking/__init__.py +6 -0
- recnexteval/metrics/ranking/dcg.py +55 -0
- recnexteval/metrics/ranking/ndcg.py +78 -0
- recnexteval/metrics/ranking/precision.py +51 -0
- recnexteval/metrics/ranking/recall.py +42 -0
- recnexteval/models/__init__.py +4 -0
- recnexteval/models/base.py +69 -0
- recnexteval/preprocessing/__init__.py +37 -0
- recnexteval/preprocessing/filter.py +181 -0
- recnexteval/preprocessing/preprocessor.py +137 -0
- recnexteval/registries/__init__.py +67 -0
- recnexteval/registries/algorithm.py +68 -0
- recnexteval/registries/base.py +131 -0
- recnexteval/registries/dataset.py +37 -0
- recnexteval/registries/metric.py +57 -0
- recnexteval/settings/__init__.py +127 -0
- recnexteval/settings/base.py +414 -0
- recnexteval/settings/exception.py +8 -0
- recnexteval/settings/leave_n_out_setting.py +48 -0
- recnexteval/settings/processor.py +115 -0
- recnexteval/settings/schema.py +11 -0
- recnexteval/settings/single_time_point_setting.py +111 -0
- recnexteval/settings/sliding_window_setting.py +153 -0
- recnexteval/settings/splitters/__init__.py +14 -0
- recnexteval/settings/splitters/base.py +57 -0
- recnexteval/settings/splitters/n_last.py +39 -0
- recnexteval/settings/splitters/n_last_timestamp.py +76 -0
- recnexteval/settings/splitters/timestamp.py +82 -0
- recnexteval/settings/util.py +0 -0
- recnexteval/utils/__init__.py +115 -0
- recnexteval/utils/json_to_csv_converter.py +128 -0
- recnexteval/utils/logging_tools.py +159 -0
- recnexteval/utils/path.py +155 -0
- recnexteval/utils/url_certificate_installer.py +54 -0
- recnexteval/utils/util.py +166 -0
- recnexteval/utils/uuid_util.py +7 -0
- recnexteval/utils/yaml_tool.py +65 -0
- recnexteval-0.1.0.dist-info/METADATA +85 -0
- recnexteval-0.1.0.dist-info/RECORD +110 -0
- recnexteval-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from .base import DatasetConfig, MetadataConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class AmazonDatasetConfig(DatasetConfig):
    """Base configuration shared by the Amazon dataset configs.

    Declares the column names used for Amazon interaction data and leaves
    ``dataset_url`` / ``remote_filename`` empty so that each concrete
    subclass can supply its own values.
    """

    user_ix: str = "user_id"
    item_ix: str = "parent_asin"
    timestamp_ix: str = "timestamp"
    rating_ix: str = "rating"
    helpful_vote_ix: str = "helpful_vote"
    dataset_url: str = ""  # Set per subclass
    remote_filename: str = ""  # Set per subclass
    # Amazon data lives in an "amazon" sub-directory of the base data path.
    default_base_path: str = f"{DatasetConfig.default_base_path}/amazon"

    @property
    def default_filename(self) -> str:
        """Return ``remote_filename`` unchanged (no zipname prefix is added)."""
        filename = self.remote_filename
        return filename
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class AmazonMusicDatasetConfig(AmazonDatasetConfig):
    """Configuration pointing at the Amazon ``Digital_Music`` file."""

    remote_filename: str = "Digital_Music.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Digital_Music.jsonl.gz"
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class AmazonMovieDatasetConfig(AmazonDatasetConfig):
    """Configuration pointing at the Amazon ``Movies_and_TV`` file."""

    remote_filename: str = "Movies_and_TV.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Movies_and_TV.jsonl.gz"
    )
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class AmazonSubscriptionBoxesDatasetConfig(AmazonDatasetConfig):
    """Configuration pointing at the Amazon ``Subscription_Boxes`` file."""

    remote_filename: str = "Subscription_Boxes.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Subscription_Boxes.jsonl.gz"
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class AmazonBookDatasetConfig(AmazonDatasetConfig):
    """Configuration pointing at the Amazon ``Books`` file."""

    remote_filename: str = "Books.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Books.jsonl.gz"
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class AmazonItemMetadataConfig(MetadataConfig, AmazonDatasetConfig):
    """
    Base configuration for Amazon item (product) metadata.

    Names the metadata columns for:
    - the product identifier (parent ASIN)
    - product information (title, category, price, rating)
    - rich content (features, description, images, videos)
    - relational data (store, details, bought together)

    ``column_names`` and ``dtype_dict`` are derived from the ``*_ix`` fields,
    so overriding a field is reflected in both.
    """

    item_ix: str = "parent_asin"
    """Column holding the product identifier (parent ASIN)."""
    main_category_ix: str = "main_category"
    """Column holding the main product category."""
    title_ix: str = "title"
    """Column holding the product title."""
    average_rating_ix: str = "average_rating"
    """Column holding the average product rating (0-5)."""
    rating_number_ix: str = "rating_number"
    """Column holding the number of ratings received."""
    features_ix: str = "features"
    """Column holding the product features (list)."""
    description_ix: str = "description"
    """Column holding the product description (list)."""
    price_ix: str = "price"
    """Column holding the product price."""
    images_ix: str = "images"
    """Column holding the product image URLs (list)."""
    videos_ix: str = "videos"
    """Column holding the product video URLs (list)."""
    store_ix: str = "store"
    """Column holding the store/seller information."""
    categories_ix: str = "categories"
    """Column holding the category hierarchy (list)."""
    details_ix: str = "details"
    """Column holding the product details (dict)."""
    bought_together_ix: str = "bought_together"
    """Column holding the products bought together (list)."""

    @property
    def column_names(self) -> list[str]:
        """Return the metadata column names, item identifier second to last."""
        names = [self.main_category_ix, self.title_ix]
        names += [self.average_rating_ix, self.rating_number_ix]
        names += [self.features_ix, self.description_ix, self.price_ix]
        names += [self.images_ix, self.videos_ix]
        names += [self.store_ix, self.categories_ix, self.details_ix]
        names += [self.item_ix, self.bought_together_ix]
        return names

    @property
    def dtype_dict(self) -> dict:
        """Return the mapping from each column name to its declared type."""
        # Pairs are listed in the same order as ``column_names``.
        column_types = (
            (self.main_category_ix, str),
            (self.title_ix, str),
            (self.average_rating_ix, "float32"),
            (self.rating_number_ix, "int64"),
            (self.features_ix, list),
            (self.description_ix, list),
            (self.price_ix, "float32"),
            (self.images_ix, list),
            (self.videos_ix, list),
            (self.store_ix, str),
            (self.categories_ix, list),
            (self.details_ix, dict),
            (self.item_ix, str),
            (self.bought_together_ix, list),
        )
        return dict(column_types)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
class AmazonDigitalMusicItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon ``Digital_Music`` file."""

    remote_filename: str = "meta_Digital_Music.jsonl.gz"
    """Name of the Digital Music metadata file."""

    dataset_url: str = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Digital_Music.jsonl.gz"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@dataclass
class AmazonMoviesAndTVItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon ``Movies_and_TV`` file."""

    remote_filename: str = "meta_Movies_and_TV.jsonl.gz"
    """Name of the Movies and TV metadata file."""

    dataset_url: str = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Movies_and_TV.jsonl.gz"
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
@dataclass
class AmazonSubscriptionBoxesItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon ``Subscription_Boxes`` file."""

    remote_filename: str = "meta_Subscription_Boxes.jsonl.gz"
    """Name of the Subscription Boxes metadata file."""

    dataset_url: str = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Subscription_Boxes.jsonl.gz"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
@dataclass
class AmazonBooksItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon ``Books`` file."""

    remote_filename: str = "meta_Books.jsonl.gz"
    """Name of the Books metadata file."""

    dataset_url: str = "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Books.jsonl.gz"
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from recnexteval.utils import get_data_dir
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class DatasetConfig:
    """Root configuration object that every dataset config derives from."""

    user_ix: str = "user_id"
    """DataFrame column that holds the user identifiers."""
    item_ix: str = "item_id"
    """DataFrame column that holds the item identifiers."""
    timestamp_ix: str = "timestamp"
    """DataFrame column that holds the interaction time in seconds since epoch."""
    dataset_url: str = "http://example.com"
    """URL the dataset is fetched from."""
    default_base_path: str = str(get_data_dir())
    """Default base path under which the dataset is stored."""
    remote_zipname: str = ""
    remote_filename: str = ""
    """Name of the file that contains the user interactions."""

    @property
    def default_filename(self) -> str:
        """Build the local filename from the remote zip and file names.

        Returns:
            ``"<remote_zipname>_<remote_filename>"`` when both parts are
            non-empty, otherwise the fallback ``"dataset.csv"``.
        """
        if self.remote_zipname and self.remote_filename:
            return f"{self.remote_zipname}_{self.remote_filename}"
        # At least one component is missing: use the generic name.
        return "dataset.csv"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class MetadataConfig(DatasetConfig):
    """Base configuration for metadata files stored next to a dataset."""

    sep: str = "|"
    """Column separator used in the data file."""

    def __post_init__(self) -> None:
        """Point ``default_base_path`` at a ``metadata`` sub-directory.

        The parent path is read through ``super()``, i.e. from the class that
        follows ``MetadataConfig`` in the instance's MRO, so a subclass such
        as ``class X(MetadataConfig, SomeDatasetConfig)`` picks up
        ``SomeDatasetConfig.default_base_path``.

        NOTE(review): this unconditionally replaces the instance value, so a
        ``default_base_path`` passed to the constructor is discarded --
        confirm that this is intended.
        """
        parent_path = super().default_base_path
        self.default_base_path = parent_path + "/metadata"

    @property
    def column_names(self) -> list[str]:
        """
        Ordered list of column names for pd.read_table.

        The base implementation has no columns; subclasses override it.

        Returns:
            list[str]: Column names in file order [user_id, age, gender, ...]

        Example:
            ["userId", "age", "gender", "occupation", "zipcode"]
        """
        no_columns: list[str] = []
        return no_columns

    @property
    def dtype_dict(self) -> dict:
        """
        Data type mapping for the columns.

        Intended for the ``dtype`` parameter of pd.read_table() so that
        columns are loaded with the correct types. The base implementation
        maps nothing; subclasses override it.

        Returns:
            dict: Mapping of column names to dtypes

        Example:
            {
                "age": "int64",
                "gender": "<U1",  # string
                "occupation": "object",
                "zipcode": "object"
            }
        """
        no_dtypes: dict = {}
        return no_dtypes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from .base import DatasetConfig, MetadataConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class LastFMDatasetConfig(DatasetConfig):
    """Configuration for the LastFM (hetrec2011) interaction data."""

    user_ix: str = "userID"
    item_ix: str = "artistID"
    timestamp_ix: str = "timestamp"
    tag_ix: str = "tagID"
    """DataFrame column that holds the tag a user gave to the item."""
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    remote_filename: str = "user_taggedartists-timestamps.dat"
    # LastFM data lives in a "lastfm" sub-directory of the base data path.
    default_base_path: str = f"{DatasetConfig.default_base_path}/lastfm"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class LastFMUserMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM user metadata file (``user_friends.dat``)."""

    user_ix: str = "userID"
    """Column holding the user identifiers."""
    friend_ix: str = "friendID"
    """Column holding the friend identifiers."""

    remote_filename: str = "user_friends.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Return the two columns of the file: user first, then friend."""
        columns = [self.user_ix]
        columns.append(self.friend_ix)
        return columns
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class LastFMItemMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM item metadata file (``artists.dat``)."""

    item_ix: str = "id"
    """Column holding the item identifiers."""
    name_ix: str = "name"
    """Column holding the item names."""
    url_ix: str = "url"
    """Column holding the item URLs."""
    picture_url_ix: str = "pictureURL"
    """Column holding the item picture URLs."""

    remote_filename: str = "artists.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Return the column names: identifier, name, URL, picture URL."""
        columns = [self.item_ix, self.name_ix]
        columns += [self.url_ix, self.picture_url_ix]
        return columns

    @property
    def dtype_dict(self) -> dict:
        """Return the type mapping; the three text columns are typed ``str``.

        The identifier column is not included in the mapping.
        """
        text_columns = (self.name_ix, self.url_ix, self.picture_url_ix)
        return {column: str for column in text_columns}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class LastFMTagMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM tag metadata file (``tags.dat``)."""

    tag_ix: str = "tagID"
    """Column holding the tag identifiers."""
    name_ix: str = "tagValue"
    """Column holding the tag names."""

    remote_filename: str = "tags.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Return the two columns of the file: tag identifier, then tag name."""
        columns = [self.tag_ix]
        columns.append(self.name_ix)
        return columns

    @property
    def dtype_dict(self) -> dict:
        """Return the type mapping; both columns are typed ``str``."""
        return {column: str for column in (self.tag_ix, self.name_ix)}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
from .base import DatasetConfig, MetadataConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class MovieLensDatasetConfig(DatasetConfig):
    """Base configuration shared by the MovieLens configs."""

    user_ix: str = "userId"
    item_ix: str = "movieId"
    timestamp_ix: str = "timestamp"
    rating_ix: str = "rating"
    """DataFrame column that holds the rating a user gave to the item."""
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    remote_zipname: str = "ml-100k"
    """Name of the zip file on the MovieLens server."""
    remote_filename: str = "ratings.csv"
    """Name of the ratings file on the MovieLens server."""
    # MovieLens data lives in a "movielens" sub-directory of the base data path.
    default_base_path: str = f"{DatasetConfig.default_base_path}/movielens"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class MovieLens100KDatasetConfig(MovieLensDatasetConfig):
    """MovieLens 100K configuration; only the ratings filename differs."""

    remote_filename: str = "u.data"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class MovieLens100kUserMetadataConfig(MetadataConfig, MovieLensDatasetConfig):
    """
    Configuration for the MovieLens 100K user metadata file (``u.user``).

    Names the columns for the user demographic data:
    - user identifier
    - age
    - gender
    - occupation
    - zipcode

    ``column_names`` and ``dtype_dict`` are derived from the ``*_ix`` fields.
    """

    user_ix: str = "userId"
    """Column holding the user identifiers."""
    age_ix: str = "age"
    """Column holding the user age."""
    gender_ix: str = "gender"
    """Column holding the user gender."""
    occupation_ix: str = "occupation"
    """Column holding the user occupation."""
    zipcode_ix: str = "zipcode"
    """Column holding the user zipcode."""

    remote_filename: str = "u.user"
    """Name of the user metadata file inside the remote zip."""
    remote_zipname: str = "ml-100k"
    """Name of the zip file on the remote server."""
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    """URL the metadata is fetched from."""

    @property
    def column_names(self) -> list[str]:
        """Return the column names: user, age, gender, occupation, zipcode."""
        columns = [self.user_ix, self.age_ix, self.gender_ix]
        columns += [self.occupation_ix, self.zipcode_ix]
        return columns

    @property
    def dtype_dict(self) -> dict:
        """Return the type mapping: age is ``np.int64``, the rest ``str``.

        The user identifier column is not included in the mapping.
        """
        column_types = (
            (self.age_ix, np.int64),
            (self.gender_ix, str),
            (self.occupation_ix, str),
            (self.zipcode_ix, str),
        )
        return dict(column_types)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass
class MovieLens100kItemMetadataConfig(MetadataConfig, MovieLensDatasetConfig):
    """
    Configuration for the MovieLens 100K item metadata file (``u.item``).

    Names the columns for the movie metadata:
    - movie identifier
    - title, release date, video release date, IMDB URL
    - 19 genre indicator columns

    ``column_names`` and ``dtype_dict`` are derived from the ``*_ix`` fields
    and the ``genres`` tuple.
    """

    item_ix: str = "movieId"
    """Column holding the movie identifiers."""
    title_ix: str = "title"
    """Column holding the movie title."""
    release_date_ix: str = "releaseDate"
    """Column holding the movie release date."""
    video_release_date_ix: str = "videoReleaseDate"
    """Column holding the video release date."""
    imdb_url_ix: str = "imdbUrl"
    """Column holding the IMDB URL."""
    genres: tuple[str, ...] = (
        "unknown",
        "action",
        "adventure",
        "animation",
        "children",
        "comedy",
        "crime",
        "documentary",
        "drama",
        "fantasy",
        "filmNoir",
        "horror",
        "musical",
        "mystery",
        "romance",
        "sciFi",
        "thriller",
        "war",
        "western",
    )
    """The 19 genre column names, in the order they are appended to the columns."""

    remote_filename: str = "u.item"
    remote_zipname: str = "ml-100k"
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    encoding: str = "ISO-8859-1"
    """File encoding (ISO-8859-1 needed for special characters)."""

    @property
    def non_genre_columns(self) -> list[str]:
        """
        Column names of the non-genre metadata.

        Returns:
            list[str]: [movie_id, title, release_date, video_release_date, imdb_url]

        Example:
            ["movieId", "title", "releaseDate", "videoReleaseDate", "imdbUrl"]
        """
        columns = [self.item_ix, self.title_ix]
        columns += [self.release_date_ix, self.video_release_date_ix]
        columns.append(self.imdb_url_ix)
        return columns

    @property
    def column_names(self) -> list[str]:
        """Return the non-genre columns followed by every genre column."""
        return [*self.non_genre_columns, *self.genres]

    @property
    def dtype_dict(self) -> dict:
        """Return the type mapping: text columns ``str``, genres ``np.int64``.

        The movie identifier column is not included in the mapping.
        """
        text_columns = (
            self.title_ix,
            self.release_date_ix,
            self.video_release_date_ix,
            self.imdb_url_ix,
        )
        column_types: dict[str, Any] = {column: str for column in text_columns}
        # Genre entries are written after the text columns, as before.
        for genre in self.genres:
            column_types[genre] = np.int64
        return column_types
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from .base import DatasetConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class YelpDatasetConfig(DatasetConfig):
    """Configuration for the Yelp review dataset.

    Note: the Yelp dataset has to be downloaded manually from
    https://www.yelp.com/dataset/download because a license agreement must be
    accepted first. Convert the download from JSON to CSV and place it in the
    data directory.
    """

    user_ix: str = "user_id"
    item_ix: str = "business_id"
    timestamp_ix: str = "date"
    rating_ix: str = "stars"
    dataset_url: str = "https://www.yelp.com/dataset/download"
    remote_filename: str = "yelp_academic_dataset_review.csv"

    @property
    def default_filename(self) -> str:
        """Return ``remote_filename`` unchanged (no zipname prefix is added)."""
        filename = self.remote_filename
        return filename
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from .amazon import (
|
|
2
|
+
AmazonBookDataset,
|
|
3
|
+
AmazonMovieDataset,
|
|
4
|
+
AmazonMusicDataset,
|
|
5
|
+
AmazonSubscriptionBoxesDataset,
|
|
6
|
+
)
|
|
7
|
+
from .base import Dataset
|
|
8
|
+
from .lastfm import LastFMDataset
|
|
9
|
+
from .movielens import MovieLens100K
|
|
10
|
+
from .test import TestDataset
|
|
11
|
+
from .yelp import YelpDataset
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Public API of the package: every dataset class imported above.
__all__ = [
    # Amazon datasets
    "AmazonBookDataset",
    "AmazonMovieDataset",
    "AmazonMusicDataset",
    "AmazonSubscriptionBoxesDataset",
    # Other datasets
    "LastFMDataset",
    "MovieLens100K",
    "YelpDataset",
    "TestDataset",
    # Base class
    "Dataset",
]
|