recnexteval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. recnexteval/__init__.py +20 -0
  2. recnexteval/algorithms/__init__.py +99 -0
  3. recnexteval/algorithms/base.py +377 -0
  4. recnexteval/algorithms/baseline/__init__.py +10 -0
  5. recnexteval/algorithms/baseline/decay_popularity.py +110 -0
  6. recnexteval/algorithms/baseline/most_popular.py +72 -0
  7. recnexteval/algorithms/baseline/random.py +39 -0
  8. recnexteval/algorithms/baseline/recent_popularity.py +34 -0
  9. recnexteval/algorithms/itemknn/__init__.py +14 -0
  10. recnexteval/algorithms/itemknn/itemknn.py +119 -0
  11. recnexteval/algorithms/itemknn/itemknn_incremental.py +65 -0
  12. recnexteval/algorithms/itemknn/itemknn_incremental_movielens.py +95 -0
  13. recnexteval/algorithms/itemknn/itemknn_rolling.py +17 -0
  14. recnexteval/algorithms/itemknn/itemknn_static.py +31 -0
  15. recnexteval/algorithms/time_aware_item_knn/__init__.py +11 -0
  16. recnexteval/algorithms/time_aware_item_knn/base.py +248 -0
  17. recnexteval/algorithms/time_aware_item_knn/decay_functions.py +260 -0
  18. recnexteval/algorithms/time_aware_item_knn/ding_2005.py +52 -0
  19. recnexteval/algorithms/time_aware_item_knn/liu_2010.py +65 -0
  20. recnexteval/algorithms/time_aware_item_knn/similarity_functions.py +106 -0
  21. recnexteval/algorithms/time_aware_item_knn/top_k.py +61 -0
  22. recnexteval/algorithms/time_aware_item_knn/utils.py +47 -0
  23. recnexteval/algorithms/time_aware_item_knn/vaz_2013.py +50 -0
  24. recnexteval/algorithms/utils.py +51 -0
  25. recnexteval/datasets/__init__.py +109 -0
  26. recnexteval/datasets/base.py +316 -0
  27. recnexteval/datasets/config/__init__.py +113 -0
  28. recnexteval/datasets/config/amazon.py +188 -0
  29. recnexteval/datasets/config/base.py +72 -0
  30. recnexteval/datasets/config/lastfm.py +105 -0
  31. recnexteval/datasets/config/movielens.py +169 -0
  32. recnexteval/datasets/config/yelp.py +25 -0
  33. recnexteval/datasets/datasets/__init__.py +24 -0
  34. recnexteval/datasets/datasets/amazon.py +151 -0
  35. recnexteval/datasets/datasets/base.py +250 -0
  36. recnexteval/datasets/datasets/lastfm.py +121 -0
  37. recnexteval/datasets/datasets/movielens.py +93 -0
  38. recnexteval/datasets/datasets/test.py +46 -0
  39. recnexteval/datasets/datasets/yelp.py +103 -0
  40. recnexteval/datasets/metadata/__init__.py +58 -0
  41. recnexteval/datasets/metadata/amazon.py +68 -0
  42. recnexteval/datasets/metadata/base.py +38 -0
  43. recnexteval/datasets/metadata/lastfm.py +110 -0
  44. recnexteval/datasets/metadata/movielens.py +87 -0
  45. recnexteval/evaluators/__init__.py +189 -0
  46. recnexteval/evaluators/accumulator.py +167 -0
  47. recnexteval/evaluators/base.py +216 -0
  48. recnexteval/evaluators/builder/__init__.py +125 -0
  49. recnexteval/evaluators/builder/base.py +166 -0
  50. recnexteval/evaluators/builder/pipeline.py +111 -0
  51. recnexteval/evaluators/builder/stream.py +54 -0
  52. recnexteval/evaluators/evaluator_pipeline.py +287 -0
  53. recnexteval/evaluators/evaluator_stream.py +374 -0
  54. recnexteval/evaluators/state_management.py +310 -0
  55. recnexteval/evaluators/strategy.py +32 -0
  56. recnexteval/evaluators/util.py +124 -0
  57. recnexteval/matrix/__init__.py +48 -0
  58. recnexteval/matrix/exception.py +5 -0
  59. recnexteval/matrix/interaction_matrix.py +784 -0
  60. recnexteval/matrix/prediction_matrix.py +153 -0
  61. recnexteval/matrix/util.py +24 -0
  62. recnexteval/metrics/__init__.py +57 -0
  63. recnexteval/metrics/binary/__init__.py +4 -0
  64. recnexteval/metrics/binary/hit.py +49 -0
  65. recnexteval/metrics/core/__init__.py +10 -0
  66. recnexteval/metrics/core/base.py +126 -0
  67. recnexteval/metrics/core/elementwise_top_k.py +75 -0
  68. recnexteval/metrics/core/listwise_top_k.py +72 -0
  69. recnexteval/metrics/core/top_k.py +60 -0
  70. recnexteval/metrics/core/util.py +29 -0
  71. recnexteval/metrics/ranking/__init__.py +6 -0
  72. recnexteval/metrics/ranking/dcg.py +55 -0
  73. recnexteval/metrics/ranking/ndcg.py +78 -0
  74. recnexteval/metrics/ranking/precision.py +51 -0
  75. recnexteval/metrics/ranking/recall.py +42 -0
  76. recnexteval/models/__init__.py +4 -0
  77. recnexteval/models/base.py +69 -0
  78. recnexteval/preprocessing/__init__.py +37 -0
  79. recnexteval/preprocessing/filter.py +181 -0
  80. recnexteval/preprocessing/preprocessor.py +137 -0
  81. recnexteval/registries/__init__.py +67 -0
  82. recnexteval/registries/algorithm.py +68 -0
  83. recnexteval/registries/base.py +131 -0
  84. recnexteval/registries/dataset.py +37 -0
  85. recnexteval/registries/metric.py +57 -0
  86. recnexteval/settings/__init__.py +127 -0
  87. recnexteval/settings/base.py +414 -0
  88. recnexteval/settings/exception.py +8 -0
  89. recnexteval/settings/leave_n_out_setting.py +48 -0
  90. recnexteval/settings/processor.py +115 -0
  91. recnexteval/settings/schema.py +11 -0
  92. recnexteval/settings/single_time_point_setting.py +111 -0
  93. recnexteval/settings/sliding_window_setting.py +153 -0
  94. recnexteval/settings/splitters/__init__.py +14 -0
  95. recnexteval/settings/splitters/base.py +57 -0
  96. recnexteval/settings/splitters/n_last.py +39 -0
  97. recnexteval/settings/splitters/n_last_timestamp.py +76 -0
  98. recnexteval/settings/splitters/timestamp.py +82 -0
  99. recnexteval/settings/util.py +0 -0
  100. recnexteval/utils/__init__.py +115 -0
  101. recnexteval/utils/json_to_csv_converter.py +128 -0
  102. recnexteval/utils/logging_tools.py +159 -0
  103. recnexteval/utils/path.py +155 -0
  104. recnexteval/utils/url_certificate_installer.py +54 -0
  105. recnexteval/utils/util.py +166 -0
  106. recnexteval/utils/uuid_util.py +7 -0
  107. recnexteval/utils/yaml_tool.py +65 -0
  108. recnexteval-0.1.0.dist-info/METADATA +85 -0
  109. recnexteval-0.1.0.dist-info/RECORD +110 -0
  110. recnexteval-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,188 @@
1
+ from dataclasses import dataclass
2
+
3
+ from .base import DatasetConfig, MetadataConfig
4
+
5
+
6
@dataclass
class AmazonDatasetConfig(DatasetConfig):
    """Settings shared by all Amazon review dataset configurations.

    Category-specific subclasses are expected to fill in ``dataset_url``
    and ``remote_filename``; both are empty strings at this level.
    """

    user_ix: str = "user_id"
    item_ix: str = "parent_asin"
    timestamp_ix: str = "timestamp"
    rating_ix: str = "rating"
    helpful_vote_ix: str = "helpful_vote"
    # Placeholders: every category subclass overrides the next two fields.
    dataset_url: str = ""
    remote_filename: str = ""
    default_base_path: str = f"{DatasetConfig.default_base_path}/amazon"

    @property
    def default_filename(self) -> str:
        """Local filename, which is the remote filename without any zip-name prefix."""
        return self.remote_filename
23
+
24
+
25
@dataclass
class AmazonMusicDatasetConfig(AmazonDatasetConfig):
    """Configuration for the Amazon Digital Music review file."""

    remote_filename: str = "Digital_Music.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Digital_Music.jsonl.gz"
    )
31
+
32
+
33
@dataclass
class AmazonMovieDatasetConfig(AmazonDatasetConfig):
    """Configuration for the Amazon Movies and TV review file."""

    remote_filename: str = "Movies_and_TV.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Movies_and_TV.jsonl.gz"
    )
39
+
40
+
41
@dataclass
class AmazonSubscriptionBoxesDatasetConfig(AmazonDatasetConfig):
    """Configuration for the Amazon Subscription Boxes review file."""

    remote_filename: str = "Subscription_Boxes.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Subscription_Boxes.jsonl.gz"
    )
47
+
48
+
49
@dataclass
class AmazonBookDatasetConfig(AmazonDatasetConfig):
    """Configuration for the Amazon Books review file."""

    remote_filename: str = "Books.jsonl.gz"
    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/"
        "raw/review_categories/Books.jsonl.gz"
    )
55
+
56
+
57
@dataclass
class AmazonItemMetadataConfig(MetadataConfig, AmazonDatasetConfig):
    """
    Base configuration for Amazon product (item) metadata.

    Declares the column name of every metadata field:
    - the product identifier (parent ASIN)
    - descriptive fields (title, main category, price, ratings)
    - rich content (features, description, images, videos)
    - relational fields (store, details, bought together)

    ``column_names`` and ``dtype_dict`` are derived from these fields, so
    overriding a field keeps both in sync.
    """

    item_ix: str = "parent_asin"
    """Column holding the product identifier (parent ASIN)."""
    main_category_ix: str = "main_category"
    """Column holding the main product category."""
    title_ix: str = "title"
    """Column holding the product title."""
    average_rating_ix: str = "average_rating"
    """Column holding the average product rating (0-5)."""
    rating_number_ix: str = "rating_number"
    """Column holding the number of ratings received."""
    features_ix: str = "features"
    """Column holding the product features (list)."""
    description_ix: str = "description"
    """Column holding the product description (list)."""
    price_ix: str = "price"
    """Column holding the product price."""
    images_ix: str = "images"
    """Column holding the product image URLs (list)."""
    videos_ix: str = "videos"
    """Column holding the product video URLs (list)."""
    store_ix: str = "store"
    """Column holding the store/seller information."""
    categories_ix: str = "categories"
    """Column holding the category hierarchy (list)."""
    details_ix: str = "details"
    """Column holding the product details (dict)."""
    bought_together_ix: str = "bought_together"
    """Column holding the products bought together (list)."""

    @property
    def column_names(self) -> list[str]:
        """Names of all metadata columns, in the order used by ``dtype_dict``."""
        ordered = (
            self.main_category_ix,
            self.title_ix,
            self.average_rating_ix,
            self.rating_number_ix,
            self.features_ix,
            self.description_ix,
            self.price_ix,
            self.images_ix,
            self.videos_ix,
            self.store_ix,
            self.categories_ix,
            self.details_ix,
            self.item_ix,
            self.bought_together_ix,
        )
        return list(ordered)

    @property
    def dtype_dict(self) -> dict:
        """Mapping from each metadata column name to the dtype it is loaded as."""
        # Kept as ordered pairs so the resulting dict has the same key order
        # as ``column_names``.
        column_types = [
            (self.main_category_ix, str),
            (self.title_ix, str),
            (self.average_rating_ix, "float32"),
            (self.rating_number_ix, "int64"),
            (self.features_ix, list),
            (self.description_ix, list),
            (self.price_ix, "float32"),
            (self.images_ix, list),
            (self.videos_ix, list),
            (self.store_ix, str),
            (self.categories_ix, list),
            (self.details_ix, dict),
            (self.item_ix, str),
            (self.bought_together_ix, list),
        ]
        return dict(column_types)
137
+
138
+
139
@dataclass
class AmazonDigitalMusicItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon Digital Music category."""

    remote_filename: str = "meta_Digital_Music.jsonl.gz"
    """Name of the Digital Music metadata file."""

    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/"
        "meta_Digital_Music.jsonl.gz"
    )
    """URL the Digital Music metadata file is fetched from."""
150
+
151
+
152
@dataclass
class AmazonMoviesAndTVItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon Movies and TV category."""

    remote_filename: str = "meta_Movies_and_TV.jsonl.gz"
    """Name of the Movies and TV metadata file."""

    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/"
        "meta_Movies_and_TV.jsonl.gz"
    )
    """URL the Movies and TV metadata file is fetched from."""
163
+
164
+
165
@dataclass
class AmazonSubscriptionBoxesItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon Subscription Boxes category."""

    remote_filename: str = "meta_Subscription_Boxes.jsonl.gz"
    """Name of the Subscription Boxes metadata file."""

    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/"
        "meta_Subscription_Boxes.jsonl.gz"
    )
    """URL the Subscription Boxes metadata file is fetched from."""
176
+
177
+
178
@dataclass
class AmazonBooksItemMetadataConfig(AmazonItemMetadataConfig):
    """Item metadata configuration for the Amazon Books category."""

    remote_filename: str = "meta_Books.jsonl.gz"
    """Name of the Books metadata file."""

    dataset_url: str = (
        "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/"
        "meta_Books.jsonl.gz"
    )
    """URL the Books metadata file is fetched from."""
@@ -0,0 +1,72 @@
1
+ from dataclasses import dataclass
2
+
3
+ from recnexteval.utils import get_data_dir
4
+
5
+
6
@dataclass
class DatasetConfig:
    """Common settings that every dataset configuration builds on."""

    user_ix: str = "user_id"
    """Column in the DataFrame that holds the user identifiers."""
    item_ix: str = "item_id"
    """Column in the DataFrame that holds the item identifiers."""
    timestamp_ix: str = "timestamp"
    """Column in the DataFrame that holds the interaction time in seconds since epoch."""
    dataset_url: str = "http://example.com"
    """Location the dataset is downloaded from."""
    default_base_path: str = str(get_data_dir())
    """Directory under which the dataset is stored by default."""
    remote_zipname: str = ""
    remote_filename: str = ""
    """Name of the file that contains the user interactions."""

    @property
    def default_filename(self) -> str:
        """Local filename built as ``<remote_zipname>_<remote_filename>``.

        Falls back to ``"dataset.csv"`` when either component is empty.
        """
        if self.remote_zipname and self.remote_filename:
            return f"{self.remote_zipname}_{self.remote_filename}"
        return "dataset.csv"
30
+
31
+
32
@dataclass
class MetadataConfig(DatasetConfig):
    """Base configuration for metadata files that accompany a dataset.

    Subclasses describe the file layout by overriding ``column_names`` and
    ``dtype_dict``; both are empty at this level.
    """

    sep: str = "|"
    """Delimiter separating the columns in the data file."""

    def __post_init__(self) -> None:
        # The path is read through super(), i.e. from the next class in the
        # instance's MRO, and then assigned unconditionally. Any
        # ``default_base_path`` passed to the constructor is therefore
        # replaced by "<parent class default>/metadata".
        parent_path = super().default_base_path
        self.default_base_path = parent_path + "/metadata"

    @property
    def column_names(self) -> list[str]:
        """
        Column names in file order, intended for pd.read_table.

        Returns:
            list[str]: Column names as they appear in the file. Empty here;
            subclasses supply the real list.

        Example:
            ["userId", "age", "gender", "occupation", "zipcode"]
        """
        names: list[str] = []
        return names

    @property
    def dtype_dict(self) -> dict:
        """
        Column-name to dtype mapping.

        Intended for the ``dtype`` parameter of pd.read_table() so columns
        are loaded with the right types.

        Returns:
            dict: Mapping of column names to dtypes. Empty here; subclasses
            supply the real mapping.

        Example:
            {
                "age": "int64",
                "gender": "<U1",  # string
                "occupation": "object",
                "zipcode": "object"
            }
        """
        mapping: dict = {}
        return mapping
@@ -0,0 +1,105 @@
1
+ from dataclasses import dataclass
2
+
3
+ from .base import DatasetConfig, MetadataConfig
4
+
5
+
6
@dataclass
class LastFMDatasetConfig(DatasetConfig):
    """Configuration for the LastFM (hetrec2011-lastfm-2k) interaction data."""

    user_ix: str = "userID"
    item_ix: str = "artistID"
    timestamp_ix: str = "timestamp"
    tag_ix: str = "tagID"
    """Column in the DataFrame that holds the tag a user gave to the item."""
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    remote_filename: str = "user_taggedartists-timestamps.dat"
    default_base_path: str = f"{DatasetConfig.default_base_path}/lastfm"
19
+
20
+
21
@dataclass
class LastFMUserMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM user metadata file (user/friend pairs)."""

    user_ix: str = "userID"
    """Column holding the user identifiers."""
    friend_ix: str = "friendID"
    """Column holding the friend identifiers."""

    remote_filename: str = "user_friends.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Column names of the file: the user column followed by the friend column."""
        return [self.user_ix, self.friend_ix]
41
+
42
+
43
@dataclass
class LastFMItemMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM item metadata file (``artists.dat``)."""

    item_ix: str = "id"
    """Column holding the item identifiers."""
    name_ix: str = "name"
    """Column holding the item names."""
    url_ix: str = "url"
    """Column holding the item URLs."""
    picture_url_ix: str = "pictureURL"
    """Column holding the item picture URLs."""

    remote_filename: str = "artists.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Column names of the file: id, name, URL, picture URL."""
        ordered = (self.item_ix, self.name_ix, self.url_ix, self.picture_url_ix)
        return list(ordered)

    @property
    def dtype_dict(self) -> dict:
        """Dtype mapping for the text columns; the id column is not listed."""
        column_types = [
            (self.name_ix, str),
            (self.url_ix, str),
            (self.picture_url_ix, str),
        ]
        return dict(column_types)
77
+
78
+
79
@dataclass
class LastFMTagMetadataConfig(MetadataConfig, LastFMDatasetConfig):
    """Configuration for the LastFM tag metadata file (``tags.dat``)."""

    tag_ix: str = "tagID"
    """Column holding the tag identifiers."""
    name_ix: str = "tagValue"
    """Column holding the tag names."""

    remote_filename: str = "tags.dat"
    remote_zipname: str = "hetrec2011-lastfm-2k"
    dataset_url: str = "https://files.grouplens.org/datasets/hetrec2011"
    sep: str = "\t"

    @property
    def column_names(self) -> list[str]:
        """Column names of the file: the tag id column followed by the tag name column."""
        return [self.tag_ix, self.name_ix]

    @property
    def dtype_dict(self) -> dict:
        """Dtype mapping; both the tag id and the tag name are loaded as ``str``."""
        column_types = [
            (self.tag_ix, str),
            (self.name_ix, str),
        ]
        return dict(column_types)
@@ -0,0 +1,169 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any
3
+
4
+ import numpy as np
5
+
6
+ from .base import DatasetConfig, MetadataConfig
7
+
8
+
9
@dataclass
class MovieLensDatasetConfig(DatasetConfig):
    """Settings shared by the MovieLens dataset configurations."""

    user_ix: str = "userId"
    item_ix: str = "movieId"
    timestamp_ix: str = "timestamp"
    rating_ix: str = "rating"
    """Column in the DataFrame that holds the rating a user gave to the item."""
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    remote_zipname: str = "ml-100k"
    """Name of the zip archive on the MovieLens server."""
    remote_filename: str = "ratings.csv"
    """Name of the ratings file on the MovieLens server."""
    default_base_path: str = f"{DatasetConfig.default_base_path}/movielens"
24
+
25
+
26
@dataclass
class MovieLens100KDatasetConfig(MovieLensDatasetConfig):
    """Configuration for MovieLens 100K; only the ratings filename differs from the base."""

    remote_filename: str = "u.data"
31
+
32
+
33
@dataclass
class MovieLens100kUserMetadataConfig(MetadataConfig, MovieLensDatasetConfig):
    """
    Configuration for the MovieLens 100K user metadata file (``u.user``).

    Declares the column names of the user demographic data:
    - user id
    - age
    - gender
    - occupation
    - zipcode

    ``column_names`` and ``dtype_dict`` are derived from these fields.
    """

    user_ix: str = "userId"
    """Column holding the user identifiers."""
    age_ix: str = "age"
    """Column holding the user age."""
    gender_ix: str = "gender"
    """Column holding the user gender."""
    occupation_ix: str = "occupation"
    """Column holding the user occupation."""
    zipcode_ix: str = "zipcode"
    """Column holding the user zipcode."""

    remote_filename: str = "u.user"
    """Name of the user metadata file inside the remote zip."""
    remote_zipname: str = "ml-100k"
    """Name of the zip archive on the remote server."""
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    """Location the metadata is downloaded from."""

    @property
    def column_names(self) -> list[str]:
        """Column names of the file: user id, age, gender, occupation, zipcode."""
        ordered = (
            self.user_ix,
            self.age_ix,
            self.gender_ix,
            self.occupation_ix,
            self.zipcode_ix,
        )
        return list(ordered)

    @property
    def dtype_dict(self) -> dict:
        """Dtype mapping for the demographic columns; the user id column is not listed."""
        column_types = [
            (self.age_ix, np.int64),
            (self.gender_ix, str),
            (self.occupation_ix, str),
            (self.zipcode_ix, str),
        ]
        return dict(column_types)
83
+
84
+
85
@dataclass
class MovieLens100kItemMetadataConfig(MetadataConfig, MovieLensDatasetConfig):
    """
    Configuration for the MovieLens 100K item metadata file (``u.item``).

    Declares the column names of the movie metadata:
    - movie id
    - title, release date, video release date, IMDB URL
    - 19 binary genre indicator columns

    ``column_names`` and ``dtype_dict`` are derived from these fields.
    """

    item_ix: str = "movieId"
    """Column holding the movie identifiers."""
    title_ix: str = "title"
    """Column holding the movie title."""
    release_date_ix: str = "releaseDate"
    """Column holding the movie release date."""
    video_release_date_ix: str = "videoReleaseDate"
    """Column holding the video release date."""
    imdb_url_ix: str = "imdbUrl"
    """Column holding the IMDB URL."""
    genres: tuple[str, ...] = (
        "unknown",
        "action",
        "adventure",
        "animation",
        "children",
        "comedy",
        "crime",
        "documentary",
        "drama",
        "fantasy",
        "filmNoir",
        "horror",
        "musical",
        "mystery",
        "romance",
        "sciFi",
        "thriller",
        "war",
        "western",
    )
    """The 19 genre names in canonical order."""

    remote_filename: str = "u.item"
    remote_zipname: str = "ml-100k"
    dataset_url: str = "https://files.grouplens.org/datasets/movielens"
    encoding: str = "ISO-8859-1"
    """Encoding of the file (ISO-8859-1 is needed for special characters)."""

    @property
    def non_genre_columns(self) -> list[str]:
        """
        Names of the columns that are not genre indicators.

        Returns:
            list[str]: [movie_id, title, release_date, video_release_date, imdb_url]

        Example:
            ["movieId", "title", "releaseDate", "videoReleaseDate", "imdbUrl"]
        """
        ordered = (
            self.item_ix,
            self.title_ix,
            self.release_date_ix,
            self.video_release_date_ix,
            self.imdb_url_ix,
        )
        return list(ordered)

    @property
    def column_names(self) -> list[str]:
        """All column names: the non-genre columns followed by the genre columns."""
        columns = list(self.non_genre_columns)
        columns.extend(self.genres)
        return columns

    @property
    def dtype_dict(self) -> dict:
        """Dtype mapping: text columns as ``str``, genre columns as ``np.int64``.

        The movie id column is not listed.
        """
        text_columns = (
            self.title_ix,
            self.release_date_ix,
            self.video_release_date_ix,
            self.imdb_url_ix,
        )
        mapping: dict[str, Any] = {column: str for column in text_columns}
        for genre in self.genres:
            mapping[genre] = np.int64
        return mapping
@@ -0,0 +1,25 @@
1
+ from dataclasses import dataclass
2
+
3
+ from .base import DatasetConfig
4
+
5
+
6
@dataclass
class YelpDatasetConfig(DatasetConfig):
    """Configuration for the Yelp review dataset.

    Note: the Yelp dataset has to be downloaded manually from
    https://www.yelp.com/dataset/download because a license agreement must
    be accepted first. Convert the download from JSON to CSV and place it
    in the data directory.
    """

    user_ix: str = "user_id"
    item_ix: str = "business_id"
    timestamp_ix: str = "date"
    rating_ix: str = "stars"
    dataset_url: str = "https://www.yelp.com/dataset/download"
    remote_filename: str = "yelp_academic_dataset_review.csv"

    @property
    def default_filename(self) -> str:
        """Local filename, which is the remote filename without any zip-name prefix."""
        return self.remote_filename
@@ -0,0 +1,24 @@
1
+ from .amazon import (
2
+ AmazonBookDataset,
3
+ AmazonMovieDataset,
4
+ AmazonMusicDataset,
5
+ AmazonSubscriptionBoxesDataset,
6
+ )
7
+ from .base import Dataset
8
+ from .lastfm import LastFMDataset
9
+ from .movielens import MovieLens100K
10
+ from .test import TestDataset
11
+ from .yelp import YelpDataset
12
+
13
+
14
+ __all__ = [
15
+ "AmazonBookDataset",
16
+ "AmazonMovieDataset",
17
+ "AmazonMusicDataset",
18
+ "AmazonSubscriptionBoxesDataset",
19
+ "LastFMDataset",
20
+ "MovieLens100K",
21
+ "YelpDataset",
22
+ "TestDataset",
23
+ "Dataset",
24
+ ]