datarec-lib 1.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datarec_lib-1.3.2/LICENSE.txt +21 -0
- datarec_lib-1.3.2/PKG-INFO +260 -0
- datarec_lib-1.3.2/README.md +183 -0
- datarec_lib-1.3.2/datarec/__init__.py +28 -0
- datarec_lib-1.3.2/datarec/data/__init__.py +4 -0
- datarec_lib-1.3.2/datarec/data/characteristics.py +138 -0
- datarec_lib-1.3.2/datarec/data/datarec_builder.py +237 -0
- datarec_lib-1.3.2/datarec/data/dataset.py +804 -0
- datarec_lib-1.3.2/datarec/data/download.py +242 -0
- datarec_lib-1.3.2/datarec/data/graph.py +266 -0
- datarec_lib-1.3.2/datarec/data/resource.py +413 -0
- datarec_lib-1.3.2/datarec/data/source.py +318 -0
- datarec_lib-1.3.2/datarec/data/topological_characteristics.py +179 -0
- datarec_lib-1.3.2/datarec/data/torch_dataset.py +183 -0
- datarec_lib-1.3.2/datarec/data/utils.py +308 -0
- datarec_lib-1.3.2/datarec/datasets/__init__.py +100 -0
- datarec_lib-1.3.2/datarec/datasets/base.py +27 -0
- datarec_lib-1.3.2/datarec/datasets/examples.py +167 -0
- datarec_lib-1.3.2/datarec/io/__init__.py +10 -0
- datarec_lib-1.3.2/datarec/io/cache.py +522 -0
- datarec_lib-1.3.2/datarec/io/frameworks/__init__.py +1 -0
- datarec_lib-1.3.2/datarec/io/frameworks/clayrs/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/clayrs/clayrs.py +79 -0
- datarec_lib-1.3.2/datarec/io/frameworks/cornac/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/cornac/cornac.py +76 -0
- datarec_lib-1.3.2/datarec/io/frameworks/daisyrec/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/daisyrec/daisyrec.py +114 -0
- datarec_lib-1.3.2/datarec/io/frameworks/daisyrec/loader.py +438 -0
- datarec_lib-1.3.2/datarec/io/frameworks/elliot/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/elliot/datarec_config.py +21 -0
- datarec_lib-1.3.2/datarec/io/frameworks/elliot/elliot.py +105 -0
- datarec_lib-1.3.2/datarec/io/frameworks/exporter.py +223 -0
- datarec_lib-1.3.2/datarec/io/frameworks/lenskit/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/lenskit/lenskit.py +73 -0
- datarec_lib-1.3.2/datarec/io/frameworks/manager.py +58 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recbole/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recbole/recbole.py +136 -0
- datarec_lib-1.3.2/datarec/io/frameworks/rechorus/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/rechorus/rechorus.py +83 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recommenders/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recommenders/recommenders.py +80 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recpack/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recpack/copy_me_in__init__.py +90 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recpack/datarec.py +41 -0
- datarec_lib-1.3.2/datarec/io/frameworks/recpack/recpack.py +88 -0
- datarec_lib-1.3.2/datarec/io/paths.py +125 -0
- datarec_lib-1.3.2/datarec/io/rawdata.py +188 -0
- datarec_lib-1.3.2/datarec/io/readers/__init__.py +2 -0
- datarec_lib-1.3.2/datarec/io/readers/_decorators.py +28 -0
- datarec_lib-1.3.2/datarec/io/readers/sequences/__init__.py +2 -0
- datarec_lib-1.3.2/datarec/io/readers/sequences/json.py +209 -0
- datarec_lib-1.3.2/datarec/io/readers/sequences/tabular.py +498 -0
- datarec_lib-1.3.2/datarec/io/readers/transactions/__init__.py +3 -0
- datarec_lib-1.3.2/datarec/io/readers/transactions/json.py +189 -0
- datarec_lib-1.3.2/datarec/io/readers/transactions/jsonl.py +42 -0
- datarec_lib-1.3.2/datarec/io/readers/transactions/tabular.py +270 -0
- datarec_lib-1.3.2/datarec/io/utils.py +29 -0
- datarec_lib-1.3.2/datarec/io/writers/__init__.py +2 -0
- datarec_lib-1.3.2/datarec/io/writers/sequences/__init__.py +2 -0
- datarec_lib-1.3.2/datarec/io/writers/sequences/json.py +249 -0
- datarec_lib-1.3.2/datarec/io/writers/sequences/tabular.py +304 -0
- datarec_lib-1.3.2/datarec/io/writers/transactions/__init__.py +3 -0
- datarec_lib-1.3.2/datarec/io/writers/transactions/json.py +148 -0
- datarec_lib-1.3.2/datarec/io/writers/transactions/jsonl.py +42 -0
- datarec_lib-1.3.2/datarec/io/writers/transactions/tabular.py +110 -0
- datarec_lib-1.3.2/datarec/pipeline/__init__.py +1 -0
- datarec_lib-1.3.2/datarec/pipeline/pipeline.py +368 -0
- datarec_lib-1.3.2/datarec/pipeline/pipeline_step.py +19 -0
- datarec_lib-1.3.2/datarec/processing/__init__.py +5 -0
- datarec_lib-1.3.2/datarec/processing/binarizer.py +86 -0
- datarec_lib-1.3.2/datarec/processing/cold.py +51 -0
- datarec_lib-1.3.2/datarec/processing/kcore.py +348 -0
- datarec_lib-1.3.2/datarec/processing/processor.py +47 -0
- datarec_lib-1.3.2/datarec/processing/rating.py +149 -0
- datarec_lib-1.3.2/datarec/processing/temporal.py +63 -0
- datarec_lib-1.3.2/datarec/registry/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/registry/datasets/alibaba_ifashion.yml +39 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_baby.yml +21 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_beauty.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_books.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_clothing.yml +21 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_music.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_office.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_sports_and_outdoors.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_toys_and_games.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/amazon_videogames.yml +22 -0
- datarec_lib-1.3.2/datarec/registry/datasets/ambar.yml +21 -0
- datarec_lib-1.3.2/datarec/registry/datasets/ciao.yml +20 -0
- datarec_lib-1.3.2/datarec/registry/datasets/citeulike.yml +51 -0
- datarec_lib-1.3.2/datarec/registry/datasets/epinions.yml +30 -0
- datarec_lib-1.3.2/datarec/registry/datasets/gowalla.yml +30 -0
- datarec_lib-1.3.2/datarec/registry/datasets/lastfm.yml +23 -0
- datarec_lib-1.3.2/datarec/registry/datasets/mind.yml +28 -0
- datarec_lib-1.3.2/datarec/registry/datasets/movielens.yml +21 -0
- datarec_lib-1.3.2/datarec/registry/datasets/tmall.yml +19 -0
- datarec_lib-1.3.2/datarec/registry/datasets/yelp.yml +8 -0
- datarec_lib-1.3.2/datarec/registry/metrics/alibaba_ifashion_v1.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_baby_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_beauty_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_books_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_clothing_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_music_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_office_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_sports_and_outdoors_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_toys_and_games_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/amazon_videogames_2023.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/ambar_2024.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/ciao_v1.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/citeulike_a.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/citeulike_t.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/epinions_v1.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/gowalla_checkins.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/gowalla_friendships.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/lastfm_2011.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/mind_large.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/mind_small.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/movielens_100k.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/movielens_1m.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/movielens_20m.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/tmall_v1.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/metrics/yelp_v1.yml +17 -0
- datarec_lib-1.3.2/datarec/registry/utils.py +118 -0
- datarec_lib-1.3.2/datarec/registry/versions/alibaba_ifashion_v1.yml +64 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_baby_2023.yml +30 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_beauty_2023.yml +32 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_books_2023.yml +30 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_clothing_2023.yml +31 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_music_2023.yml +31 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_office_2023.yml +31 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_sports_and_outdoors_2023.yml +30 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_toys_and_games_2023.yml +31 -0
- datarec_lib-1.3.2/datarec/registry/versions/amazon_videogames_2023.yml +31 -0
- datarec_lib-1.3.2/datarec/registry/versions/ambar_2024.yml +79 -0
- datarec_lib-1.3.2/datarec/registry/versions/ciao_v1.yml +51 -0
- datarec_lib-1.3.2/datarec/registry/versions/citeulike_a.yml +121 -0
- datarec_lib-1.3.2/datarec/registry/versions/citeulike_t.yml +121 -0
- datarec_lib-1.3.2/datarec/registry/versions/epinions_v1.yml +28 -0
- datarec_lib-1.3.2/datarec/registry/versions/gowalla_checkins.yml +29 -0
- datarec_lib-1.3.2/datarec/registry/versions/gowalla_friendships.yml +29 -0
- datarec_lib-1.3.2/datarec/registry/versions/lastfm_2011.yml +59 -0
- datarec_lib-1.3.2/datarec/registry/versions/mind_large.yml +114 -0
- datarec_lib-1.3.2/datarec/registry/versions/mind_small.yml +79 -0
- datarec_lib-1.3.2/datarec/registry/versions/movielens_100k.yml +55 -0
- datarec_lib-1.3.2/datarec/registry/versions/movielens_1m.yml +42 -0
- datarec_lib-1.3.2/datarec/registry/versions/movielens_20m.yml +61 -0
- datarec_lib-1.3.2/datarec/registry/versions/tmall_v1.yml +58 -0
- datarec_lib-1.3.2/datarec/registry/versions/yelp_v1.yml +71 -0
- datarec_lib-1.3.2/datarec/splitters/__init__.py +6 -0
- datarec_lib-1.3.2/datarec/splitters/splitter.py +50 -0
- datarec_lib-1.3.2/datarec/splitters/uniform/__init__.py +3 -0
- datarec_lib-1.3.2/datarec/splitters/uniform/hold_out.py +110 -0
- datarec_lib-1.3.2/datarec/splitters/uniform/temporal/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/splitters/uniform/temporal/hold_out.py +110 -0
- datarec_lib-1.3.2/datarec/splitters/uniform/temporal/threshold.py +66 -0
- datarec_lib-1.3.2/datarec/splitters/user_stratified/__init__.py +3 -0
- datarec_lib-1.3.2/datarec/splitters/user_stratified/hold_out.py +114 -0
- datarec_lib-1.3.2/datarec/splitters/user_stratified/leave_out.py +240 -0
- datarec_lib-1.3.2/datarec/splitters/user_stratified/temporal/__init__.py +0 -0
- datarec_lib-1.3.2/datarec/splitters/user_stratified/temporal/leave_out.py +243 -0
- datarec_lib-1.3.2/datarec/splitters/utils.py +133 -0
- datarec_lib-1.3.2/datarec_lib.egg-info/PKG-INFO +260 -0
- datarec_lib-1.3.2/datarec_lib.egg-info/SOURCES.txt +165 -0
- datarec_lib-1.3.2/datarec_lib.egg-info/dependency_links.txt +1 -0
- datarec_lib-1.3.2/datarec_lib.egg-info/requires.txt +59 -0
- datarec_lib-1.3.2/datarec_lib.egg-info/top_level.txt +1 -0
- datarec_lib-1.3.2/pyproject.toml +109 -0
- datarec_lib-1.3.2/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Alberto Carlo Maria Mancino
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datarec-lib
|
|
3
|
+
Version: 1.3.2
|
|
4
|
+
Summary: Standardized & reproducible data management for recommender systems.
|
|
5
|
+
Author-email: Alberto Carlo Maria Mancino <alberto.mancino@poliba.it>, Salvatore Bufi <salvatore.bufi@poliba.it>, Angela Di Fazio <angela.difazio@poliba.it>, Daniele Malitesta <daniele.malitesta@centralesupelec.fr>, Antonio Ferrara <antonio.ferrara@poliba.it>, Claudio Pomo <claudio.pomo@poliba.it>, Tommaso Di Noia <tommaso.dinoia@poliba.it>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://sisinflab.github.io/DataRec/
|
|
8
|
+
Project-URL: Repository, https://github.com/sisinflab/DataRec
|
|
9
|
+
Project-URL: Issues, https://github.com/sisinflab/DataRec/issues
|
|
10
|
+
Keywords: recommender-systems,data,ml,etl
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: <3.15,>=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE.txt
|
|
20
|
+
Requires-Dist: numpy<3,>=1.23
|
|
21
|
+
Requires-Dist: pandas<3,>=2.3
|
|
22
|
+
Requires-Dist: scikit-learn<2,>=1.6
|
|
23
|
+
Requires-Dist: tqdm<5,>=4.64
|
|
24
|
+
Requires-Dist: gdown<5,>=4.7
|
|
25
|
+
Requires-Dist: requests<3,>=2.28
|
|
26
|
+
Requires-Dist: py7zr<1,>=0.22
|
|
27
|
+
Requires-Dist: PyYAML<7,>=6.0
|
|
28
|
+
Requires-Dist: platformdirs<5,>=4.4.0
|
|
29
|
+
Requires-Dist: appdirs<2,>=1.4.4
|
|
30
|
+
Requires-Dist: typing_extensions
|
|
31
|
+
Requires-Dist: networkx<4,>=3.2
|
|
32
|
+
Requires-Dist: python-igraph<2,>=1.0
|
|
33
|
+
Requires-Dist: PySocks<2,>=1.7
|
|
34
|
+
Provides-Extra: torch
|
|
35
|
+
Requires-Dist: torch>=2.0.0; extra == "torch"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest<9,>=8.4; extra == "dev"
|
|
38
|
+
Requires-Dist: build<2,>=1.3; extra == "dev"
|
|
39
|
+
Requires-Dist: twine<7,>=6.2; extra == "dev"
|
|
40
|
+
Requires-Dist: pip-chill<2,>=1.0; extra == "dev"
|
|
41
|
+
Provides-Extra: docs
|
|
42
|
+
Requires-Dist: babel; extra == "docs"
|
|
43
|
+
Requires-Dist: backrefs; extra == "docs"
|
|
44
|
+
Requires-Dist: certifi; extra == "docs"
|
|
45
|
+
Requires-Dist: charset-normalizer; extra == "docs"
|
|
46
|
+
Requires-Dist: click; extra == "docs"
|
|
47
|
+
Requires-Dist: colorama; extra == "docs"
|
|
48
|
+
Requires-Dist: ghp-import; extra == "docs"
|
|
49
|
+
Requires-Dist: griffe; extra == "docs"
|
|
50
|
+
Requires-Dist: idna; extra == "docs"
|
|
51
|
+
Requires-Dist: Jinja2; extra == "docs"
|
|
52
|
+
Requires-Dist: Markdown; extra == "docs"
|
|
53
|
+
Requires-Dist: MarkupSafe; extra == "docs"
|
|
54
|
+
Requires-Dist: mergedeep; extra == "docs"
|
|
55
|
+
Requires-Dist: mkdocs; extra == "docs"
|
|
56
|
+
Requires-Dist: mkdocs-autorefs; extra == "docs"
|
|
57
|
+
Requires-Dist: mkdocs-get-deps; extra == "docs"
|
|
58
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
59
|
+
Requires-Dist: mkdocs-material-extensions; extra == "docs"
|
|
60
|
+
Requires-Dist: mkdocs-section-index; extra == "docs"
|
|
61
|
+
Requires-Dist: mkdocstrings; extra == "docs"
|
|
62
|
+
Requires-Dist: mkdocstrings-python; extra == "docs"
|
|
63
|
+
Requires-Dist: packaging; extra == "docs"
|
|
64
|
+
Requires-Dist: paginate; extra == "docs"
|
|
65
|
+
Requires-Dist: pathspec; extra == "docs"
|
|
66
|
+
Requires-Dist: platformdirs; extra == "docs"
|
|
67
|
+
Requires-Dist: Pygments; extra == "docs"
|
|
68
|
+
Requires-Dist: pymdown-extensions; extra == "docs"
|
|
69
|
+
Requires-Dist: python-dateutil; extra == "docs"
|
|
70
|
+
Requires-Dist: PyYAML; extra == "docs"
|
|
71
|
+
Requires-Dist: pyyaml_env_tag; extra == "docs"
|
|
72
|
+
Requires-Dist: requests; extra == "docs"
|
|
73
|
+
Requires-Dist: six; extra == "docs"
|
|
74
|
+
Requires-Dist: urllib3; extra == "docs"
|
|
75
|
+
Requires-Dist: watchdog; extra == "docs"
|
|
76
|
+
Dynamic: license-file
|
|
77
|
+
|
|
78
|
+
# 🧩 DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems
|
|
79
|
+
|
|
80
|
+
[Documentation](https://sisinflab.github.io/DataRec/)
|
|
81
|
+
[License: MIT](LICENSE)
|
|
82
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
<img src="docs/assets/images/datarec_logo.png" width="600" alt="DataRec Logo">
|
|
87
|
+
|
|
88
|
+
**DataRec** focuses on the **data management phase** of recommender systems, promoting **standardization**, **interoperability**, and **best practices** for data filtering, splitting, analysis, and export.
|
|
89
|
+
|
|
90
|
+
Official repository of the paper:
|
|
91
|
+
📄 *DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems* (SIGIR 2025) [doi](https://dl.acm.org/doi/10.1145/3726302.3730320)
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 📑 Table of Contents
|
|
96
|
+
- [Features](#features-)
|
|
97
|
+
- [Installation](#installation)
|
|
98
|
+
- [Quickstart](#quickstart-)
|
|
99
|
+
- [Datasets](#datasets-)
|
|
100
|
+
- [Documentation](#documentation-)
|
|
101
|
+
- [Contributing](#contributing-)
|
|
102
|
+
- [Citation](#citation-)
|
|
103
|
+
- [Authors and Contributors](#authors-and-contributors-)
|
|
104
|
+
- [Related Projects](#related-projects-)
|
|
105
|
+
- [License](#license-)
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Features ✨
|
|
110
|
+
- **Dataset Management**: multi-format I/O with dynamic schema specification.
|
|
111
|
+
- **Reference Datasets**: curated, versioned, and traceable datasets.
|
|
112
|
+
- **Filtering Strategies**: widely used user/item interaction filters.
|
|
113
|
+
- **Splitting Strategies**: temporal and random splits for reproducible evaluation.
|
|
114
|
+
- **Data Characteristics**: compute dataset-level statistics (e.g., sparsity, popularity).
|
|
115
|
+
- **Interoperability**: export datasets to external recommendation frameworks.
|
|
116
|
+
|
|
117
|
+
<img src="docs/assets/images/datarec_architecture.png" width="400" alt="DataRec Architecture">
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Installation
|
|
122
|
+
|
|
123
|
+
### From source (recommended for development)
|
|
124
|
+
```bash
|
|
125
|
+
git clone https://github.com/sisinflab/DataRec.git
|
|
126
|
+
cd DataRec
|
|
127
|
+
python3.9 -m venv venv
|
|
128
|
+
source venv/bin/activate
|
|
129
|
+
pip install --upgrade pip
|
|
130
|
+
pip install -r requirements.txt
|
|
131
|
+
# editable mode + optional dependency groups (defined in pyproject.toml)
|
|
132
|
+
pip install -e '.[dev,docs]'
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
## Quickstart 🚀
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from datarec.datasets import AmazonOffice
|
|
140
|
+
from datarec.processing import FilterOutDuplicatedInteractions, UserItemIterativeKCore
|
|
141
|
+
from datarec.splitters import RandomHoldOut
|
|
142
|
+
|
|
143
|
+
# 1️⃣ Load a reference dataset
|
|
144
|
+
data = AmazonOffice(version='2023').prepare_and_load()
|
|
145
|
+
|
|
146
|
+
# 2️⃣ Apply preprocessing filters
|
|
147
|
+
data = FilterOutDuplicatedInteractions().run(data)
|
|
148
|
+
data = UserItemIterativeKCore(cores=5).run(data)
|
|
149
|
+
|
|
150
|
+
# 3️⃣ Split into train/validation/test
|
|
151
|
+
splitter = RandomHoldOut(test_ratio=0.2, val_ratio=0.1, seed=42)
|
|
152
|
+
splits = splitter.run(data)
|
|
153
|
+
|
|
154
|
+
train, val, test = splits['train'], splits['val'], splits['test']
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Pipeline paths
|
|
158
|
+
When using YAML pipelines, store only filenames in the steps and pass the base folders at runtime:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
from datarec.pipeline import Pipeline
|
|
162
|
+
|
|
163
|
+
pipeline = Pipeline.from_yaml("create_pipeline.yml")
|
|
164
|
+
pipeline.apply(input_folder="./data", output_folder="./outputs")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
For file loaders use `filename` (instead of `path`) and for export steps use `filename`
|
|
168
|
+
(instead of `output_path`) in the YAML.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Datasets 📊
|
|
173
|
+
The complete and up-to-date list of datasets (with metadata and statistics) is available in the documentation:
|
|
174
|
+
|
|
175
|
+
👉 **[Datasets Section](https://sisinflab.github.io/DataRec/datasets_nav/)**
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Documentation 📚
|
|
180
|
+
Full documentation is available at: **https://sisinflab.github.io/DataRec/**
|
|
181
|
+
Includes API reference, guides, tutorials, and dataset overview.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Contributing 🤝
|
|
186
|
+
|
|
187
|
+
Contributions are welcome!
|
|
188
|
+
To contribute:
|
|
189
|
+
1. Create a feature/fix branch.
|
|
190
|
+
2. Add tests and documentation updates as needed.
|
|
191
|
+
3. Run tests before pushing.
|
|
192
|
+
4. Open a pull request describing your changes clearly.
|
|
193
|
+
|
|
194
|
+
> The project also receives updates from a **private development repository** maintained by SisInfLab.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Citation 📖
|
|
199
|
+
|
|
200
|
+
If you use DataRec in your research, please cite our SIGIR 2025 paper:
|
|
201
|
+
|
|
202
|
+
```bibtex
|
|
203
|
+
@inproceedings{DBLP:conf/sigir/MancinoBF0MPN25,
|
|
204
|
+
author = {Alberto Carlo Maria Mancino and
|
|
205
|
+
Salvatore Bufi and
|
|
206
|
+
Angela Di Fazio and
|
|
207
|
+
Antonio Ferrara and
|
|
208
|
+
Daniele Malitesta and
|
|
209
|
+
Claudio Pomo and
|
|
210
|
+
Tommaso Di Noia},
|
|
211
|
+
title = {DataRec: {A} Python Library for Standardized and Reproducible Data
|
|
212
|
+
Management in Recommender Systems},
|
|
213
|
+
booktitle = {{SIGIR}},
|
|
214
|
+
pages = {3478--3487},
|
|
215
|
+
publisher = {{ACM}},
|
|
216
|
+
year = {2025}
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Authors and Contributors 👥
|
|
223
|
+
|
|
224
|
+
**Authors**
|
|
225
|
+
- Alberto Carlo Maria Mancino (Politecnico di Bari)
|
|
226
|
+
- Salvatore Bufi
|
|
227
|
+
- Angela Di Fazio
|
|
228
|
+
- Daniele Malitesta
|
|
229
|
+
- Antonio Ferrara
|
|
230
|
+
- Claudio Pomo
|
|
231
|
+
- Tommaso Di Noia
|
|
232
|
+
|
|
233
|
+
### Contributors
|
|
234
|
+
|
|
235
|
+
| | | | |
|
|
236
|
+
|:--:|:--:|:--:|:--:|
|
|
237
|
+
| [<img src="https://avatars.githubusercontent.com/AlbertoMancino" width="70px"><br>**Alberto C. M. Mancino**](https://github.com/AlbertoMancino) | [<img src="https://avatars.githubusercontent.com/a-difazio" width="70px"><br>**Angela Di Fazio**](https://github.com/a-difazio) | [<img src="https://avatars.githubusercontent.com/salvatore-bufi" width="70px"><br>**Salvatore Bufi**](https://github.com/salvatore-bufi) | [<img src="https://avatars.githubusercontent.com/GiuseppeFasano" width="70px"><br>**Giuseppe Fasano**](https://github.com/GiuseppeFasano) |
|
|
238
|
+
| [<img src="https://avatars.githubusercontent.com/GianLu210" width="70px"><br>**Gianluca Colonna**](https://github.com/GianLu210) | [<img src="https://avatars.githubusercontent.com/MariaLuigiaN" width="70px"><br>**Maria L. N. De Bonis**](https://github.com/MariaLuigiaN) | [<img src="https://avatars.githubusercontent.com/Marco-Valentini" width="70px"><br>**Marco Valentini**](https://github.com/Marco-Valentini) | |
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Related Projects 🧩
|
|
242
|
+
|
|
243
|
+
- **Ducho** — library for multimodal representation learning: https://github.com/sisinflab/Ducho
|
|
244
|
+
- **D&D4Rec Tutorial (RecSys 2025)** — *Standard Practices for Data Processing and Multimodal Feature Extraction in Recommendation with DataRec and Ducho*:
|
|
245
|
+
https://sites.google.com/view/dd4rec-tutorial/home
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## License 📜
|
|
250
|
+
Distributed under the **MIT License**.
|
|
251
|
+
See [LICENSE](LICENSE).
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
<p align="center">
|
|
256
|
+
<em>Maintained with ❤️ by <a href="https://github.com/sisinflab">SisInfLab</a></em>
|
|
257
|
+
</p>
|
|
258
|
+
<p align="center">
|
|
259
|
+
<img src="docs/assets/images/sisinflab_logo.png" width="100" alt="SisInfLab Logo">
|
|
260
|
+
</p>
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# 🧩 DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems
|
|
2
|
+
|
|
3
|
+
[Documentation](https://sisinflab.github.io/DataRec/)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
<img src="docs/assets/images/datarec_logo.png" width="600" alt="DataRec Logo">
|
|
10
|
+
|
|
11
|
+
**DataRec** focuses on the **data management phase** of recommender systems, promoting **standardization**, **interoperability**, and **best practices** for data filtering, splitting, analysis, and export.
|
|
12
|
+
|
|
13
|
+
Official repository of the paper:
|
|
14
|
+
📄 *DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems* (SIGIR 2025) [doi](https://dl.acm.org/doi/10.1145/3726302.3730320)
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## 📑 Table of Contents
|
|
19
|
+
- [Features](#features-)
|
|
20
|
+
- [Installation](#installation)
|
|
21
|
+
- [Quickstart](#quickstart-)
|
|
22
|
+
- [Datasets](#datasets-)
|
|
23
|
+
- [Documentation](#documentation-)
|
|
24
|
+
- [Contributing](#contributing-)
|
|
25
|
+
- [Citation](#citation-)
|
|
26
|
+
- [Authors and Contributors](#authors-and-contributors-)
|
|
27
|
+
- [Related Projects](#related-projects-)
|
|
28
|
+
- [License](#license-)
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Features ✨
|
|
33
|
+
- **Dataset Management**: multi-format I/O with dynamic schema specification.
|
|
34
|
+
- **Reference Datasets**: curated, versioned, and traceable datasets.
|
|
35
|
+
- **Filtering Strategies**: widely used user/item interaction filters.
|
|
36
|
+
- **Splitting Strategies**: temporal and random splits for reproducible evaluation.
|
|
37
|
+
- **Data Characteristics**: compute dataset-level statistics (e.g., sparsity, popularity).
|
|
38
|
+
- **Interoperability**: export datasets to external recommendation frameworks.
|
|
39
|
+
|
|
40
|
+
<img src="docs/assets/images/datarec_architecture.png" width="400" alt="DataRec Architecture">
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
### From source (recommended for development)
|
|
47
|
+
```bash
|
|
48
|
+
git clone https://github.com/sisinflab/DataRec.git
|
|
49
|
+
cd DataRec
|
|
50
|
+
python3.9 -m venv venv
|
|
51
|
+
source venv/bin/activate
|
|
52
|
+
pip install --upgrade pip
|
|
53
|
+
pip install -r requirements.txt
|
|
54
|
+
# editable mode + optional dependency groups (defined in pyproject.toml)
|
|
55
|
+
pip install -e '.[dev,docs]'
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
## Quickstart 🚀
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from datarec.datasets import AmazonOffice
|
|
63
|
+
from datarec.processing import FilterOutDuplicatedInteractions, UserItemIterativeKCore
|
|
64
|
+
from datarec.splitters import RandomHoldOut
|
|
65
|
+
|
|
66
|
+
# 1️⃣ Load a reference dataset
|
|
67
|
+
data = AmazonOffice(version='2023').prepare_and_load()
|
|
68
|
+
|
|
69
|
+
# 2️⃣ Apply preprocessing filters
|
|
70
|
+
data = FilterOutDuplicatedInteractions().run(data)
|
|
71
|
+
data = UserItemIterativeKCore(cores=5).run(data)
|
|
72
|
+
|
|
73
|
+
# 3️⃣ Split into train/validation/test
|
|
74
|
+
splitter = RandomHoldOut(test_ratio=0.2, val_ratio=0.1, seed=42)
|
|
75
|
+
splits = splitter.run(data)
|
|
76
|
+
|
|
77
|
+
train, val, test = splits['train'], splits['val'], splits['test']
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Pipeline paths
|
|
81
|
+
When using YAML pipelines, store only filenames in the steps and pass the base folders at runtime:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from datarec.pipeline import Pipeline
|
|
85
|
+
|
|
86
|
+
pipeline = Pipeline.from_yaml("create_pipeline.yml")
|
|
87
|
+
pipeline.apply(input_folder="./data", output_folder="./outputs")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
For file loaders use `filename` (instead of `path`) and for export steps use `filename`
|
|
91
|
+
(instead of `output_path`) in the YAML.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Datasets 📊
|
|
96
|
+
The complete and up-to-date list of datasets (with metadata and statistics) is available in the documentation:
|
|
97
|
+
|
|
98
|
+
👉 **[Datasets Section](https://sisinflab.github.io/DataRec/datasets_nav/)**
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Documentation 📚
|
|
103
|
+
Full documentation is available at: **https://sisinflab.github.io/DataRec/**
|
|
104
|
+
Includes API reference, guides, tutorials, and dataset overview.
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Contributing 🤝
|
|
109
|
+
|
|
110
|
+
Contributions are welcome!
|
|
111
|
+
To contribute:
|
|
112
|
+
1. Create a feature/fix branch.
|
|
113
|
+
2. Add tests and documentation updates as needed.
|
|
114
|
+
3. Run tests before pushing.
|
|
115
|
+
4. Open a pull request describing your changes clearly.
|
|
116
|
+
|
|
117
|
+
> The project also receives updates from a **private development repository** maintained by SisInfLab.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Citation 📖
|
|
122
|
+
|
|
123
|
+
If you use DataRec in your research, please cite our SIGIR 2025 paper:
|
|
124
|
+
|
|
125
|
+
```bibtex
|
|
126
|
+
@inproceedings{DBLP:conf/sigir/MancinoBF0MPN25,
|
|
127
|
+
author = {Alberto Carlo Maria Mancino and
|
|
128
|
+
Salvatore Bufi and
|
|
129
|
+
Angela Di Fazio and
|
|
130
|
+
Antonio Ferrara and
|
|
131
|
+
Daniele Malitesta and
|
|
132
|
+
Claudio Pomo and
|
|
133
|
+
Tommaso Di Noia},
|
|
134
|
+
title = {DataRec: {A} Python Library for Standardized and Reproducible Data
|
|
135
|
+
Management in Recommender Systems},
|
|
136
|
+
booktitle = {{SIGIR}},
|
|
137
|
+
pages = {3478--3487},
|
|
138
|
+
publisher = {{ACM}},
|
|
139
|
+
year = {2025}
|
|
140
|
+
}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Authors and Contributors 👥
|
|
146
|
+
|
|
147
|
+
**Authors**
|
|
148
|
+
- Alberto Carlo Maria Mancino (Politecnico di Bari)
|
|
149
|
+
- Salvatore Bufi
|
|
150
|
+
- Angela Di Fazio
|
|
151
|
+
- Daniele Malitesta
|
|
152
|
+
- Antonio Ferrara
|
|
153
|
+
- Claudio Pomo
|
|
154
|
+
- Tommaso Di Noia
|
|
155
|
+
|
|
156
|
+
### Contributors
|
|
157
|
+
|
|
158
|
+
| | | | |
|
|
159
|
+
|:--:|:--:|:--:|:--:|
|
|
160
|
+
| [<img src="https://avatars.githubusercontent.com/AlbertoMancino" width="70px"><br>**Alberto C. M. Mancino**](https://github.com/AlbertoMancino) | [<img src="https://avatars.githubusercontent.com/a-difazio" width="70px"><br>**Angela Di Fazio**](https://github.com/a-difazio) | [<img src="https://avatars.githubusercontent.com/salvatore-bufi" width="70px"><br>**Salvatore Bufi**](https://github.com/salvatore-bufi) | [<img src="https://avatars.githubusercontent.com/GiuseppeFasano" width="70px"><br>**Giuseppe Fasano**](https://github.com/GiuseppeFasano) |
|
|
161
|
+
| [<img src="https://avatars.githubusercontent.com/GianLu210" width="70px"><br>**Gianluca Colonna**](https://github.com/GianLu210) | [<img src="https://avatars.githubusercontent.com/MariaLuigiaN" width="70px"><br>**Maria L. N. De Bonis**](https://github.com/MariaLuigiaN) | [<img src="https://avatars.githubusercontent.com/Marco-Valentini" width="70px"><br>**Marco Valentini**](https://github.com/Marco-Valentini) | |
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## Related Projects 🧩
|
|
165
|
+
|
|
166
|
+
- **Ducho** — library for multimodal representation learning: https://github.com/sisinflab/Ducho
|
|
167
|
+
- **D&D4Rec Tutorial (RecSys 2025)** — *Standard Practices for Data Processing and Multimodal Feature Extraction in Recommendation with DataRec and Ducho*:
|
|
168
|
+
https://sites.google.com/view/dd4rec-tutorial/home
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## License 📜
|
|
173
|
+
Distributed under the **MIT License**.
|
|
174
|
+
See [LICENSE](LICENSE).
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
<p align="center">
|
|
179
|
+
<em>Maintained with ❤️ by <a href="https://github.com/sisinflab">SisInfLab</a></em>
|
|
180
|
+
</p>
|
|
181
|
+
<p align="center">
|
|
182
|
+
<img src="docs/assets/images/sisinflab_logo.png" width="100" alt="SisInfLab Logo">
|
|
183
|
+
</p>
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from .io.rawdata import RawData
|
|
2
|
+
from .data.dataset import DataRec, from_pickle
|
|
3
|
+
from datarec.registry.utils import available_datasets, print_available_datasets
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from .io.cache import (
|
|
7
|
+
cache_summary,
|
|
8
|
+
cache_dir,
|
|
9
|
+
set_cache_dir,
|
|
10
|
+
reset_cache_dir,
|
|
11
|
+
clear_cache,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# Names exported by the package root; every entry is imported at the top
# of this module.
__all__ = [
    # Core classes
    "DataRec",
    "from_pickle",
    "RawData",
    # Cache management
    "cache_summary",
    "cache_dir",
    "set_cache_dir",
    "reset_cache_dir",
    "clear_cache",
    # Dataset registry
    "available_datasets",
    "print_available_datasets",
]
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import TYPE_CHECKING, Callable, Dict
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING: # avoids circular import at runtime
|
|
8
|
+
from datarec.data import DataRec
|
|
9
|
+
|
|
10
|
+
# Registry mapping a characteristic name to the function that computes it.
CHARACTERISTICS: Dict[str, Callable[..., float]] = {}


def characteristic(func=None, *, name=None):
    """
    Registers a function in ``CHARACTERISTICS``.

    Usable both as a bare decorator (``@characteristic``) and with a
    keyword argument (``@characteristic(name="alias")``).

    Args:
        func (Callable): The function to register. When omitted, a
            decorator that performs the registration is returned instead.
        name (str): Registry key to use; falls back to ``func.__name__``
            when not given (or falsy).

    Returns:
        (Callable): ``func`` itself, unchanged, or a one-argument decorator
            when ``func`` is ``None``.

    Raises:
        ValueError: If the registry key is already present.
    """
    if func is not None:
        registry_key = name or func.__name__
        if registry_key in CHARACTERISTICS:
            raise ValueError(f"{registry_key} already registered")
        CHARACTERISTICS[registry_key] = func
        return func

    # Called as ``@characteristic(name=...)``: defer until the function arrives.
    def _bind(target):
        return characteristic(target, name=name)

    return _bind
|
|
22
|
+
|
|
23
|
+
@characteristic
def n_users(dr: DataRec, scale_factor: int = 1000):
    """
    Counts the distinct users present in the dataset.

    Args:
        dr (DataRec): Dataset exposing ``data`` and ``user_col``.
        scale_factor (int): Not used by this function.

    Returns:
        (int): Number of unique values in the user column of ``dr.data``.
    """
    user_ids = dr.data[dr.user_col]
    return int(user_ids.nunique())
|
|
29
|
+
|
|
30
|
+
@characteristic
def n_items(dr: DataRec, scale_factor: int = 1000):
    """
    Counts the distinct items present in the dataset.

    Args:
        dr (DataRec): Dataset exposing ``data`` and ``item_col``.
        scale_factor (int): Not used by this function.

    Returns:
        (int): Number of unique values in the item column of ``dr.data``.
    """
    item_ids = dr.data[dr.item_col]
    return int(item_ids.nunique())
|
|
36
|
+
|
|
37
|
+
@characteristic
def n_interactions(dr: DataRec, scale_factor: int = 1000):
    """
    Counts the entries stored in the dataset.

    Args:
        dr (DataRec): Dataset exposing ``data``.
        scale_factor (int): Not used by this function.

    Returns:
        (int): Length of ``dr.data``.
    """
    interaction_count = len(dr.data)
    return int(interaction_count)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@characteristic
def space_size(dr: DataRec, scale_factor: int = 1000):
    """
    Computes the scaled square root of the user-item interaction space.

    Args:
        dr (DataRec): Dataset exposing ``n_users`` and ``n_items``.
        scale_factor (int): Divisor applied to the square root.

    Returns:
        (float): ``sqrt(n_users * n_items) / scale_factor``.
    """
    interaction_space = dr.n_users * dr.n_items
    root = math.sqrt(interaction_space)
    return root / scale_factor
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@characteristic
def space_size_log(dr: DataRec):
    """
    Computes the base-10 logarithm of ``space_size``.

    ``space_size`` is called with its default ``scale_factor``.

    Args:
        dr (DataRec): Dataset forwarded to ``space_size``.

    Returns:
        (float): ``log10(space_size(dr))``.
    """
    size = space_size(dr)
    return math.log10(size)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@characteristic
def shape(dr: DataRec):
    """
    Computes the shape of the interaction matrix as a user/item ratio.

    Args:
        dr (DataRec): Dataset exposing ``n_users`` and ``n_items``.

    Returns:
        (float): ``n_users / n_items``.
    """
    users, items = dr.n_users, dr.n_items
    return users / items
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@characteristic
def shape_log(dr: DataRec):
    """
    Computes the base-10 logarithm of ``shape``.

    Args:
        dr (DataRec): Dataset forwarded to ``shape``.

    Returns:
        (float): ``log10(shape(dr))``.
    """
    ratio = shape(dr)
    return math.log10(ratio)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@characteristic
def density(dr: DataRec):
    """
    Computes the density of the user-item interaction matrix.

    Args:
        dr (DataRec): Dataset exposing ``transactions``, ``n_users`` and
            ``n_items``.

    Returns:
        (float): ``transactions / (n_users * n_items)``.
    """
    observed = dr.transactions
    matrix_cells = dr.n_users * dr.n_items
    return observed / matrix_cells
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@characteristic
def density_log(dr: DataRec):
    """
    Computes the base-10 logarithm of ``density``.

    Args:
        dr (DataRec): Dataset forwarded to ``density``.

    Returns:
        (float): ``log10(density(dr))``.
    """
    fill_ratio = density(dr)
    return math.log10(fill_ratio)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def gini(x):
    """
    Calculates the Gini coefficient for a numpy array.

    Args:
        x (np.ndarray): An array (or array-like) of non-negative values.

    Returns:
        (float): The Gini coefficient, a measure of inequality. ``0.0`` is
            returned when ``x`` is empty or sums to zero, where the ratio
            would otherwise be an undefined 0/0.
    """
    # Work in float64 so the weighted sum cannot overflow narrow integer dtypes.
    values = np.sort(np.asarray(x, dtype=np.float64))  # O(n log n)
    n = len(values)
    total = values.sum()
    if n == 0 or total == 0:
        # Avoid dividing by zero (previously produced nan plus a RuntimeWarning).
        return 0.0
    ranks = np.arange(1, n + 1)
    # Rank-weighted form of the Gini coefficient over the ascending values.
    return float(np.sum((2 * ranks - n - 1) * values) / (n * total))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@characteristic
def gini_item(dr: DataRec):
    """
    Computes the Gini coefficient for item popularity.

    Args:
        dr (DataRec): Dataset exposing the ``sorted_items`` mapping, whose
            values are passed to ``gini``.

    Returns:
        (float): Gini coefficient of the values in ``dr.sorted_items``.
    """
    item_counts = list(dr.sorted_items.values())
    return gini(np.array(item_counts))
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@characteristic
def gini_user(dr: DataRec):
    """
    Computes the Gini coefficient for user activity.

    Args:
        dr (DataRec): Dataset exposing the ``sorted_users`` mapping, whose
            values are passed to ``gini``.

    Returns:
        (float): Gini coefficient of the values in ``dr.sorted_users``.
    """
    user_counts = list(dr.sorted_users.values())
    return gini(np.array(user_counts))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@characteristic
def ratings_per_user(dr: DataRec):
    """
    Computes the average number of ratings per user.

    Args:
        dr (DataRec): Dataset exposing ``transactions`` and ``n_users``.

    Returns:
        (float): ``transactions / n_users``.
    """
    total_ratings = dr.transactions
    user_count = dr.n_users
    return total_ratings / user_count
|
|
131
|
+
|
|
132
|
+
@characteristic
def ratings_per_item(dr: DataRec):
    """
    Computes the average number of ratings per item.

    Args:
        dr (DataRec): Dataset exposing ``transactions`` and ``n_items``.

    Returns:
        (float): ``transactions / n_items``.
    """
    total_ratings = dr.transactions
    item_count = dr.n_items
    return total_ratings / item_count
|
|
138
|
+
|