sm-rs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sm_rs-0.1.0/.github/workflows/ci.yml +24 -0
- sm_rs-0.1.0/.gitignore +11 -0
- sm_rs-0.1.0/CITATION.cff +46 -0
- sm_rs-0.1.0/DATASET_CARD.md +212 -0
- sm_rs-0.1.0/LICENSE +21 -0
- sm_rs-0.1.0/PKG-INFO +202 -0
- sm_rs-0.1.0/README.md +172 -0
- sm_rs-0.1.0/examples/reproduce_perceived.py +97 -0
- sm_rs-0.1.0/pyproject.toml +52 -0
- sm_rs-0.1.0/src/smrs/__init__.py +20 -0
- sm_rs-0.1.0/src/smrs/build.py +117 -0
- sm_rs-0.1.0/src/smrs/data.py +121 -0
- sm_rs-0.1.0/src/smrs/derived.py +119 -0
- sm_rs-0.1.0/src/smrs/fetch.py +146 -0
- sm_rs-0.1.0/src/smrs/metrics.py +113 -0
- sm_rs-0.1.0/src/smrs/tasks/__init__.py +34 -0
- sm_rs-0.1.0/src/smrs/tasks/task1_propensity.py +89 -0
- sm_rs-0.1.0/src/smrs/tasks/task2_proportionality.py +158 -0
- sm_rs-0.1.0/src/smrs/tasks/task3_reranking.py +39 -0
- sm_rs-0.1.0/src/smrs/tasks/task4_diversity.py +38 -0
- sm_rs-0.1.0/src/smrs/tasks/task5_perceived.py +39 -0
- sm_rs-0.1.0/src/smrs/tasks/task6_satisfaction.py +39 -0
- sm_rs-0.1.0/tests/test_data_local.py +36 -0
- sm_rs-0.1.0/tests/test_derived.py +80 -0
- sm_rs-0.1.0/tests/test_fetch.py +50 -0
- sm_rs-0.1.0/tests/test_metrics.py +44 -0
- sm_rs-0.1.0/tests/test_task1.py +33 -0
- sm_rs-0.1.0/tests/test_tasks_2_to_6.py +117 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.9", "3.11", "3.13"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
cache: pip
|
|
20
|
+
cache-dependency-path: pyproject.toml
|
|
21
|
+
- name: Install
|
|
22
|
+
run: python -m pip install -e ".[dev]"
|
|
23
|
+
- name: Test
|
|
24
|
+
run: python -m pytest -q
|
sm_rs-0.1.0/.gitignore
ADDED
sm_rs-0.1.0/CITATION.cff
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use the SM-RS dataset or benchmark, please cite BOTH papers below."
|
|
3
|
+
title: "SM-RS benchmark"
|
|
4
|
+
abstract: >-
|
|
5
|
+
Data loaders and canonical task evaluators for SM-RS, the single- and
|
|
6
|
+
multi-objective recommendations dataset linking self-declared propensities with
|
|
7
|
+
contextual impressions, item selections, and perceived quality.
|
|
8
|
+
# Authors of *this software* (the benchmark code). The dataset's authors are in
|
|
9
|
+
# the citations below — please cite the papers.
|
|
10
|
+
authors:
|
|
11
|
+
- family-names: Dokoupil
|
|
12
|
+
given-names: Patrik
|
|
13
|
+
repository-code: "https://github.com/pdokoupil/SM-RS"
|
|
14
|
+
license: MIT
|
|
15
|
+
type: software
|
|
16
|
+
# Primary citation: the journal (SM-RS 2.0) superset.
|
|
17
|
+
preferred-citation:
|
|
18
|
+
type: article
|
|
19
|
+
title: "SM-RS 2.0: User-perceived Qualities of Single- and Multi-Objective Recommender Systems"
|
|
20
|
+
authors:
|
|
21
|
+
- family-names: Dokoupil
|
|
22
|
+
given-names: Patrik
|
|
23
|
+
- family-names: Peska
|
|
24
|
+
given-names: Ladislav
|
|
25
|
+
year: 2026
|
|
26
|
+
doi: "10.1145/3754459"
|
|
27
|
+
journal: "ACM Transactions on Recommender Systems"
|
|
28
|
+
publisher:
|
|
29
|
+
name: "Association for Computing Machinery"
|
|
30
|
+
volume: 4
|
|
31
|
+
issue: 3
|
|
32
|
+
# Also cite the original SM-RS (SIGIR'24).
|
|
33
|
+
references:
|
|
34
|
+
- type: conference-paper
|
|
35
|
+
title: "SM-RS: Single- and Multi-Objective Recommendations with Contextual Impressions and Beyond-Accuracy Propensity Scores"
|
|
36
|
+
authors:
|
|
37
|
+
- family-names: Dokoupil
|
|
38
|
+
given-names: Patrik
|
|
39
|
+
- family-names: Peska
|
|
40
|
+
given-names: Ladislav
|
|
41
|
+
- family-names: Boratto
|
|
42
|
+
given-names: Ludovico
|
|
43
|
+
year: 2024
|
|
44
|
+
doi: "10.1145/3626772.3657863"
|
|
45
|
+
collection-title: "Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval"
|
|
46
|
+
collection-type: proceedings
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
---
|
|
2
|
+
# Hugging Face dataset card frontmatter (draft). Paste as README.md in the HF dataset repo.
|
|
3
|
+
pretty_name: "SM-RS: Single- and Multi-Objective Recommendations"
|
|
4
|
+
license: cc-by-4.0
|
|
5
|
+
language: [en]
|
|
6
|
+
task_categories: [other]
|
|
7
|
+
tags:
|
|
8
|
+
- recommender-systems
|
|
9
|
+
- multi-objective
|
|
10
|
+
- beyond-accuracy
|
|
11
|
+
- impressions
|
|
12
|
+
- propensity
|
|
13
|
+
- user-study
|
|
14
|
+
- benchmark
|
|
15
|
+
configs:
|
|
16
|
+
- config_name: behaviors
|
|
17
|
+
data_files: behaviors.parquet
|
|
18
|
+
- config_name: propensities
|
|
19
|
+
data_files: propensities.parquet
|
|
20
|
+
- config_name: objective_perceptions
|
|
21
|
+
data_files: objective_perceptions.parquet
|
|
22
|
+
- config_name: criteria_values
|
|
23
|
+
data_files: criteria_values.parquet
|
|
24
|
+
- config_name: comparative_diversity
|
|
25
|
+
data_files: comparative_diversity.parquet
|
|
26
|
+
- config_name: users
|
|
27
|
+
data_files: users.parquet
|
|
28
|
+
- config_name: movies
|
|
29
|
+
data_files: movies.parquet
|
|
30
|
+
- config_name: books
|
|
31
|
+
data_files: books.parquet
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
# SM-RS — Single- and Multi-Objective Recommendations
|
|
35
|
+
|
|
36
|
+
**SM-RS 2.0** — the extended version of SM-RS (SIGIR'24), and a **superset** of v1.
|
|
37
|
+
To our knowledge, it is the only public RecSys dataset linking users' **self-declared
|
|
38
|
+
propensities** toward beyond-accuracy objectives (relevance, diversity, novelty,
|
|
39
|
+
exploration) with **contextual impressions**, item selections, and **explicit
|
|
40
|
+
perceived-quality** judgments — across two domains (movies, books).
|
|
41
|
+
|
|
42
|
+
> **If you use this dataset, please cite BOTH papers** (SM-RS 2.0, TORS 2026; and
|
|
43
|
+
> SM-RS, SIGIR 2024) — see [Citation](#citation).
|
|
44
|
+
|
|
45
|
+
- 📦 **Benchmark code (loaders + canonical evaluators):** https://github.com/pdokoupil/SM-RS
|
|
46
|
+
- 🧊 **Reproducibility archive:** OSF — https://osf.io/wsakx (v2), https://osf.io/hkzje (v1)
|
|
47
|
+
|
|
48
|
+
## Quick start
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from datasets import load_dataset
|
|
52
|
+
behaviors = load_dataset("pdokoupil/SM-RS", "behaviors")
|
|
53
|
+
# or use the benchmark package: pip install sm-rs
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## What's inside
|
|
57
|
+
|
|
58
|
+
### Core tables
|
|
59
|
+
| Config | Rows | Description |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `behaviors` | ~10k | impression logs: `userId`, `timestamp`, `history`, `impressions` (`itemId`/`selected`/`timestamp`), `mors`, `iteration` |
|
|
62
|
+
| `propensities` | — | per-user self-declared `relevance` / `diversity` / `novelty` / `exploration` propensities (+ `userId`, `timestamp`) |
|
|
63
|
+
| `objective_perceptions` | — | per-impression perceived `relevance`/`diversity`/`novelty`/`exploration` (+ `impressionsIds`) |
|
|
64
|
+
| `criteria_values` | — | computed objective values per impression |
|
|
65
|
+
| `comparative_diversity` | — | triplet judgments (`list1`/`list2`/`list3`, `perceived_diversities`, `domain`) |
|
|
66
|
+
| `users` | — | demographics + `domain` (movies/books) |
|
|
67
|
+
| `movies`, `books` | — | item id maps |
|
|
68
|
+
|
|
69
|
+
### Derived matrices — recomputed locally, **not** hosted here
|
|
70
|
+
Scoring recommendation-list tasks (2, 3) needs item-item (relevance), distance
|
|
71
|
+
(diversity), and popularity (novelty) matrices. These are **not distributed** —
|
|
72
|
+
neither in this dataset repository nor in the benchmark package — because they derive from third-party catalogs
|
|
73
|
+
(MovieLens, goodbooks-10k) whose terms restrict redistribution, and because they'd
|
|
74
|
+
add gigabytes. Instead the [benchmark package](https://github.com/pdokoupil/SM-RS)
|
|
75
|
+
**recomputes** them from a rating matrix you build from your own download of those
|
|
76
|
+
public datasets (`smrs.derived`: EASE^R item-item, 1−cosine distance, frequency
|
|
77
|
+
popularity — all deterministic). Bit-exact originals are archived on OSF.
|
|
78
|
+
|
|
79
|
+
## Tasks (benchmark)
|
|
80
|
+
|
|
81
|
+
Six tasks share this dataset; each has a canonical `evaluate()` in the
|
|
82
|
+
[benchmark repo](https://github.com/pdokoupil/SM-RS) (`smrs.tasks.*`):
|
|
83
|
+
|
|
84
|
+
| # | Task | Module | Metrics |
|
|
85
|
+
|---|------|--------|---------|
|
|
86
|
+
| 1 | Propensity estimation | `task1_propensity` | MAE · MSE · KL |
|
|
87
|
+
| 2 | Results proportionality | `task2_proportionality` | MAE · KL · wSUM · Pearson ρ |
|
|
88
|
+
| 3 | Selections-aware reranking | `task3_reranking` | nDCG@10 · Precision@5 |
|
|
89
|
+
| 4 | Diversity-metric definition | `task4_diversity` | MAE · MSE · KL |
|
|
90
|
+
| 5 | Perceived quality (5.1 rel / 5.2 div / 5.3 nov / 5.4 ser) | `task5_perceived` | MAE · MSE · Kendall τ |
|
|
91
|
+
| 6 | Satisfaction (6.1 / 6.2) | `task6_satisfaction` | MAE · MSE · Kendall τ |
|
|
92
|
+
|
|
93
|
+
## Leaderboard
|
|
94
|
+
|
|
95
|
+
Self-service: run the canonical `evaluate()` and open a PR adding your row (with a
|
|
96
|
+
repro link). The paper baselines below are the rows to beat (from SM-RS 2.0, TORS
|
|
97
|
+
2026, Tables 2–4).
|
|
98
|
+
|
|
99
|
+
### Task 1 — Propensity estimation · MAE / MSE / KL (↓)
|
|
100
|
+
| Method | MAE | MSE | KL |
|
|
101
|
+
|---|---|---|---|
|
|
102
|
+
| Locally normalized | 0.150 | 0.040 | 0.439 |
|
|
103
|
+
| Globally normalized | 0.149 | 0.039 | 0.271 |
|
|
104
|
+
|
|
105
|
+
### Task 2 — Results proportionality · MAE↓ / KL↓ / wSUM↑ / ρ↑
|
|
106
|
+
| Method | MAE | KL | wSUM | ρ |
|
|
107
|
+
|---|---|---|---|---|
|
|
108
|
+
| FAI | 0.181 | 0.374 | 0.568 | −0.033 |
|
|
109
|
+
| Probabilistic FAI | 0.141 | 0.242 | 0.568 | 0.318 |
|
|
110
|
+
| Incremental Weighted Average | 0.145 | 0.245 | 0.580 | 0.297 |
|
|
111
|
+
| **RLProp** | **0.114** | **0.181** | 0.549 | **0.356** |
|
|
112
|
+
| GreedyLM | 0.154 | 0.278 | **0.582** | 0.256 |
|
|
113
|
+
| DQNMORS | 0.230 | 0.869 | 0.447 | 0.013 |
|
|
114
|
+
|
|
115
|
+
### Task 3 — Selections-aware reranking · nDCG@10 / Precision@5 (↑)
|
|
116
|
+
| Method | nDCG@10 | Precision@5 |
|
|
117
|
+
|---|---|---|
|
|
118
|
+
| **Popularity based** | **0.406** | **0.244** |
|
|
119
|
+
| EASE^R | 0.368 | 0.225 |
|
|
120
|
+
| VAE | 0.292 | 0.174 |
|
|
121
|
+
| MultVAE | 0.328 | 0.189 |
|
|
122
|
+
| ImplicitMF | 0.337 | 0.210 |
|
|
123
|
+
| UserKNN | 0.367 | 0.224 |
|
|
124
|
+
| Identity (no reranking) | 0.215 | 0.131 |
|
|
125
|
+
|
|
126
|
+
### Task 4 — Diversity-metric definition · MAE / MSE / KL (↓)
|
|
127
|
+
| Method | MAE | MSE | KL |
|
|
128
|
+
|---|---|---|---|
|
|
129
|
+
| CF-ILD | 0.470 | 0.316 | 0.279 |
|
|
130
|
+
| CB-ILD | 0.292 | 0.116 | 0.282 |
|
|
131
|
+
| CF-ILMD | 0.409 | 0.246 | **0.272** |
|
|
132
|
+
| **CB-ILMD** | **0.287** | **0.114** | 0.278 |
|
|
133
|
+
| BIN-DIV | 0.294 | 0.127 | 0.288 |
|
|
134
|
+
|
|
135
|
+
### Task 5 — Perceived quality · MAE↓ / MSE↓ / Kendall τ↑
|
|
136
|
+
|
|
137
|
+
**5.1 relevance**
|
|
138
|
+
| Method | MAE | MSE | τ |
|
|
139
|
+
|---|---|---|---|
|
|
140
|
+
| Popularity | 0.205 | 0.079 | 0.228 |
|
|
141
|
+
| Relevance-based | 0.191 | 0.063 | **0.263** |
|
|
142
|
+
| Linear Regression | 0.235 | 0.085 | 0.080 |
|
|
143
|
+
| LSTM | **0.189** | **0.061** | 0.022 |
|
|
144
|
+
|
|
145
|
+
**5.2 diversity**
|
|
146
|
+
| Method | MAE | MSE | τ |
|
|
147
|
+
|---|---|---|---|
|
|
148
|
+
| CF-ILD | 0.270 | 0.116 | 0.171 |
|
|
149
|
+
| CB-ILD | 0.255 | 0.081 | −0.004 |
|
|
150
|
+
| CF-ILMD | 0.227 | 0.074 | 0.146 |
|
|
151
|
+
| CB-ILMD | 0.346 | 0.170 | −0.025 |
|
|
152
|
+
| BIN-DIV | 0.421 | 0.242 | 0.069 |
|
|
153
|
+
| Linear Regression | 0.222 | **0.061** | **0.196** |
|
|
154
|
+
| LSTM | **0.218** | 0.066 | 0.071 |
|
|
155
|
+
|
|
156
|
+
**5.3 novelty**
|
|
157
|
+
| Method | MAE | MSE | τ |
|
|
158
|
+
|---|---|---|---|
|
|
159
|
+
| Popularity complement | 0.369 | 0.200 | **0.233** |
|
|
160
|
+
| Linear Regression | **0.259** | 0.104 | 0.143 |
|
|
161
|
+
| LSTM | 0.261 | **0.089** | −0.008 |
|
|
162
|
+
|
|
163
|
+
**5.4 serendipity**
|
|
164
|
+
| Method | MAE | MSE | τ |
|
|
165
|
+
|---|---|---|---|
|
|
166
|
+
| CF-Unexpectedness | 0.303 | 0.142 | **0.128** |
|
|
167
|
+
| CB-Unexpectedness | **0.250** | **0.077** | 0.054 |
|
|
168
|
+
| CF-EASE | 0.283 | 0.117 | 0.068 |
|
|
169
|
+
| CB-EASE | 0.348 | 0.179 | 0.040 |
|
|
170
|
+
| 1−PMI | 0.336 | 0.170 | 0.068 |
|
|
171
|
+
| Linear Regression | 0.270 | 0.103 | 0.039 |
|
|
172
|
+
| LSTM | 0.261 | 0.093 | −0.031 |
|
|
173
|
+
|
|
174
|
+
### Task 6 — Satisfaction · MAE↓ / MSE↓ / Kendall τ↑
|
|
175
|
+
| Method | MAE | MSE | τ | Subtask |
|
|
176
|
+
|---|---|---|---|---|
|
|
177
|
+
| Popularity based | 0.233 | 0.100 | 0.156 | 6.1 |
|
|
178
|
+
| Relevance based | 0.223 | 0.083 | 0.145 | 6.1 |
|
|
179
|
+
| Linear Regression | 0.255 | 0.102 | 0.045 | 6.1 |
|
|
180
|
+
| LSTM | 0.210 | 0.072 | −0.023 | 6.1 |
|
|
181
|
+
| **LSTM Combined** | **0.138** | **0.035** | **0.515** | 6.2 |
|
|
182
|
+
|
|
183
|
+
> The `Linear Regression` and popularity/relevance baselines are reproduced by the
|
|
184
|
+
> benchmark package (`examples/reproduce_perceived.py`) — MAE matches the paper.
|
|
185
|
+
|
|
186
|
+
## What's new vs. SM-RS 1.0 / getting the v1 subset
|
|
187
|
+
SM-RS 2.0 is a superset of the SIGIR'24 release (adds perceived-quality judgments
|
|
188
|
+
and more tasks). The v1 subset is recoverable from the same tables — see the
|
|
189
|
+
benchmark repo for the column mapping.
|
|
190
|
+
|
|
191
|
+
## Licensing
|
|
192
|
+
Collected study data (this dataset): **CC-BY-4.0** — free to use with attribution
|
|
193
|
+
(cite both papers). The dataset references catalog items **by ID only** and does
|
|
194
|
+
**not** redistribute MovieLens or goodbooks-10k. Derived matrices are recomputed by
|
|
195
|
+
the benchmark from your own copies of those datasets, obtained under **their**
|
|
196
|
+
licenses (MovieLens: no redistribution; goodbooks-10k: per its repository license).
|
|
197
|
+
|
|
198
|
+
## Citation
|
|
199
|
+
```bibtex
|
|
200
|
+
@article{dokoupil2026smrs2,
|
|
201
|
+
author = {Dokoupil, Patrik and Peska, Ladislav},
|
|
202
|
+
title = {SM-RS 2.0: User-perceived Qualities of Single- and Multi-Objective Recommender Systems},
|
|
203
|
+
journal = {ACM Transactions on Recommender Systems}, volume = {4}, number = {3}, year = {2026},
|
|
204
|
+
doi = {10.1145/3754459}
|
|
205
|
+
}
|
|
206
|
+
@inproceedings{dokoupil2024smrs,
|
|
207
|
+
author = {Dokoupil, Patrik and Peska, Ladislav and Boratto, Ludovico},
|
|
208
|
+
title = {SM-RS: Single- and Multi-Objective Recommendations with Contextual Impressions and Beyond-Accuracy Propensity Scores},
|
|
209
|
+
booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
|
|
210
|
+
series = {SIGIR '24}, pages = {988--995}, year = {2024}, doi = {10.1145/3626772.3657863}
|
|
211
|
+
}
|
|
212
|
+
```
|
sm_rs-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Patrik Dokoupil
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
sm_rs-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sm-rs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: SM-RS benchmark: data loaders and canonical task evaluators for the single- and multi-objective recommendations dataset.
|
|
5
|
+
Project-URL: Homepage, https://github.com/pdokoupil/SM-RS
|
|
6
|
+
Project-URL: Repository, https://github.com/pdokoupil/SM-RS
|
|
7
|
+
Project-URL: Dataset, https://huggingface.co/datasets/pdokoupil/SM-RS
|
|
8
|
+
Project-URL: Paper (TORS 2026), https://doi.org/10.1145/3754459
|
|
9
|
+
Project-URL: Paper (SIGIR 2024), https://doi.org/10.1145/3626772.3657863
|
|
10
|
+
Author: Patrik Dokoupil
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: benchmark,beyond-accuracy,dataset,impressions,multi-objective,propensity,recommender-systems
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: huggingface-hub>=0.20
|
|
21
|
+
Requires-Dist: numpy>=1.21
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Requires-Dist: pyarrow>=12
|
|
24
|
+
Requires-Dist: scikit-learn>=1.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
27
|
+
Provides-Extra: lstm
|
|
28
|
+
Requires-Dist: tensorflow>=2.10; extra == 'lstm'
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
<div align="center">
|
|
32
|
+
|
|
33
|
+
# SM-RS
|
|
34
|
+
|
|
35
|
+
**The single- and multi-objective recommendations benchmark** — self-declared user
|
|
36
|
+
propensities (relevance · diversity · novelty · exploration) linked to contextual
|
|
37
|
+
impressions, item selections, and perceived quality.
|
|
38
|
+
|
|
39
|
+
<!-- Badges light up once the repo is pushed and the package is published. -->
|
|
40
|
+
[![PyPI](https://img.shields.io/pypi/v/sm-rs)](https://pypi.org/project/sm-rs/)
|
|
41
|
+
[![CI](https://github.com/pdokoupil/SM-RS/actions/workflows/ci.yml/badge.svg)](https://github.com/pdokoupil/SM-RS/actions/workflows/ci.yml)
|
|
42
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
|
|
43
|
+
[![Dataset on Hugging Face](https://img.shields.io/badge/dataset-Hugging%20Face-yellow)](https://huggingface.co/datasets/pdokoupil/SM-RS)
|
|
44
|
+
|
|
45
|
+
</div>
|
|
46
|
+
|
|
47
|
+
SM-RS is, to our knowledge, the only public recommender-systems dataset linking
|
|
48
|
+
users' **self-declared propensities** toward beyond-accuracy objectives with
|
|
49
|
+
**contextual impressions**, item selections, and **explicit perceived-quality**
|
|
50
|
+
judgments. This repository is the **benchmark code**: data loaders and the
|
|
51
|
+
*canonical* evaluator for each task, so everyone reports comparable numbers. The
|
|
52
|
+
**data** lives on Hugging Face; the **leaderboard** lives in the dataset card.
|
|
53
|
+
|
|
54
|
+
> **Dataset & leaderboard:** 🤗 [pdokoupil/SM-RS](https://huggingface.co/datasets/pdokoupil/SM-RS)
|
|
55
|
+
> **Cite BOTH** the SM-RS 2.0 (TORS'26) and SM-RS (SIGIR'24) papers — see [Citing](#citing).
|
|
56
|
+
|
|
57
|
+
## Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install sm-rs # core (numpy / pandas / scikit-learn)
|
|
61
|
+
pip install "sm-rs[lstm]" # + TensorFlow, for the optional LSTM baseline
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Quick start (Task 1: propensity estimation)
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import numpy as np
|
|
68
|
+
from smrs.tasks import task1_propensity as t1
|
|
69
|
+
|
|
70
|
+
# reproducible 80/20 split (seed 2024), reading a local SM-RS copy for now
|
|
71
|
+
X_train, X_test, y_train, y_test = t1.split(data_dir="path/to/sm-rs")
|
|
72
|
+
|
|
73
|
+
# ... train your estimator, produce an (N, 4) array of [rel, div, nov, exp] ...
|
|
74
|
+
predictions = np.full((len(y_test), 4), 0.25) # placeholder
|
|
75
|
+
|
|
76
|
+
print(t1.evaluate(predictions, y_test)) # {'MAE': ..., 'MSE': ..., 'KLDiv': ...}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## The six tasks
|
|
80
|
+
|
|
81
|
+
All six draw on the **same** dataset; they differ in what you predict and how it's
|
|
82
|
+
scored. The canonical evaluator for each lives in `smrs.tasks.*`.
|
|
83
|
+
|
|
84
|
+
| # | Task | You produce | Metric(s) | Evaluator |
|
|
85
|
+
|---|------|-------------|-----------|-----------|
|
|
86
|
+
| 1 | **Propensity estimation** | a 4-vector propensity per user | MAE · MSE · KL | ✅ `task1_propensity` |
|
|
87
|
+
| 2 | **Results proportionality** | a top-k list matching target propensities | MAE · KL · wSUM · Pearson ρ | ✅ `task2_proportionality`¹ |
|
|
88
|
+
| 3 | **Selections-aware reranking** | a reranked impression list | nDCG@10 · Precision@5 | ✅ `task3_reranking` |
|
|
89
|
+
| 4 | **Diversity-metric definition** | per-list diversity values | MAE · MSE · KL | ✅ `task4_diversity` |
|
|
90
|
+
| 5 | **Perceived quality** (5.1 rel / 5.2 div / 5.3 nov / 5.4 ser) | per-objective perception | MAE · MSE · Kendall τ | ✅ `task5_perceived` |
|
|
91
|
+
| 6 | **Satisfaction** (6.1 / 6.2) | overall satisfaction | MAE · MSE · Kendall τ | ✅ `task6_satisfaction` |
|
|
92
|
+
|
|
93
|
+
¹ Task 2's *metric layer* is implemented; turning a top-k list into achieved
|
|
94
|
+
objective proportions needs the derived matrices (see Data) and lands with the
|
|
95
|
+
source→rating-matrix builder.
|
|
96
|
+
|
|
97
|
+
## Data
|
|
98
|
+
|
|
99
|
+
Two layers:
|
|
100
|
+
|
|
101
|
+
1. **Core tables** (the collected study data, **CC-BY-4.0**, hosted on Hugging Face):
|
|
102
|
+
`behaviors`, `propensities`, `objective_perceptions`, `criteria_values`,
|
|
103
|
+
`comparative_diversity`, `users`, `movies`, `books`. Items are referenced by ID.
|
|
104
|
+
|
|
105
|
+
**Auto-downloaded** from the Hub and cached — no manual download:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from smrs import data
|
|
109
|
+
df = data.load("propensities") # downloads from HF, cached
|
|
110
|
+
# offline / local copy (e.g. OSF download):
|
|
111
|
+
df = data.load("propensities", data_dir="path/to/sm-rs") # or set $SMRS_DATA_DIR
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Users of the 🤗 `datasets` library can equivalently do
|
|
115
|
+
`load_dataset("pdokoupil/SM-RS", "behaviors")`.
|
|
116
|
+
|
|
117
|
+
2. **Derived matrices** — *recomputed locally, not downloaded.* The list-scoring
|
|
118
|
+
tasks (2, 3) need per-item / per-pair artifacts (relevance via item-item,
|
|
119
|
+
intra-list diversity via a distance matrix, novelty via popularity). Rather than
|
|
120
|
+
ship multi-GB blobs — or redistribute the third-party catalogs they come from —
|
|
121
|
+
the benchmark **recomputes** them from a rating matrix you build from your own
|
|
122
|
+
download of the public source datasets (movies: **MovieLens 25M** — the "Latest"
|
|
123
|
+
snapshot at collection time — plus **MovieLens Tag Genome 2021**; books:
|
|
124
|
+
**goodbooks-10k**):
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from smrs import derived
|
|
128
|
+
art = derived.build_artifacts(rating_matrix_movies) # {item_item, distance_matrix, mean_popularities}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
`derived` provides the deterministic pieces: `popularity`, `cosine_distance`
|
|
132
|
+
(1 − cosine over item rating-vectors), and `ease_item_item` (EASE^R closed form,
|
|
133
|
+
used as the relevance model). This keeps the benchmark lightweight and
|
|
134
|
+
license-clean. The **bit-exact original artifacts are archived on OSF**
|
|
135
|
+
([v2](https://osf.io/wsakx)) for strict reproduction.
|
|
136
|
+
|
|
137
|
+
Get the source datasets with the bundled fetcher (downloads from the official
|
|
138
|
+
hosts — GroupLens, the goodbooks repo — under their licenses; or place them
|
|
139
|
+
in the destination directory (`--dest`) manually):
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
smrs-fetch --list # show sources + licenses, no download
|
|
143
|
+
smrs-fetch --dest ./sm-rs-sources # download MovieLens 25M, Tag Genome 2021, goodbooks-10k
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
> **Why recompute?** MovieLens may not be redistributed, and a 5 GB download hurts
|
|
147
|
+
> adoption. You obtain MovieLens/goodbooks under their own licenses; we ship only
|
|
148
|
+
> the study data, the id-maps (`movies.json`: movieId→imdbId; `books.json`:
|
|
149
|
+
> book_index→goodreads_id), and the recompute code. The canonical scorer is pinned
|
|
150
|
+
> (sources above; **positive feedback = rating ≥ 3**; EASE λ) so results stay
|
|
151
|
+
> comparable. For strict reproduction of the paper's numbers, use the OSF artifacts.
|
|
152
|
+
|
|
153
|
+
## Reproduction check
|
|
154
|
+
|
|
155
|
+
`examples/reproduce_perceived.py` reproduces the paper's **Linear Regression**
|
|
156
|
+
baseline for Tasks 5 & 6 (the one baseline needing no derived/source data), scored
|
|
157
|
+
with this package's evaluators — MAE matches the paper exactly; MSE and Kendall τ deviate in some rows (every cell below shows ours / paper):
|
|
158
|
+
|
|
159
|
+
| subtask | MAE (ours / paper) | MSE (ours / paper) | Kendall τ (ours / paper) |
|
|
160
|
+
|---|---|---|---|
|
|
161
|
+
| 5.1 relevance | 0.235 / 0.235 | 0.086 / 0.085 | 0.076 / 0.080 |
|
|
162
|
+
| 5.2 diversity | 0.222 / 0.222 | 0.080 / 0.061 | 0.197 / 0.196 |
|
|
163
|
+
| 5.3 novelty | 0.259 / 0.259 | 0.104 / 0.104 | 0.143 / 0.143 |
|
|
164
|
+
| 5.4 serendipity | 0.270 / 0.270 | 0.104 / 0.103 | 0.036 / 0.039 |
|
|
165
|
+
| 6 satisfaction | 0.255 / 0.255 | 0.102 / 0.102 | 0.039 / 0.045 |
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
SMRS_DATA_DIR=/path/to/sm-rs python examples/reproduce_perceived.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Submitting to the leaderboard
|
|
172
|
+
|
|
173
|
+
Self-service (no submission server): run the canonical `evaluate()` for a task,
|
|
174
|
+
then open a PR adding your row to the leaderboard in the
|
|
175
|
+
[dataset card](https://huggingface.co/datasets/pdokoupil/SM-RS), with a link to
|
|
176
|
+
reproduce. The shipped baselines are the rows to beat.
|
|
177
|
+
|
|
178
|
+
## Citing
|
|
179
|
+
|
|
180
|
+
Please cite **both** papers (GitHub's "Cite this repository" reads
|
|
181
|
+
[`CITATION.cff`](CITATION.cff)):
|
|
182
|
+
|
|
183
|
+
```bibtex
|
|
184
|
+
@article{dokoupil2026smrs2,
|
|
185
|
+
author = {Dokoupil, Patrik and Peska, Ladislav},
|
|
186
|
+
title = {SM-RS 2.0: User-perceived Qualities of Single- and Multi-Objective Recommender Systems},
|
|
187
|
+
journal = {ACM Transactions on Recommender Systems},
|
|
188
|
+
volume = {4}, number = {3}, year = {2026},
|
|
189
|
+
doi = {10.1145/3754459}
|
|
190
|
+
}
|
|
191
|
+
@inproceedings{dokoupil2024smrs,
|
|
192
|
+
author = {Dokoupil, Patrik and Peska, Ladislav and Boratto, Ludovico},
|
|
193
|
+
title = {SM-RS: Single- and Multi-Objective Recommendations with Contextual Impressions and Beyond-Accuracy Propensity Scores},
|
|
194
|
+
booktitle = {Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval},
|
|
195
|
+
series = {SIGIR '24}, pages = {988--995}, year = {2024},
|
|
196
|
+
doi = {10.1145/3626772.3657863}
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
Code: MIT (see [`LICENSE`](LICENSE)). Data: CC-BY-4.0 (see the dataset card).
|