pairwise-comparison-framework 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pairwise_comparison_framework-1.0.0/PKG-INFO +157 -0
- pairwise_comparison_framework-1.0.0/README.md +139 -0
- pairwise_comparison_framework-1.0.0/pyproject.toml +28 -0
- pairwise_comparison_framework-1.0.0/src/pairwise_comparison_framework/__init__.py +4 -0
- pairwise_comparison_framework-1.0.0/src/pairwise_comparison_framework/_base.py +59 -0
- pairwise_comparison_framework-1.0.0/src/pairwise_comparison_framework/clusterizer/__init__.py +805 -0
- pairwise_comparison_framework-1.0.0/src/pairwise_comparison_framework/models/__init__.py +41 -0
- pairwise_comparison_framework-1.0.0/src/pairwise_comparison_framework/predictor/__init__.py +637 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: pairwise-comparison-framework
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A framework for characterizing and predicting pairwise comparisons using user clustering and matrix completion
|
|
5
|
+
Author: Henrique Chaves
|
|
6
|
+
Author-email: Henrique Chaves <henriquechaves@poli.ufrj.br>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Dist: matplotlib>=3.10.7
|
|
9
|
+
Requires-Dist: networkx>=3.5
|
|
10
|
+
Requires-Dist: numpy>=2.3.3
|
|
11
|
+
Requires-Dist: pandas>=2.3.3
|
|
12
|
+
Requires-Dist: pydantic>=2.11.9
|
|
13
|
+
Requires-Dist: scikit-learn>=1.7.2
|
|
14
|
+
Requires-Dist: scipy>=1.16.2
|
|
15
|
+
Requires-Dist: fancyimpute>=0.7.0
|
|
16
|
+
Requires-Python: >=3.13
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# Pairwise Comparison Framework
|
|
20
|
+
|
|
21
|
+
A Python framework for clustering users based on their pairwise comparison patterns and predicting missing comparisons via matrix completion.
|
|
22
|
+
|
|
23
|
+
This project was developed as part of a final thesis (TCC) at the Escola Politécnica, Universidade Federal do Rio de Janeiro (UFRJ).
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
The framework operates in two stages:
|
|
28
|
+
|
|
29
|
+
1. **Clustering** (Stage 1): Each user's pairwise comparisons are modeled as a weighted directed graph. Shortest-path-length matrices serve as structural preference signatures, a custom distance metric measures user similarity, and MDS + K-Means produces the final clusters.
|
|
30
|
+
|
|
31
|
+
2. **Prediction** (Stage 2): A sparse comparison matrix is built from all users' data. Matrix completion (Iterative SVD or KNN) fills in missing entries. Applying completion within each cluster improves prediction accuracy over a global model.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
Requires Python 3.13+. Install with [uv](https://docs.astral.sh/uv/):
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv sync
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or with pip:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -e .
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
### Clustering
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
import pairwise_comparison_framework as pcf
|
|
53
|
+
|
|
54
|
+
clusterizer = pcf.Clusterizer()
|
|
55
|
+
clusterizer.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
56
|
+
|
|
57
|
+
# Visualize a user's preference graph
|
|
58
|
+
clusterizer.plot_graph("user_id", with_labels=True)
|
|
59
|
+
|
|
60
|
+
# Get item rankings via PageRank
|
|
61
|
+
clusterizer.get_ranking("user_id", with_labels=True)
|
|
62
|
+
|
|
63
|
+
# Cluster users
|
|
64
|
+
clusters = clusterizer.get_user_clusters(n_clusters=3)
|
|
65
|
+
|
|
66
|
+
# Assess cluster quality
|
|
67
|
+
clusterizer.get_silhouette_scores(n_clusters=3)
|
|
68
|
+
clusterizer.get_elbow_data(max_clusters=10)
|
|
69
|
+
clusterizer.plot_clusters(n_clusters=3)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Prediction
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
# Iterative SVD (default)
|
|
76
|
+
predictor = pcf.Predictor(impute_method="svd", impute_rank=1)
|
|
77
|
+
predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
78
|
+
|
|
79
|
+
# Or KNN imputation
|
|
80
|
+
predictor = pcf.Predictor(impute_method="knn", impute_k=5)
|
|
81
|
+
predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
82
|
+
|
|
83
|
+
# Predict missing comparisons
|
|
84
|
+
completed_matrix = predictor.predict()
|
|
85
|
+
|
|
86
|
+
# Controlled evaluation: global vs per-cluster
|
|
87
|
+
results = predictor.evaluate_global_vs_clusters(
|
|
88
|
+
clusters, n_rounds=50, test_size=0.2
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Data Format
|
|
93
|
+
|
|
94
|
+
### Comparisons (`comparisons.json`)
|
|
95
|
+
|
|
96
|
+
A JSON array of user comparison records. Each comparison score answers "How much more preferred is the first item over the second?":
|
|
97
|
+
|
|
98
|
+
- `2`: first item much more preferred
|
|
99
|
+
- `1`: first item more preferred
|
|
100
|
+
- `0`: equally preferred (tie)
|
|
101
|
+
- `-1`: second item more preferred
|
|
102
|
+
- `-2`: second item much more preferred
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
[
|
|
106
|
+
{
|
|
107
|
+
"user_id": "user_01",
|
|
108
|
+
"comparisons": [
|
|
109
|
+
{
|
|
110
|
+
"first_item_id": "item_01",
|
|
111
|
+
"second_item_id": "item_02",
|
|
112
|
+
"score": 2
|
|
113
|
+
}
|
|
114
|
+
]
|
|
115
|
+
}
|
|
116
|
+
]
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Labels (`labels.json`, optional)
|
|
120
|
+
|
|
121
|
+
A JSON array mapping item IDs to human-readable names:
|
|
122
|
+
|
|
123
|
+
```json
|
|
124
|
+
[
|
|
125
|
+
{
|
|
126
|
+
"item_id": "item_01",
|
|
127
|
+
"label": "Oranges"
|
|
128
|
+
}
|
|
129
|
+
]
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Notebooks
|
|
133
|
+
|
|
134
|
+
| Notebook | Description |
|
|
135
|
+
|----------|-------------|
|
|
136
|
+
| `notebooks/clustering.ipynb` | Clustering pipeline demo with sample data |
|
|
137
|
+
| `notebooks/prediction.ipynb` | Prediction pipeline demo with sample data |
|
|
138
|
+
| `notebooks/epfl_analysis.ipynb` | Full analysis on the EPFL food preference dataset (private dataset, not included) |
|
|
139
|
+
|
|
140
|
+
## Project Structure
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
pairwise-comparison-framework/
|
|
144
|
+
├── src/pairwise_comparison_framework/
|
|
145
|
+
│ ├── __init__.py # Exports Clusterizer, Predictor
|
|
146
|
+
│ ├── _base.py # Shared data loading logic
|
|
147
|
+
│ ├── models/ # Pydantic data models
|
|
148
|
+
│ ├── clusterizer/ # Stage 1: clustering pipeline
|
|
149
|
+
│ └── predictor/ # Stage 2: prediction pipeline
|
|
150
|
+
├── data/
|
|
151
|
+
│ └── sample/ # Small sample dataset (wine preferences)
|
|
152
|
+
└── notebooks/ # Jupyter notebooks
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## License
|
|
156
|
+
|
|
157
|
+
This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Pairwise Comparison Framework
|
|
2
|
+
|
|
3
|
+
A Python framework for clustering users based on their pairwise comparison patterns and predicting missing comparisons via matrix completion.
|
|
4
|
+
|
|
5
|
+
This project was developed as part of a final thesis (TCC) at the Escola Politécnica, Universidade Federal do Rio de Janeiro (UFRJ).
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
The framework operates in two stages:
|
|
10
|
+
|
|
11
|
+
1. **Clustering** (Stage 1): Each user's pairwise comparisons are modeled as a weighted directed graph. Shortest-path-length matrices serve as structural preference signatures, a custom distance metric measures user similarity, and MDS + K-Means produces the final clusters.
|
|
12
|
+
|
|
13
|
+
2. **Prediction** (Stage 2): A sparse comparison matrix is built from all users' data. Matrix completion (Iterative SVD or KNN) fills in missing entries. Applying completion within each cluster improves prediction accuracy over a global model.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Requires Python 3.13+. Install with [uv](https://docs.astral.sh/uv/):
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
uv sync
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Or with pip:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install -e .
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
### Clustering
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import pairwise_comparison_framework as pcf
|
|
35
|
+
|
|
36
|
+
clusterizer = pcf.Clusterizer()
|
|
37
|
+
clusterizer.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
38
|
+
|
|
39
|
+
# Visualize a user's preference graph
|
|
40
|
+
clusterizer.plot_graph("user_id", with_labels=True)
|
|
41
|
+
|
|
42
|
+
# Get item rankings via PageRank
|
|
43
|
+
clusterizer.get_ranking("user_id", with_labels=True)
|
|
44
|
+
|
|
45
|
+
# Cluster users
|
|
46
|
+
clusters = clusterizer.get_user_clusters(n_clusters=3)
|
|
47
|
+
|
|
48
|
+
# Assess cluster quality
|
|
49
|
+
clusterizer.get_silhouette_scores(n_clusters=3)
|
|
50
|
+
clusterizer.get_elbow_data(max_clusters=10)
|
|
51
|
+
clusterizer.plot_clusters(n_clusters=3)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Prediction
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
# Iterative SVD (default)
|
|
58
|
+
predictor = pcf.Predictor(impute_method="svd", impute_rank=1)
|
|
59
|
+
predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
60
|
+
|
|
61
|
+
# Or KNN imputation
|
|
62
|
+
predictor = pcf.Predictor(impute_method="knn", impute_k=5)
|
|
63
|
+
predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
|
|
64
|
+
|
|
65
|
+
# Predict missing comparisons
|
|
66
|
+
completed_matrix = predictor.predict()
|
|
67
|
+
|
|
68
|
+
# Controlled evaluation: global vs per-cluster
|
|
69
|
+
results = predictor.evaluate_global_vs_clusters(
|
|
70
|
+
clusters, n_rounds=50, test_size=0.2
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Data Format
|
|
75
|
+
|
|
76
|
+
### Comparisons (`comparisons.json`)
|
|
77
|
+
|
|
78
|
+
A JSON array of user comparison records. Each comparison score answers "How much more preferred is the first item over the second?":
|
|
79
|
+
|
|
80
|
+
- `2`: first item much more preferred
|
|
81
|
+
- `1`: first item more preferred
|
|
82
|
+
- `0`: equally preferred (tie)
|
|
83
|
+
- `-1`: second item more preferred
|
|
84
|
+
- `-2`: second item much more preferred
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
[
|
|
88
|
+
{
|
|
89
|
+
"user_id": "user_01",
|
|
90
|
+
"comparisons": [
|
|
91
|
+
{
|
|
92
|
+
"first_item_id": "item_01",
|
|
93
|
+
"second_item_id": "item_02",
|
|
94
|
+
"score": 2
|
|
95
|
+
}
|
|
96
|
+
]
|
|
97
|
+
}
|
|
98
|
+
]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Labels (`labels.json`, optional)
|
|
102
|
+
|
|
103
|
+
A JSON array mapping item IDs to human-readable names:
|
|
104
|
+
|
|
105
|
+
```json
|
|
106
|
+
[
|
|
107
|
+
{
|
|
108
|
+
"item_id": "item_01",
|
|
109
|
+
"label": "Oranges"
|
|
110
|
+
}
|
|
111
|
+
]
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Notebooks
|
|
115
|
+
|
|
116
|
+
| Notebook | Description |
|
|
117
|
+
|----------|-------------|
|
|
118
|
+
| `notebooks/clustering.ipynb` | Clustering pipeline demo with sample data |
|
|
119
|
+
| `notebooks/prediction.ipynb` | Prediction pipeline demo with sample data |
|
|
120
|
+
| `notebooks/epfl_analysis.ipynb` | Full analysis on the EPFL food preference dataset (private dataset, not included) |
|
|
121
|
+
|
|
122
|
+
## Project Structure
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
pairwise-comparison-framework/
|
|
126
|
+
├── src/pairwise_comparison_framework/
|
|
127
|
+
│ ├── __init__.py # Exports Clusterizer, Predictor
|
|
128
|
+
│ ├── _base.py # Shared data loading logic
|
|
129
|
+
│ ├── models/ # Pydantic data models
|
|
130
|
+
│ ├── clusterizer/ # Stage 1: clustering pipeline
|
|
131
|
+
│ └── predictor/ # Stage 2: prediction pipeline
|
|
132
|
+
├── data/
|
|
133
|
+
│ └── sample/ # Small sample dataset (wine preferences)
|
|
134
|
+
└── notebooks/ # Jupyter notebooks
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "pairwise-comparison-framework"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A framework for characterizing and predicting pairwise comparisons using user clustering and matrix completion"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Henrique Chaves", email = "henriquechaves@poli.ufrj.br" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.13"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
dependencies = [
|
|
12
|
+
"matplotlib>=3.10.7",
|
|
13
|
+
"networkx>=3.5",
|
|
14
|
+
"numpy>=2.3.3",
|
|
15
|
+
"pandas>=2.3.3",
|
|
16
|
+
"pydantic>=2.11.9",
|
|
17
|
+
"scikit-learn>=1.7.2",
|
|
18
|
+
"scipy>=1.16.2",
|
|
19
|
+
"fancyimpute>=0.7.0",
|
|
20
|
+
]
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["uv_build>=0.8.4,<0.9.0"]
|
|
23
|
+
build-backend = "uv_build"
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"jupyter>=1.1.1",
|
|
28
|
+
]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from .models import ItemLabel, UserComparisons
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseDataLoader:
    """Shared data-loading logic for classes that consume comparison data.

    Provides `_load_comparisons` and `_load_labels` together with the
    `comparisons` and `labels` attributes they populate. Subclasses
    should call `super().__init__()` and then perform their own
    initialisation.
    """

    # Per-user comparison records; replaced wholesale by `_load_comparisons`.
    comparisons: list[UserComparisons]
    # Item labels; replaced wholesale by `_load_labels`.
    labels: list[ItemLabel]

    def __init__(self) -> None:
        """Start with empty `comparisons` and `labels` lists."""
        self.comparisons = []
        self.labels = []

    @staticmethod
    def _read_json_list(filename: str, kind: str) -> list:
        """Parse a JSON file and return its top-level array.

        Args:
            filename: Path to the JSON file.
            kind: Name of the data being loaded; used only to build the
                error message (`"<kind> must be a list"`).

        Returns:
            The decoded top-level JSON array.

        Raises:
            ValueError: If the top-level JSON value is not an array.
        """
        # JSON text is UTF-8 by specification, so decode it explicitly
        # instead of relying on the platform's locale encoding (which is
        # not UTF-8 on every system and would corrupt non-ASCII labels).
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError(f"{kind} must be a list")

        return data

    def _load_comparisons(self, filename: str) -> None:
        """Load pairwise comparisons from a JSON file.

        The file must contain a JSON array of objects, each with a
        `user_id` and a `comparisons` list. Every element is unpacked as
        keyword arguments into `UserComparisons`; the resulting list
        replaces `self.comparisons`, and a short summary is printed.

        Args:
            filename: Path to the JSON file.

        Raises:
            ValueError: If the top-level JSON value is not an array.
        """
        data = self._read_json_list(filename, "comparisons")

        # Assigned only after every record was built, so a failure while
        # constructing a record leaves the previous value untouched.
        self.comparisons = [UserComparisons(**item) for item in data]

        print(f"Loaded pairwise comparisons from {len(self.comparisons)} users")
        print(
            "Total amount of pairwise comparisons: "
            f"{sum(len(user.comparisons) for user in self.comparisons)}"
        )

    def _load_labels(self, filename: str) -> None:
        """Load item labels from a JSON file.

        Every element of the array is unpacked as keyword arguments into
        `ItemLabel`; the resulting list replaces `self.labels`, and a
        short summary is printed.

        Args:
            filename: Path to the JSON file containing a list of
                `{"item_id": ..., "label": ...}` objects.

        Raises:
            ValueError: If the top-level JSON value is not an array.
        """
        data = self._read_json_list(filename, "labels")

        self.labels = [ItemLabel(**item) for item in data]
        print(f"Loaded labels from {len(self.labels)} items")
|