pairwise-comparison-framework 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ Metadata-Version: 2.3
2
+ Name: pairwise-comparison-framework
3
+ Version: 1.0.0
4
+ Summary: A framework for characterizing and predicting pairwise comparisons using user clustering and matrix completion
5
+ Author: Henrique Chaves
6
+ Author-email: Henrique Chaves <henriquechaves@poli.ufrj.br>
7
+ License: MIT
8
+ Requires-Dist: matplotlib>=3.10.7
9
+ Requires-Dist: networkx>=3.5
10
+ Requires-Dist: numpy>=2.3.3
11
+ Requires-Dist: pandas>=2.3.3
12
+ Requires-Dist: pydantic>=2.11.9
13
+ Requires-Dist: scikit-learn>=1.7.2
14
+ Requires-Dist: scipy>=1.16.2
15
+ Requires-Dist: fancyimpute>=0.7.0
16
+ Requires-Python: >=3.13
17
+ Description-Content-Type: text/markdown
18
+
19
+ # Pairwise Comparison Framework
20
+
21
+ A Python framework for clustering users based on their pairwise comparison patterns and predicting missing comparisons via matrix completion.
22
+
23
+ This project was developed as part of a final thesis (TCC) at the Escola Politécnica, Universidade Federal do Rio de Janeiro (UFRJ).
24
+
25
+ ## Overview
26
+
27
+ The framework operates in two stages:
28
+
29
+ 1. **Clustering** (Stage 1): Each user's pairwise comparisons are modeled as a weighted directed graph. Shortest-path-length matrices serve as structural preference signatures, a custom distance metric measures user similarity, and MDS + K-Means produces the final clusters.
30
+
31
+ 2. **Prediction** (Stage 2): A sparse comparison matrix is built from all users' data. Matrix completion (Iterative SVD or KNN) fills in missing entries. Applying completion within each cluster improves prediction accuracy over a global model.
32
+
33
+ ## Installation
34
+
35
+ Requires Python 3.13+. Install with [uv](https://docs.astral.sh/uv/):
36
+
37
+ ```bash
38
+ uv sync
39
+ ```
40
+
41
+ Or with pip:
42
+
43
+ ```bash
44
+ pip install -e .
45
+ ```
46
+
47
+ ## Quick Start
48
+
49
+ ### Clustering
50
+
51
+ ```python
52
+ import pairwise_comparison_framework as pcf
53
+
54
+ clusterizer = pcf.Clusterizer()
55
+ clusterizer.load_data("data/sample/comparisons.json", "data/sample/labels.json")
56
+
57
+ # Visualize a user's preference graph
58
+ clusterizer.plot_graph("user_id", with_labels=True)
59
+
60
+ # Get item rankings via PageRank
61
+ clusterizer.get_ranking("user_id", with_labels=True)
62
+
63
+ # Cluster users
64
+ clusters = clusterizer.get_user_clusters(n_clusters=3)
65
+
66
+ # Assess cluster quality
67
+ clusterizer.get_silhouette_scores(n_clusters=3)
68
+ clusterizer.get_elbow_data(max_clusters=10)
69
+ clusterizer.plot_clusters(n_clusters=3)
70
+ ```
71
+
72
+ ### Prediction
73
+
74
+ ```python
75
+ # Iterative SVD (default)
76
+ predictor = pcf.Predictor(impute_method="svd", impute_rank=1)
77
+ predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
78
+
79
+ # Or KNN imputation
80
+ predictor = pcf.Predictor(impute_method="knn", impute_k=5)
81
+ predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
82
+
83
+ # Predict missing comparisons
84
+ completed_matrix = predictor.predict()
85
+
86
+ # Controlled evaluation: global vs per-cluster
87
+ results = predictor.evaluate_global_vs_clusters(
88
+ clusters, n_rounds=50, test_size=0.2
89
+ )
90
+ ```
91
+
92
+ ## Data Format
93
+
94
+ ### Comparisons (`comparisons.json`)
95
+
96
+ A JSON array of user comparison records. Each comparison score answers "How much more preferred is the first item over the second?":
97
+
98
+ - `2`: first item much more preferred
99
+ - `1`: first item more preferred
100
+ - `0`: equally preferred (tie)
101
+ - `-1`: second item more preferred
102
+ - `-2`: second item much more preferred
103
+
104
+ ```json
105
+ [
106
+ {
107
+ "user_id": "user_01",
108
+ "comparisons": [
109
+ {
110
+ "first_item_id": "item_01",
111
+ "second_item_id": "item_02",
112
+ "score": 2
113
+ }
114
+ ]
115
+ }
116
+ ]
117
+ ```
118
+
119
+ ### Labels (`labels.json`, optional)
120
+
121
+ A JSON array mapping item IDs to human-readable names:
122
+
123
+ ```json
124
+ [
125
+ {
126
+ "item_id": "item_01",
127
+ "label": "Oranges"
128
+ }
129
+ ]
130
+ ```
131
+
132
+ ## Notebooks
133
+
134
+ | Notebook | Description |
135
+ |----------|-------------|
136
+ | `notebooks/clustering.ipynb` | Clustering pipeline demo with sample data |
137
+ | `notebooks/prediction.ipynb` | Prediction pipeline demo with sample data |
138
+ | `notebooks/epfl_analysis.ipynb` | Full analysis on the EPFL food preference dataset (private dataset, not included) |
139
+
140
+ ## Project Structure
141
+
142
+ ```
143
+ pairwise-comparison-framework/
144
+ ├── src/pairwise_comparison_framework/
145
+ │ ├── __init__.py # Exports Clusterizer, Predictor
146
+ │ ├── _base.py # Shared data loading logic
147
+ │ ├── models/ # Pydantic data models
148
+ │ ├── clusterizer/ # Stage 1: clustering pipeline
149
+ │ └── predictor/ # Stage 2: prediction pipeline
150
+ ├── data/
151
+ │ └── sample/ # Small sample dataset (wine preferences)
152
+ └── notebooks/ # Jupyter notebooks
153
+ ```
154
+
155
+ ## License
156
+
157
+ This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,139 @@
1
+ # Pairwise Comparison Framework
2
+
3
+ A Python framework for clustering users based on their pairwise comparison patterns and predicting missing comparisons via matrix completion.
4
+
5
+ This project was developed as part of a final thesis (TCC) at the Escola Politécnica, Universidade Federal do Rio de Janeiro (UFRJ).
6
+
7
+ ## Overview
8
+
9
+ The framework operates in two stages:
10
+
11
+ 1. **Clustering** (Stage 1): Each user's pairwise comparisons are modeled as a weighted directed graph. Shortest-path-length matrices serve as structural preference signatures, a custom distance metric measures user similarity, and MDS + K-Means produces the final clusters.
12
+
13
+ 2. **Prediction** (Stage 2): A sparse comparison matrix is built from all users' data. Matrix completion (Iterative SVD or KNN) fills in missing entries. Applying completion within each cluster improves prediction accuracy over a global model.
14
+
15
+ ## Installation
16
+
17
+ Requires Python 3.13+. Install with [uv](https://docs.astral.sh/uv/):
18
+
19
+ ```bash
20
+ uv sync
21
+ ```
22
+
23
+ Or with pip:
24
+
25
+ ```bash
26
+ pip install -e .
27
+ ```
28
+
29
+ ## Quick Start
30
+
31
+ ### Clustering
32
+
33
+ ```python
34
+ import pairwise_comparison_framework as pcf
35
+
36
+ clusterizer = pcf.Clusterizer()
37
+ clusterizer.load_data("data/sample/comparisons.json", "data/sample/labels.json")
38
+
39
+ # Visualize a user's preference graph
40
+ clusterizer.plot_graph("user_id", with_labels=True)
41
+
42
+ # Get item rankings via PageRank
43
+ clusterizer.get_ranking("user_id", with_labels=True)
44
+
45
+ # Cluster users
46
+ clusters = clusterizer.get_user_clusters(n_clusters=3)
47
+
48
+ # Assess cluster quality
49
+ clusterizer.get_silhouette_scores(n_clusters=3)
50
+ clusterizer.get_elbow_data(max_clusters=10)
51
+ clusterizer.plot_clusters(n_clusters=3)
52
+ ```
53
+
54
+ ### Prediction
55
+
56
+ ```python
57
+ # Iterative SVD (default)
58
+ predictor = pcf.Predictor(impute_method="svd", impute_rank=1)
59
+ predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
60
+
61
+ # Or KNN imputation
62
+ predictor = pcf.Predictor(impute_method="knn", impute_k=5)
63
+ predictor.load_data("data/sample/comparisons.json", "data/sample/labels.json")
64
+
65
+ # Predict missing comparisons
66
+ completed_matrix = predictor.predict()
67
+
68
+ # Controlled evaluation: global vs per-cluster
69
+ results = predictor.evaluate_global_vs_clusters(
70
+ clusters, n_rounds=50, test_size=0.2
71
+ )
72
+ ```
73
+
74
+ ## Data Format
75
+
76
+ ### Comparisons (`comparisons.json`)
77
+
78
+ A JSON array of user comparison records. Each comparison score answers "How much more preferred is the first item over the second?":
79
+
80
+ - `2`: first item much more preferred
81
+ - `1`: first item more preferred
82
+ - `0`: equally preferred (tie)
83
+ - `-1`: second item more preferred
84
+ - `-2`: second item much more preferred
85
+
86
+ ```json
87
+ [
88
+ {
89
+ "user_id": "user_01",
90
+ "comparisons": [
91
+ {
92
+ "first_item_id": "item_01",
93
+ "second_item_id": "item_02",
94
+ "score": 2
95
+ }
96
+ ]
97
+ }
98
+ ]
99
+ ```
100
+
101
+ ### Labels (`labels.json`, optional)
102
+
103
+ A JSON array mapping item IDs to human-readable names:
104
+
105
+ ```json
106
+ [
107
+ {
108
+ "item_id": "item_01",
109
+ "label": "Oranges"
110
+ }
111
+ ]
112
+ ```
113
+
114
+ ## Notebooks
115
+
116
+ | Notebook | Description |
117
+ |----------|-------------|
118
+ | `notebooks/clustering.ipynb` | Clustering pipeline demo with sample data |
119
+ | `notebooks/prediction.ipynb` | Prediction pipeline demo with sample data |
120
+ | `notebooks/epfl_analysis.ipynb` | Full analysis on the EPFL food preference dataset (private dataset, not included) |
121
+
122
+ ## Project Structure
123
+
124
+ ```
125
+ pairwise-comparison-framework/
126
+ ├── src/pairwise_comparison_framework/
127
+ │ ├── __init__.py # Exports Clusterizer, Predictor
128
+ │ ├── _base.py # Shared data loading logic
129
+ │ ├── models/ # Pydantic data models
130
+ │ ├── clusterizer/ # Stage 1: clustering pipeline
131
+ │ └── predictor/ # Stage 2: prediction pipeline
132
+ ├── data/
133
+ │ └── sample/ # Small sample dataset (wine preferences)
134
+ └── notebooks/ # Jupyter notebooks
135
+ ```
136
+
137
+ ## License
138
+
139
+ This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "pairwise-comparison-framework"
3
+ version = "1.0.0"
4
+ description = "A framework for characterizing and predicting pairwise comparisons using user clustering and matrix completion"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Henrique Chaves", email = "henriquechaves@poli.ufrj.br" }
8
+ ]
9
+ requires-python = ">=3.13"
10
+ license = {text = "MIT"}
11
+ dependencies = [
12
+ "matplotlib>=3.10.7",
13
+ "networkx>=3.5",
14
+ "numpy>=2.3.3",
15
+ "pandas>=2.3.3",
16
+ "pydantic>=2.11.9",
17
+ "scikit-learn>=1.7.2",
18
+ "scipy>=1.16.2",
19
+ "fancyimpute>=0.7.0",
20
+ ]
21
+ [build-system]
22
+ requires = ["uv_build>=0.8.4,<0.9.0"]
23
+ build-backend = "uv_build"
24
+
25
+ [dependency-groups]
26
+ dev = [
27
+ "jupyter>=1.1.1",
28
+ ]
@@ -0,0 +1,4 @@
1
+ from .clusterizer import Clusterizer
2
+ from .predictor import Predictor
3
+
4
+ __all__ = ["Clusterizer", "Predictor"]
@@ -0,0 +1,59 @@
1
+ import json
2
+
3
+ from .models import ItemLabel, UserComparisons
4
+
5
+
6
class BaseDataLoader:
    """Shared data-loading logic for classes that consume comparison data.

    Provides `_load_comparisons` and `_load_labels` together with the
    `comparisons` and `labels` attributes they populate. Subclasses
    should call `super().__init__()` and then perform their own
    initialisation.
    """

    comparisons: list[UserComparisons]
    labels: list[ItemLabel]

    def __init__(self) -> None:
        """Start with empty `comparisons` and `labels` lists."""
        self.comparisons = []
        self.labels = []

    @staticmethod
    def _read_json_list(filename: str, kind: str) -> list:
        """Parse a JSON file and require its top-level value to be an array.

        Args:
            filename: Path to the JSON file.
            kind: Name of the data being loaded; used as the subject of
                the error message.

        Returns:
            The decoded top-level JSON array.

        Raises:
            ValueError: If the top-level JSON value is not a list.
        """
        # Decode explicitly as UTF-8: without `encoding`, `open` falls back
        # to the platform locale encoding, which corrupts or rejects
        # non-ASCII text on systems whose locale is not UTF-8.
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError(f"{kind} must be a list")

        return data

    def _load_comparisons(self, filename: str) -> None:
        """Load pairwise comparisons from a JSON file.

        The file must contain a JSON array of objects, each with a
        `user_id` and a `comparisons` list. Each object is passed as
        keyword arguments to `UserComparisons`, and the result replaces
        `self.comparisons`. A two-line summary (user count and total
        comparison count) is printed afterwards.

        Args:
            filename: Path to the JSON file.

        Raises:
            ValueError: If the top-level JSON value is not a list.
        """
        data = self._read_json_list(filename, "comparisons")

        self.comparisons = [UserComparisons(**item) for item in data]

        print(f"Loaded pairwise comparisons from {len(self.comparisons)} users")
        print(
            f"Total amount of pairwise comparisons: "
            f"{sum(len(user.comparisons) for user in self.comparisons)}"
        )

    def _load_labels(self, filename: str) -> None:
        """Load item labels from a JSON file.

        Each object is passed as keyword arguments to `ItemLabel`, and
        the result replaces `self.labels`. The number of loaded labels is
        printed afterwards.

        Args:
            filename: Path to the JSON file containing a list of
                `{"item_id": ..., "label": ...}` objects.

        Raises:
            ValueError: If the top-level JSON value is not a list.
        """
        data = self._read_json_list(filename, "labels")

        self.labels = [ItemLabel(**item) for item in data]
        print(f"Loaded labels from {len(self.labels)} items")