voting-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. voting_mcp-0.1.0/.env.example +10 -0
  2. voting_mcp-0.1.0/.github/workflows/ci.yml +29 -0
  3. voting_mcp-0.1.0/.gitignore +20 -0
  4. voting_mcp-0.1.0/CLAUDE.md +155 -0
  5. voting_mcp-0.1.0/PKG-INFO +143 -0
  6. voting_mcp-0.1.0/README.md +120 -0
  7. voting_mcp-0.1.0/RESULTS.md +91 -0
  8. voting_mcp-0.1.0/bench/__init__.py +1 -0
  9. voting_mcp-0.1.0/bench/compare.py +153 -0
  10. voting_mcp-0.1.0/bench/config/models.yaml +29 -0
  11. voting_mcp-0.1.0/bench/ensemble.py +84 -0
  12. voting_mcp-0.1.0/bench/fetch_arc.py +72 -0
  13. voting_mcp-0.1.0/bench/fetch_mmlu_pro.py +82 -0
  14. voting_mcp-0.1.0/bench/metrics.py +47 -0
  15. voting_mcp-0.1.0/bench/prompts.py +75 -0
  16. voting_mcp-0.1.0/bench/run_ensemble.py +232 -0
  17. voting_mcp-0.1.0/docs/accuracy_arc.png +0 -0
  18. voting_mcp-0.1.0/docs/accuracy_mmlu_pro.png +0 -0
  19. voting_mcp-0.1.0/pyproject.toml +66 -0
  20. voting_mcp-0.1.0/src/voting_mcp/__init__.py +3 -0
  21. voting_mcp-0.1.0/src/voting_mcp/aggregate.py +54 -0
  22. voting_mcp-0.1.0/src/voting_mcp/rules/__init__.py +23 -0
  23. voting_mcp-0.1.0/src/voting_mcp/rules/_common.py +69 -0
  24. voting_mcp-0.1.0/src/voting_mcp/rules/approval.py +19 -0
  25. voting_mcp-0.1.0/src/voting_mcp/rules/borda.py +35 -0
  26. voting_mcp-0.1.0/src/voting_mcp/rules/condorcet.py +48 -0
  27. voting_mcp-0.1.0/src/voting_mcp/rules/copeland.py +29 -0
  28. voting_mcp-0.1.0/src/voting_mcp/rules/majority.py +49 -0
  29. voting_mcp-0.1.0/src/voting_mcp/rules/opinion_pool.py +31 -0
  30. voting_mcp-0.1.0/src/voting_mcp/rules/plurality.py +23 -0
  31. voting_mcp-0.1.0/src/voting_mcp/rules/stv.py +74 -0
  32. voting_mcp-0.1.0/src/voting_mcp/scoring.py +64 -0
  33. voting_mcp-0.1.0/src/voting_mcp/server.py +164 -0
  34. voting_mcp-0.1.0/src/voting_mcp/types.py +145 -0
  35. voting_mcp-0.1.0/tests/test_bench.py +149 -0
  36. voting_mcp-0.1.0/tests/test_rules.py +416 -0
  37. voting_mcp-0.1.0/tests/test_scoring.py +67 -0
  38. voting_mcp-0.1.0/tests/test_server.py +82 -0
  39. voting_mcp-0.1.0/tests/test_types.py +153 -0
  40. voting_mcp-0.1.0/uv.lock +1559 -0
@@ -0,0 +1,10 @@
1
+ # Bench harness only — the MCP server itself reads NO secrets.
2
+ # Routing default: OpenRouter single gateway (one key, every member is a model_id).
3
+ OPENROUTER_API_KEY=
4
+
5
+ # Per-provider fallback keys (only if you switch models.yaml off the gateway).
6
+ # OPENAI_API_KEY=
7
+ # GOOGLE_API_KEY=
8
+ # DEEPSEEK_API_KEY=
9
+ # ANTHROPIC_API_KEY=
10
+ # ZHIPU_API_KEY=
@@ -0,0 +1,29 @@
1
+ name: ci
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+ with:
17
+ python-version: "3.12"
18
+
19
+ - name: Sync (core + dev)
20
+ run: uv sync
21
+
22
+ - name: Lint
23
+ run: uv run ruff check .
24
+
25
+ - name: Type-check (strict)
26
+ run: uv run mypy --strict src
27
+
28
+ - name: Test
29
+ run: uv run pytest -q
@@ -0,0 +1,20 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ .venv/
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+
9
+ # Tooling caches
10
+ .pytest_cache/
11
+ .mypy_cache/
12
+ .ruff_cache/
13
+
14
+ # Secrets
15
+ .env
16
+
17
+ # Bench artifacts: caches, downloaded data, and generated tables/plots stay local.
18
+ # (The raw cache is a cost guardrail; commit it deliberately if you want reuse.)
19
+ bench/results/
20
+ bench/datasets/*.jsonl
@@ -0,0 +1,155 @@
1
+ # CLAUDE.md — voting-mcp
2
+
3
+ ## What this is
4
+ An MCP server that exposes **principled social-choice aggregation rules** (Borda, Copeland,
5
+ Condorcet, approval, STV, linear opinion pool) as callable tools — plus a benchmark harness
6
+ that **proves** these rules beat naive majority vote when aggregating a diverse ensemble of
7
+ LLM agents on a reasoning benchmark.
8
+
9
+ The differentiator is NOT "I can count votes." Almost every multi-agent system hand-rolls
10
+ `Counter(votes).most_common(1)`, which throws away preference intensity and confidence.
11
+ This server ships the rules with their known axiomatic properties AND a measured accuracy
12
+ delta over majority vote. The number is the point.
13
+
14
+ Target end state: published to PyPI + the MCP registry, README carrying the eval table.
15
+
16
+ ## Goals / non-goals
17
+ - GOAL: correct, exhaustively-tested rules; a clean FastMCP server over stdio; a reproducible
18
+ benchmark emitting an accuracy table + plot with confidence intervals.
19
+ - NON-GOAL (this weekend): web UI, remote HTTP/OAuth transport, exact Kemeny/Schulze,
20
+ learned agent weights. Leave hooks, don't build them.
21
+
22
+ ## Tech stack (pin these)
23
+ - Python 3.12, managed with `uv`
24
+ - `mcp[cli]` (FastMCP) for the server
25
+ - `pydantic` v2 for ballot/profile/result schemas
26
+ - `pytest` for tests, `ruff` for lint, `mypy --strict` for types
27
+ - Bench: async HTTP via the `openai` SDK pointed at an OpenAI-compatible endpoint; `.env` for keys;
28
+ `numpy` for aggregation math; `matplotlib` for the one plot
29
+
30
+ ## Repo layout
31
+ ```
32
+ voting-mcp/
33
+ ├── CLAUDE.md
34
+ ├── pyproject.toml
35
+ ├── README.md
36
+ ├── .env.example
37
+ ├── src/voting_mcp/
38
+ │ ├── __init__.py
39
+ │ ├── server.py # FastMCP entrypoint + @mcp.tool registrations
40
+ │ ├── types.py # Ballot / Profile / Result pydantic models
41
+ │ ├── aggregate.py # dispatch + ballot validation
42
+ │ └── rules/
43
+ │ ├── __init__.py
44
+ │ ├── borda.py
45
+ │ ├── copeland.py
46
+ │ ├── condorcet.py
47
+ │ ├── approval.py
48
+ │ ├── stv.py
49
+ │ ├── opinion_pool.py
50
+ │ ├── plurality.py # baseline
51
+ │ └── majority.py # baseline used by the bench
52
+ ├── tests/
53
+ │ └── test_rules.py
54
+ ├── bench/
55
+ │ ├── run_ensemble.py # calls N models on a dataset, caches raw responses
56
+ │ ├── compare.py # rules vs majority: accuracy + bootstrap 95% CI -> table + plot
57
+ │ ├── config/models.yaml # one entry per ensemble member
58
+ │ ├── datasets/ # downloaded benchmark (jsonl)
59
+ │ └── results/
60
+ │ └── raw/ # cached per-model responses (NEVER re-call API if present)
61
+ └── .github/workflows/ci.yml
62
+ ```
63
+
64
+ ## Commands
65
+ - install: `uv sync`
66
+ - run server: `uv run python -m voting_mcp.server`
67
+ - inspect (REQUIRED before a tool is "done"):
68
+ `npx @modelcontextprotocol/inspector uv run python -m voting_mcp.server`
69
+ - test: `uv run pytest -q`
70
+ - lint + types: `uv run ruff check . && uv run mypy --strict src`
71
+ - bench: `uv run python -m bench.run_ensemble --dataset bench/datasets/<file>.jsonl \
72
+ --models bench/config/models.yaml --limit 200`
73
+ - compare: `uv run python -m bench.compare --results bench/results/<run>/`
74
+
75
+ ## Domain model
76
+ - **Ballot** variants: strict ranking, truncated/partial ranking, approval set, score/utility
77
+ vector, probability distribution over candidates.
78
+ - **Profile** = candidate set + list of ballots (all over the same candidate set).
79
+ - **Result** = winner (or winners on a tie), full ranking where defined, and a `note` field
80
+ (e.g. "no Condorcet winner exists").
81
+
82
+ ## Rules to implement — each its own module + tests
83
+ | Rule | Consumes | Known for / why it's here |
84
+ |------|----------|---------------------------|
85
+ | borda | rankings | positional, Condorcet-inconsistent — good contrast |
86
+ | copeland | rankings | Condorcet-consistent pairwise |
87
+ | condorcet | rankings | returns winner OR explicit "no winner" on a cycle |
88
+ | approval | approval sets | simple, strategy-relevant |
89
+ | stv | rankings | multi-round elimination, clone-resistant |
90
+ | opinion_pool | distributions | linear pool — preserves confidence, NOT argmax |
91
+ | plurality | top choice | baseline |
92
+ | majority | top choice | the baseline the bench must beat |
93
+
94
+ ## Correctness discipline — READ THIS
95
+ Social choice is full of subtle edge cases and an LLM gets them wrong confidently. So:
96
+ - **TEST-FIRST for every rule.** Before/with each implementation, write textbook profiles with
97
+ a KNOWN expected winner. Required edge cases per rule where applicable:
98
+ - a **Condorcet cycle** (A>B>C>A) → condorcet must return "no winner", not crash
99
+ - a **tie** → must surface multiple winners, never silently pick one
100
+ - a **truncated/partial ballot** → defined behavior, not a coerce-to-zero
101
+ - a **clone** scenario for stv/borda
102
+ - **Tie-breaking is an explicit, documented parameter** — never an implicit silent rule.
103
+ - **Validate inputs**: reject malformed ballots with a clear error. Never coerce silently.
104
+ - **Rules are pure functions**: take a Profile, return a Result. No I/O, no globals, deterministic.
105
+ - Kemeny (NP-hard to compute exactly) and Schulze (polynomial-time, but deferred) are out of scope; do not approximate
106
+ silently.
107
+
108
+ ## MCP tool layer
109
+ - Wrap each rule with `@mcp.tool()`. The **docstring IS the model-facing spec** — state inputs,
110
+ output shape, and the "no winner" condition precisely.
111
+ - **Strict JSON schema**: `additionalProperties: false`, fully typed fields, an enum for rule names.
112
+ - **stdio transport only. NO network calls. NO file writes. NO secrets in this package.**
113
+ (This keeps the server clean against the OWASP MCP Top 10 by construction — it's pure compute.)
114
+ - A tool is not "done" until it has been exercised in the MCP Inspector.
115
+
116
+ ## Bench harness — the proof
117
+ - **Claim to demonstrate**: principled aggregation > naive majority vote, measured, with CIs.
118
+ - **Ensemble** (bench/config/models.yaml) — one entry per member:
119
+ `{ name, base_url, api_key_env, model_id, weight }`. Default members (5 distinct labs):
120
+ - `gpt-4o-mini` (OpenAI)
121
+ - `gemini-2.5-flash-lite` (Google)
122
+ - `deepseek-v3` (DeepSeek, open-weight)
123
+ - `claude-haiku-4-5` (Anthropic)
124
+ - `glm-4.7` (Zhipu)
125
+ - **Routing**: ONE OpenAI-compatible client. Prefer a single gateway (OpenRouter) so every member
126
+ is just a `model_id`; per-provider `base_url` is the fallback. Each member is interchangeable
127
+ config, never hardcoded.
128
+ - **Per call**: ask for a final label AND (where the task allows) a confidence distribution over
129
+ the options — the opinion_pool rule needs the distribution. Cap `max_tokens`. Retry w/ backoff,
130
+ timeout, and record failures.
131
+ - **Caching (cost guardrail)**: write every raw response to `bench/results/raw/`. On re-run, if a
132
+ cached response exists, DO NOT call the API. This makes aggregation tweaks free — you only pay
133
+ again when prompts or models change.
134
+ - **compare.py**: for each rule + majority baseline, compute accuracy and a **bootstrap 95% CI**;
135
+ emit a markdown table and a single matplotlib bar chart with error bars.
136
+ - **Reproducibility**: fix seeds; record model IDs + dataset hash + timestamp in the results dir.
137
+ - **Cost discipline**: default `--limit 200`; print an estimated cost before any run that hits the API.
138
+ - **Dataset**: an openly downloadable MCQ reasoning set (MMLU subset / ARC-Challenge / GPQA-diamond).
139
+ Pick one that's freely fetchable; store as jsonl with `{id, question, options, answer}`.
140
+
141
+ ## Coding conventions
142
+ - Type hints everywhere; docstrings on public functions; small modules, no god-files.
143
+ - `ruff` clean, `mypy --strict` clean before any commit.
144
+ - Commit after each green `pytest` run, one logical change per commit.
145
+
146
+ ## Definition of done (phase gates)
147
+ 1. Scaffold builds, `uv sync` clean, CI green on an empty test.
148
+ 2. All rules implemented; `pytest` green INCLUDING every required edge case above.
149
+ 3. All tools exercised in the MCP Inspector; schemas strict.
150
+ 4. Bench runs end-to-end on `--limit 200`, produces table + plot, raw responses cached.
151
+ 5. README has install one-liner + eval table; `uv build` succeeds; the package is published; and
152
+ a `uvx mcp-scan` run against our own server reports clean.
153
+
154
+ ## Out of scope this weekend (leave hooks, don't build)
155
+ Web UI · remote HTTP/OAuth transport · exact Kemeny/Schulze · learned/accuracy-based agent weights.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: voting-mcp
3
+ Version: 0.1.0
4
+ Summary: MCP server exposing principled social-choice aggregation rules (Borda, Copeland, Condorcet, approval, STV, opinion pool), with a reproducible benchmark measuring their accuracy vs majority vote over an LLM ensemble.
5
+ Author-email: Hrishi Kabra <kabrahrishi@gmail.com>
6
+ License: MIT
7
+ Keywords: aggregation,ensemble,mcp,social-choice,voting
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.12
14
+ Requires-Dist: mcp[cli]>=1.2.0
15
+ Requires-Dist: pydantic>=2.6
16
+ Provides-Extra: bench
17
+ Requires-Dist: matplotlib>=3.8; extra == 'bench'
18
+ Requires-Dist: numpy>=1.26; extra == 'bench'
19
+ Requires-Dist: openai>=1.30; extra == 'bench'
20
+ Requires-Dist: python-dotenv>=1.0; extra == 'bench'
21
+ Requires-Dist: pyyaml>=6.0; extra == 'bench'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # voting-mcp
25
+
26
+ **Principled social-choice aggregation as MCP tools — with a benchmark that measures the
27
+ accuracy lift over naive majority vote.**
28
+
29
+ Almost every multi-agent system aggregates votes with `Counter(votes).most_common(1)`, throwing
30
+ away preference order and confidence. `voting-mcp` ships the real rules (Borda, Copeland,
31
+ Condorcet, approval, STV, linear opinion pool) as callable MCP tools — each with its known
32
+ axiomatic behavior and explicit, documented tie-breaking — plus a reproducible benchmark that
33
+ aggregates a diverse ensemble of LLMs on a reasoning set and reports accuracy with bootstrap
34
+ confidence intervals.
35
+
36
+ The server is **pure compute**: stdio transport, no network, no file writes, no secrets — clean
37
+ against the OWASP MCP Top 10 by construction.
38
+
39
+ ## Install
40
+
41
+ ```sh
42
+ # run the server directly (once published)
43
+ uvx voting-mcp
44
+
45
+ # or from source
46
+ git clone <repo> && cd voting-mcp
47
+ uv sync
48
+ uv run python -m voting_mcp.server
49
+ ```
50
+
51
+ Add it to an MCP client (e.g. Claude Desktop `claude_desktop_config.json`):
52
+
53
+ ```json
54
+ {
55
+ "mcpServers": {
56
+ "voting": { "command": "uvx", "args": ["voting-mcp"] }
57
+ }
58
+ }
59
+ ```
60
+
61
+ ## Tools
62
+
63
+ Every tool takes a `profile` (`{candidates, ballots}`) and returns a `Result` with the full
64
+ co-winner set (`winners`, so ties are never hidden), the single tie-broken `winner` (or `null`
65
+ when none exists), a `ranking`, per-candidate `scores`, and a `note`.
66
+
67
+ | Tool | Ballots | Notes |
68
+ |------|---------|-------|
69
+ | `borda` | rankings | positional; Condorcet-inconsistent, clone-sensitive |
70
+ | `copeland` | rankings | Condorcet-consistent pairwise (+1 win, +0.5 tie) |
71
+ | `condorcet` | rankings | returns the pairwise winner **or an explicit no-winner on a cycle** |
72
+ | `approval` | approval sets | most-approved wins |
73
+ | `stv` | rankings | single-winner instant-runoff; clone-resistant |
74
+ | `opinion_pool` | distributions | linear pool — **preserves confidence, not an argmax vote** |
75
+ | `plurality` | rankings | baseline (most first choices) |
76
+ | `majority` | rankings | strict >50% or **no winner** |
77
+ | `aggregate_rule` | any | dispatch by a `rule` enum |
78
+
79
+ Tie-breaking is an explicit parameter (`lexicographic` default, `none`, or seeded `random`).
80
+
81
+ ## Benchmark
82
+
83
+ Aggregate an ensemble of 5 models (one OpenAI-compatible client via OpenRouter) on
84
+ ARC-Challenge and compare each rule to the naive majority vote:
85
+
86
+ ```sh
87
+ uv sync --extra bench
88
+ uv run python -m bench.fetch_arc --limit 200
89
+ # prints a cost estimate and STOPS; add --yes to actually call the API, --mock for a free dry run
90
+ uv run python -m bench.run_ensemble --dataset bench/datasets/arc_challenge.jsonl --limit 200 --yes
91
+ uv run python -m bench.compare --dataset bench/datasets/arc_challenge.jsonl --limit 200
92
+ ```
93
+
94
+ Every raw response is cached under `bench/results/raw/`; re-runs never re-call the API, so
95
+ aggregation tweaks are free.
96
+
97
+ ### Results
98
+
99
+ 5-model ensemble (gpt-4o-mini · gemini-2.5-flash-lite · deepseek-v3 · claude-haiku-4.5 ·
100
+ glm-4.7), n = 200, bootstrap 95% CI. Two datasets of different difficulty; full write-up and
101
+ both plots in [`RESULTS.md`](RESULTS.md).
102
+
103
+ **MMLU-Pro (hard, baseline 73.5%) — the informative case:**
104
+
105
+ | Rule | Accuracy | 95% CI | Δ vs majority |
106
+ |------|---------:|:------:|--------------:|
107
+ | **opinion_pool** | **0.755** | [0.695, 0.815] | **+0.020** |
108
+ | **majority_vote (baseline)** | 0.735 | [0.679, 0.788] | — |
109
+ | approval | 0.701 | [0.640, 0.757] | −0.035 |
110
+ | stv | 0.693 | [0.630, 0.750] | −0.043 |
111
+ | copeland | 0.647 | [0.580, 0.710] | −0.088 |
112
+ | condorcet | 0.620 | [0.550, 0.685] | −0.115 |
113
+ | majority (strict) | 0.590 | [0.520, 0.655] | −0.145 |
114
+ | borda | 0.472 | [0.405, 0.540] | −0.263 |
115
+
116
+ ![MMLU-Pro](docs/accuracy_mmlu_pro.png)
117
+
118
+ **The finding (honest):** the value isn't "fancy voting beats majority." It's that **the
119
+ confidence-preserving rule (`opinion_pool`) wins** when the crowd is uncertain (+2.0pp, the only
120
+ rule above baseline — though its CI still overlaps the baseline's, so *suggestive, not conclusive*), while
121
+ **forcing the distributions into full rankings actively hurts** — `borda` collapses to 0.472,
122
+ far below majority, because with 10 options the tail of the ranking is mostly noise. Aggregate
123
+ the confidence; don't throw it away. On **ARC-Challenge** (baseline 96.8%, near-ceiling) nothing
124
+ separates — every rule lands within overlapping CIs. See [`RESULTS.md`](RESULTS.md).
125
+
126
+ ## Develop
127
+
128
+ ```sh
129
+ uv run pytest -q
130
+ uv run ruff check .
131
+ uv run mypy --strict src
132
+ # exercise the tools in the MCP Inspector:
133
+ npx @modelcontextprotocol/inspector uv run python -m voting_mcp.server
134
+ ```
135
+
136
+ > Note: if you keep this repo under an iCloud-synced folder (e.g. `~/Desktop`), iCloud can spawn
137
+ > duplicate `.pth` files that intermittently break the editable install. Tests use
138
+ > `pythonpath=src`; run the server with `PYTHONPATH=src` if an import fails, or move the repo
139
+ > off the synced folder.
140
+
141
+ ## License
142
+
143
+ MIT
@@ -0,0 +1,120 @@
1
+ # voting-mcp
2
+
3
+ **Principled social-choice aggregation as MCP tools — with a benchmark that measures the
4
+ accuracy lift over naive majority vote.**
5
+
6
+ Almost every multi-agent system aggregates votes with `Counter(votes).most_common(1)`, throwing
7
+ away preference order and confidence. `voting-mcp` ships the real rules (Borda, Copeland,
8
+ Condorcet, approval, STV, linear opinion pool) as callable MCP tools — each with its known
9
+ axiomatic behavior and explicit, documented tie-breaking — plus a reproducible benchmark that
10
+ aggregates a diverse ensemble of LLMs on a reasoning set and reports accuracy with bootstrap
11
+ confidence intervals.
12
+
13
+ The server is **pure compute**: stdio transport, no network, no file writes, no secrets — clean
14
+ against the OWASP MCP Top 10 by construction.
15
+
16
+ ## Install
17
+
18
+ ```sh
19
+ # run the server directly (once published)
20
+ uvx voting-mcp
21
+
22
+ # or from source
23
+ git clone <repo> && cd voting-mcp
24
+ uv sync
25
+ uv run python -m voting_mcp.server
26
+ ```
27
+
28
+ Add it to an MCP client (e.g. Claude Desktop `claude_desktop_config.json`):
29
+
30
+ ```json
31
+ {
32
+ "mcpServers": {
33
+ "voting": { "command": "uvx", "args": ["voting-mcp"] }
34
+ }
35
+ }
36
+ ```
37
+
38
+ ## Tools
39
+
40
+ Every tool takes a `profile` (`{candidates, ballots}`) and returns a `Result` with the full
41
+ co-winner set (`winners`, so ties are never hidden), the single tie-broken `winner` (or `null`
42
+ when none exists), a `ranking`, per-candidate `scores`, and a `note`.
43
+
44
+ | Tool | Ballots | Notes |
45
+ |------|---------|-------|
46
+ | `borda` | rankings | positional; Condorcet-inconsistent, clone-sensitive |
47
+ | `copeland` | rankings | Condorcet-consistent pairwise (+1 win, +0.5 tie) |
48
+ | `condorcet` | rankings | returns the pairwise winner **or an explicit no-winner on a cycle** |
49
+ | `approval` | approval sets | most-approved wins |
50
+ | `stv` | rankings | single-winner instant-runoff; clone-resistant |
51
+ | `opinion_pool` | distributions | linear pool — **preserves confidence, not an argmax vote** |
52
+ | `plurality` | rankings | baseline (most first choices) |
53
+ | `majority` | rankings | strict >50% or **no winner** |
54
+ | `aggregate_rule` | any | dispatch by a `rule` enum |
55
+
56
+ Tie-breaking is an explicit parameter (`lexicographic` default, `none`, or seeded `random`).
57
+
58
+ ## Benchmark
59
+
60
+ Aggregate an ensemble of 5 models (one OpenAI-compatible client via OpenRouter) on
61
+ ARC-Challenge and compare each rule to the naive majority vote:
62
+
63
+ ```sh
64
+ uv sync --extra bench
65
+ uv run python -m bench.fetch_arc --limit 200
66
+ # prints a cost estimate and STOPS; add --yes to actually call the API, --mock for a free dry run
67
+ uv run python -m bench.run_ensemble --dataset bench/datasets/arc_challenge.jsonl --limit 200 --yes
68
+ uv run python -m bench.compare --dataset bench/datasets/arc_challenge.jsonl --limit 200
69
+ ```
70
+
71
+ Every raw response is cached under `bench/results/raw/`; re-runs never re-call the API, so
72
+ aggregation tweaks are free.
73
+
74
+ ### Results
75
+
76
+ 5-model ensemble (gpt-4o-mini · gemini-2.5-flash-lite · deepseek-v3 · claude-haiku-4.5 ·
77
+ glm-4.7), n = 200, bootstrap 95% CI. Two datasets of different difficulty; full write-up and
78
+ both plots in [`RESULTS.md`](RESULTS.md).
79
+
80
+ **MMLU-Pro (hard, baseline 73.5%) — the informative case:**
81
+
82
+ | Rule | Accuracy | 95% CI | Δ vs majority |
83
+ |------|---------:|:------:|--------------:|
84
+ | **opinion_pool** | **0.755** | [0.695, 0.815] | **+0.020** |
85
+ | **majority_vote (baseline)** | 0.735 | [0.679, 0.788] | — |
86
+ | approval | 0.701 | [0.640, 0.757] | −0.035 |
87
+ | stv | 0.693 | [0.630, 0.750] | −0.043 |
88
+ | copeland | 0.647 | [0.580, 0.710] | −0.088 |
89
+ | condorcet | 0.620 | [0.550, 0.685] | −0.115 |
90
+ | majority (strict) | 0.590 | [0.520, 0.655] | −0.145 |
91
+ | borda | 0.472 | [0.405, 0.540] | −0.263 |
92
+
93
+ ![MMLU-Pro](docs/accuracy_mmlu_pro.png)
94
+
95
+ **The finding (honest):** the value isn't "fancy voting beats majority." It's that **the
96
+ confidence-preserving rule (`opinion_pool`) wins** when the crowd is uncertain (+2.0pp, the only
97
+ rule above baseline — though its CI still overlaps the baseline's, so *suggestive, not conclusive*), while
98
+ **forcing the distributions into full rankings actively hurts** — `borda` collapses to 0.472,
99
+ far below majority, because with 10 options the tail of the ranking is mostly noise. Aggregate
100
+ the confidence; don't throw it away. On **ARC-Challenge** (baseline 96.8%, near-ceiling) nothing
101
+ separates — every rule lands within overlapping CIs. See [`RESULTS.md`](RESULTS.md).
102
+
103
+ ## Develop
104
+
105
+ ```sh
106
+ uv run pytest -q
107
+ uv run ruff check .
108
+ uv run mypy --strict src
109
+ # exercise the tools in the MCP Inspector:
110
+ npx @modelcontextprotocol/inspector uv run python -m voting_mcp.server
111
+ ```
112
+
113
+ > Note: if you keep this repo under an iCloud-synced folder (e.g. `~/Desktop`), iCloud can spawn
114
+ > duplicate `.pth` files that intermittently break the editable install. Tests use
115
+ > `pythonpath=src`; run the server with `PYTHONPATH=src` if an import fails, or move the repo
116
+ > off the synced folder.
117
+
118
+ ## License
119
+
120
+ MIT
@@ -0,0 +1,91 @@
1
+ # Benchmark results
2
+
3
+ **Ensemble:** 5 models via OpenRouter, equal weight —
4
+ `gpt-4o-mini`, `gemini-2.5-flash-lite`, `deepseek-v3`, `claude-haiku-4.5`, `glm-4.7`.
5
+ **Stats:** accuracy with a percentile bootstrap 95% CI (5,000 resamples, seed 0). Ties are
6
+ scored unbiasedly (a k-way tie containing the gold answer scores 1/k). Each member returns a
7
+ label + a confidence distribution; rules consume the ballot kind they need (rankings derived
8
+ from the distribution, approval set thresholded at ≥ 1/#options, distribution verbatim for the
9
+ opinion pool). Run date: 2026-06-30.
10
+
11
+ Two datasets, deliberately chosen for different difficulty:
12
+
13
+ ---
14
+
15
+ ## 1. ARC-Challenge (easy — n=200, baseline 96.8%)
16
+
17
+ 992/1000 responses parsed; every question keeps ≥ 4 ballots.
18
+
19
+ | Rule | Accuracy | 95% CI | Δ vs majority |
20
+ |------|---------:|:------:|--------------:|
21
+ | borda / copeland | 0.975 | [0.950, 0.995] | +0.007 |
22
+ | stv | 0.973 | [0.948, 0.993] | +0.005 |
23
+ | condorcet | 0.970 | [0.945, 0.990] | +0.002 |
24
+ | **majority_vote (baseline)** | 0.968 | [0.943, 0.988] | — |
25
+ | opinion_pool | 0.965 | [0.935, 0.990] | −0.003 |
26
+ | approval | 0.964 | [0.937, 0.987] | −0.003 |
27
+ | majority (strict) | 0.960 | [0.930, 0.985] | −0.008 |
28
+
29
+ ![ARC-Challenge](docs/accuracy_arc.png)
30
+
31
+ **Nothing separates here.** The ensemble is near-ceiling (96.8% baseline), so there is no
32
+ headroom — every rule lands within overlapping CIs. When the crowd is almost always right, how
33
+ you count the votes barely matters.
34
+
35
+ ---
36
+
37
+ ## 2. MMLU-Pro (hard — n=200, baseline 73.5%, up to 10 options)
38
+
39
+ This is the informative setting: lower ceiling, real disagreement. 199/200 questions keep ≥ 3
40
+ ballots, 192/200 keep ≥ 4. (`gemini-2.5-flash-lite` returned non-JSON on ~25% of these hard
41
+ 10-option items even with a large token budget — a real model-compliance characteristic; those
42
+ responses abstain rather than corrupt a ballot.)
43
+
44
+ | Rule | Accuracy | 95% CI | Δ vs majority |
45
+ |------|---------:|:------:|--------------:|
46
+ | **opinion_pool** | **0.755** | [0.695, 0.815] | **+0.020** |
47
+ | **majority_vote (baseline)** | 0.735 | [0.679, 0.788] | — |
48
+ | approval | 0.701 | [0.640, 0.757] | −0.035 |
49
+ | stv | 0.693 | [0.630, 0.750] | −0.043 |
50
+ | copeland | 0.647 | [0.580, 0.710] | −0.088 |
51
+ | condorcet | 0.620 | [0.550, 0.685] | −0.115 |
52
+ | majority (strict) | 0.590 | [0.520, 0.655] | −0.145 |
53
+ | borda | 0.472 | [0.405, 0.540] | −0.263 |
54
+
55
+ ![MMLU-Pro](docs/accuracy_mmlu_pro.png)
56
+
57
+ ## The actual finding
58
+
59
+ The headline isn't "fancy voting beats majority." It's sharper and more useful:
60
+
61
+ 1. **The confidence-preserving rule wins.** `opinion_pool` (a linear pool of the models'
62
+ probability distributions) is the only rule above majority vote, by +2.0 points. When the
63
+ crowd is genuinely uncertain, *keeping* the confidence and averaging it beats discarding it
64
+ for a one-vote-per-model count. This is exactly the rule CLAUDE.md flagged as "preserves
65
+ confidence, NOT argmax" — and it's the one that pays off.
66
+ 2. **Forcing rankings actively hurts.** The positional/pairwise rules degrade, and `borda`
67
+ collapses to 0.472 — far *below* majority. With up to 10 options, each model is confident
68
+ about its top pick and near-noise over the other nine; turning that into a full ranking and
69
+ then weighting all ten positions (Borda) or all pairwise contests (Condorcet/Copeland) feeds
70
+ the aggregator mostly noise. The signal lives in the top pick and the confidence, not in the
71
+ tail order.
72
+ 3. **Honesty on significance.** opinion_pool's +2.0 points is the only positive margin and directionally
73
+ consistent, but its CI still overlaps the baseline's — call it *suggestive, not conclusive*
74
+ at n=200. The *negative* results (ranking rules hurting) are much larger; for `borda` and strict `majority` the CIs do not overlap the baseline's at all.
75
+
76
+ So the measured, honest takeaway: **on hard problems, aggregate the confidence (opinion_pool);
77
+ do not throw it away by forcing noisy full rankings.** On easy problems it doesn't matter.
78
+
79
+ ## Reproduce
80
+
81
+ ```sh
82
+ uv sync --extra bench
83
+ echo "OPENROUTER_API_KEY=sk-or-..." > .env
84
+ # pick a dataset:
85
+ uv run python -m bench.fetch_arc --limit 200 # easy
86
+ uv run python -m bench.fetch_mmlu_pro --limit 200 # hard
87
+ uv run python -m bench.run_ensemble --dataset bench/datasets/mmlu_pro.jsonl --limit 200 --yes --max-tokens 1024
88
+ uv run python -m bench.compare --dataset bench/datasets/mmlu_pro.jsonl --limit 200 --n-boot 5000
89
+ ```
90
+
91
+ Every raw response is cached, so re-scoring under different rules costs nothing.
@@ -0,0 +1 @@
1
+ """Benchmark harness: prove principled aggregation beats naive majority vote."""