prometheus-ebm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prometheus_ebm-0.1.0/LICENSE +21 -0
- prometheus_ebm-0.1.0/PKG-INFO +338 -0
- prometheus_ebm-0.1.0/README.md +307 -0
- prometheus_ebm-0.1.0/prometheus_ebm/__init__.py +47 -0
- prometheus_ebm-0.1.0/prometheus_ebm/config.py +111 -0
- prometheus_ebm-0.1.0/prometheus_ebm/providers/__init__.py +1 -0
- prometheus_ebm-0.1.0/prometheus_ebm/providers/anthropic.py +40 -0
- prometheus_ebm-0.1.0/prometheus_ebm/providers/kaggle.py +39 -0
- prometheus_ebm-0.1.0/prometheus_ebm/providers/openai.py +42 -0
- prometheus_ebm-0.1.0/prometheus_ebm/providers/openrouter.py +47 -0
- prometheus_ebm-0.1.0/prometheus_ebm/runner.py +179 -0
- prometheus_ebm-0.1.0/prometheus_ebm/scorer.py +236 -0
- prometheus_ebm-0.1.0/prometheus_ebm/taxonomy.py +81 -0
- prometheus_ebm-0.1.0/prometheus_ebm.egg-info/PKG-INFO +338 -0
- prometheus_ebm-0.1.0/prometheus_ebm.egg-info/SOURCES.txt +19 -0
- prometheus_ebm-0.1.0/prometheus_ebm.egg-info/dependency_links.txt +1 -0
- prometheus_ebm-0.1.0/prometheus_ebm.egg-info/requires.txt +15 -0
- prometheus_ebm-0.1.0/prometheus_ebm.egg-info/top_level.txt +1 -0
- prometheus_ebm-0.1.0/pyproject.toml +41 -0
- prometheus_ebm-0.1.0/setup.cfg +4 -0
- prometheus_ebm-0.1.0/tests/test_scorer.py +163 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mushfiqul Alam
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: prometheus-ebm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PROMETHEUS-EBM: Benchmarking Epistemic Metacognition in AI Models
|
|
5
|
+
Author-email: Mushfiqul Alam <mushfiqulalam007@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Mushfiqul-Alam-17/prometheus-ebm-sdk
|
|
8
|
+
Project-URL: Repository, https://github.com/Mushfiqul-Alam-17/prometheus-ebm-sdk
|
|
9
|
+
Keywords: ai,benchmark,metacognition,calibration,epistemic
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: numpy>=1.24
|
|
19
|
+
Requires-Dist: pandas>=2.0
|
|
20
|
+
Requires-Dist: scipy>=1.10
|
|
21
|
+
Requires-Dist: matplotlib>=3.7
|
|
22
|
+
Requires-Dist: requests>=2.28
|
|
23
|
+
Provides-Extra: anthropic
|
|
24
|
+
Requires-Dist: anthropic>=0.25; extra == "anthropic"
|
|
25
|
+
Provides-Extra: openai
|
|
26
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
27
|
+
Provides-Extra: all
|
|
28
|
+
Requires-Dist: anthropic>=0.25; extra == "all"
|
|
29
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# PROMETHEUS-EBM SDK
|
|
33
|
+
|
|
34
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://python.org)
|
|
35
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
|
|
36
|
+
[]()
|
|
37
|
+
|
|
38
|
+
**Benchmarking Epistemic Metacognition in AI Models**
|
|
39
|
+
|
|
40
|
+
PROMETHEUS-EBM evaluates whether frontier AI models can recognize the *limits of their own knowledge* — not just answer questions, but understand when a question is unanswerable, ambiguous, or self-contradictory.
|
|
41
|
+
|
|
42
|
+
> **Companion to the Kaggle notebook:** [PROMETHEUS-EBM v5.0](https://www.kaggle.com/code/mushfiqulalam007/final-bm-v4) — The full benchmark with live results from 5 frontier models.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Why This Exists
|
|
47
|
+
|
|
48
|
+
Current benchmarks (MMLU, GPQA, HumanEval) test **what a model knows**.
|
|
49
|
+
PROMETHEUS-EBM tests **whether a model knows what it does not know**.
|
|
50
|
+
|
|
51
|
+
This is a critical safety property. A model deployed in medicine, law, or finance that confidently answers when it *should* refuse is more dangerous than one that gets fewer questions right but knows its boundaries.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## The 4-Class Solvability Taxonomy
|
|
56
|
+
|
|
57
|
+
Every problem is classified into one of four epistemic categories:
|
|
58
|
+
|
|
59
|
+
| Class | Description | Expected Model Behavior |
|
|
60
|
+
|-------|-------------|------------------------|
|
|
61
|
+
| **Determinate** | One clear answer exists | Answer confidently |
|
|
62
|
+
| **Underdetermined** | Multiple valid interpretations | Flag the ambiguity |
|
|
63
|
+
| **Insufficient** | Critical information is missing | Refuse to answer definitively |
|
|
64
|
+
| **Contradictory** | The premises conflict | Detect the contradiction |
|
|
65
|
+
|
|
66
|
+
Models are scored on whether they correctly identify *which category* a problem falls into — not just whether they produce the correct final answer.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Scoring Framework
|
|
71
|
+
|
|
72
|
+
| Metric | Range | What It Measures |
|
|
73
|
+
|--------|-------|-----------------|
|
|
74
|
+
| **ECI** (Epistemological Calibration Index) | 0–1 | Composite metacognition score |
|
|
75
|
+
| **SDA** (Solvability Detection Accuracy) | 0–1 | Can the model classify the problem type? |
|
|
76
|
+
| **CA** (Conditional Accuracy) | 0–1 | When it commits to an answer, is it correct? |
|
|
77
|
+
| **RP** (Refusal Precision) | 0–1 | When it refuses, was refusal appropriate? |
|
|
78
|
+
| **ECE** (Expected Calibration Error) | 0–1 | Does stated confidence match actual accuracy? |
|
|
79
|
+
| **HGI** (Hysteresis Gap Index) | 0–1 | Internal inconsistency (lower = better) |
|
|
80
|
+
| **Brier Score** | 0–1 | Calibration quality decomposed into Reliability, Resolution, Uncertainty |
|
|
81
|
+
| **Type-2 D-Prime** | -∞ to +∞ | How well the model's confidence signal distinguishes correct from incorrect answers |
|
|
82
|
+
|
|
83
|
+
### ECI Composition
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
ECI = 0.30 × SDA + 0.25 × CA + 0.20 × RP + 0.15 × (1 - ECE) + 0.10 × (1 - HSS)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Installation
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install prometheus-ebm
|
|
95
|
+
|
|
96
|
+
# With specific provider support:
|
|
97
|
+
pip install "prometheus-ebm[anthropic]" # For Claude API
|
|
98
|
+
pip install "prometheus-ebm[openai]" # For OpenAI API
|
|
99
|
+
pip install "prometheus-ebm[all]" # All providers
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Quick Start
|
|
105
|
+
|
|
106
|
+
### Compare Multiple Models
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from prometheus_ebm import PrometheusRunner, RunConfig
|
|
110
|
+
|
|
111
|
+
config = RunConfig(
|
|
112
|
+
mode="compare",
|
|
113
|
+
models=[
|
|
114
|
+
"anthropic/claude-opus-4-6@default",
|
|
115
|
+
"anthropic/claude-sonnet-4-6@default",
|
|
116
|
+
"google/gemini-3.1-pro-preview",
|
|
117
|
+
"deepseek-ai/deepseek-v3.2",
|
|
118
|
+
"deepseek-ai/deepseek-r1-0528",
|
|
119
|
+
],
|
|
120
|
+
provider="kaggle", # No API key needed
|
|
121
|
+
n_items=200, # Standard dataset (200 base problems)
|
|
122
|
+
stress_decision_ratio=0.40, # EXTENDED mode stress
|
|
123
|
+
stress_clarity_ratio=0.20,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
runner = PrometheusRunner(config)
|
|
127
|
+
results = runner.run()
|
|
128
|
+
results.export("comparison.csv")
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Deep Probe a Single Model (1,000 Items)
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
config = RunConfig(
|
|
135
|
+
mode="deep_probe",
|
|
136
|
+
models=["anthropic/claude-opus-4-6"],
|
|
137
|
+
provider="anthropic",
|
|
138
|
+
api_key="sk-ant-...",
|
|
139
|
+
n_items=1000,
|
|
140
|
+
stress_decision_ratio=0.30,
|
|
141
|
+
bootstrap_iterations=3000,
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
runner = PrometheusRunner(config)
|
|
145
|
+
results = runner.run()
|
|
146
|
+
results.export("opus_deep_probe.csv")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Use with OpenRouter (Access 100+ Models)
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
config = RunConfig(
|
|
153
|
+
mode="compare",
|
|
154
|
+
models=["anthropic/claude-opus-4-6", "google/gemini-3.1-pro"],
|
|
155
|
+
provider="openrouter",
|
|
156
|
+
api_key="sk-or-...",
|
|
157
|
+
)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Use with OpenAI
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
config = RunConfig(
|
|
164
|
+
mode="deep_probe",
|
|
165
|
+
models=["gpt-5.4"],
|
|
166
|
+
provider="openai",
|
|
167
|
+
api_key="sk-...",
|
|
168
|
+
n_items=200,
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Scoring Only (Bring Your Own Data)
|
|
173
|
+
|
|
174
|
+
If you already have model responses and just need the ECI/Brier/D-Prime scores:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from prometheus_ebm import ECIScorer, BrierDecomposition, Type2DPrime
|
|
178
|
+
|
|
179
|
+
scorer = ECIScorer()
|
|
180
|
+
|
|
181
|
+
# Compute individual components
|
|
182
|
+
sda = ECIScorer.compute_sda(predicted_classes, true_classes)
|
|
183
|
+
ca = ECIScorer.compute_ca(answers_correct, true_classes)
|
|
184
|
+
rp = ECIScorer.compute_rp(predicted_classes, true_classes)
|
|
185
|
+
ece = ECIScorer.compute_ece(confidences, correctness)
|
|
186
|
+
hss = ECIScorer.compute_hss(predicted_classes, true_classes, answers_given)
|
|
187
|
+
|
|
188
|
+
eci = scorer.compute_eci(sda, ca, rp, ece, hss)
|
|
189
|
+
|
|
190
|
+
# Brier decomposition
|
|
191
|
+
brier = BrierDecomposition.compute(confidences, correctness)
|
|
192
|
+
# → {'brier': 0.18, 'reliability': 0.03, 'resolution': 0.09, 'uncertainty': 0.24}
|
|
193
|
+
|
|
194
|
+
# D-Prime (metacognitive discrimination)
|
|
195
|
+
dprime = Type2DPrime.compute(confidences, correctness, threshold=0.7)
|
|
196
|
+
# → {'d_prime': 1.24, 'hit_rate': 0.85, 'false_alarm_rate': 0.42}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Supported Providers
|
|
202
|
+
|
|
203
|
+
| Provider | API Key Required | Models Available | Best For |
|
|
204
|
+
|----------|:---:|--------|----------|
|
|
205
|
+
| `kaggle` | No | 26 (Kaggle model pool) | Running inside Kaggle notebooks |
|
|
206
|
+
| `openrouter` | Yes | 100+ | Broadest model access with one key |
|
|
207
|
+
| `anthropic` | Yes | Claude family | Direct Anthropic API access |
|
|
208
|
+
| `openai` | Yes | GPT family | Direct OpenAI API access |
|
|
209
|
+
|
|
210
|
+
**Default behavior:** If no API key is provided, the SDK falls back to the Kaggle provider (which requires no authentication when running inside a Kaggle notebook).
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Configuration Reference
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
RunConfig(
|
|
218
|
+
# ── Mode ──
|
|
219
|
+
mode="compare", # "compare" (multi-model) or "deep_probe" (single-model)
|
|
220
|
+
models=[...], # List of model identifiers
|
|
221
|
+
|
|
222
|
+
# ── Provider ──
|
|
223
|
+
provider="kaggle", # "kaggle", "openrouter", "anthropic", "openai"
|
|
224
|
+
api_key=None, # Required for non-Kaggle providers
|
|
225
|
+
api_base_url=None, # Custom API endpoint (for self-hosted models)
|
|
226
|
+
|
|
227
|
+
# ── Dataset ──
|
|
228
|
+
n_items=200, # Base problem count (200 standard, 1000 for deep probe)
|
|
229
|
+
dataset_path=None, # Path to custom dataset JSON (or None for bundled)
|
|
230
|
+
stress_decision_ratio=0.25, # Fraction with decision-pressure variants
|
|
231
|
+
stress_clarity_ratio=0.10, # Fraction with reduced-clarity variants
|
|
232
|
+
|
|
233
|
+
# ── Statistical ──
|
|
234
|
+
seeds=["s1", "s2"], # Reproducibility seeds for bootstrap
|
|
235
|
+
bootstrap_iterations=2000, # Bootstrap iterations for CIs
|
|
236
|
+
|
|
237
|
+
# ── Time Budget ──
|
|
238
|
+
timeout_per_model=10800, # Max seconds per model (default: 3h)
|
|
239
|
+
total_time_budget=43200, # Total budget (default: 12h)
|
|
240
|
+
time_reserve=3600, # Reserved for analysis (default: 1h)
|
|
241
|
+
|
|
242
|
+
# ── Checkpointing ──
|
|
243
|
+
checkpoint_dir="prometheus_checkpoints",
|
|
244
|
+
resume_from_checkpoint=True,
|
|
245
|
+
|
|
246
|
+
# ── Output ──
|
|
247
|
+
output_dir="prometheus_output",
|
|
248
|
+
|
|
249
|
+
# ── Feature Flags ──
|
|
250
|
+
run_probes=True, # Epoch-2 adversarial probes
|
|
251
|
+
run_multistage=True, # Multi-stage belief revision protocol
|
|
252
|
+
run_statistics=True, # Bootstrap CIs and significance tests
|
|
253
|
+
verbose=True, # Print progress
|
|
254
|
+
)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## V5 Benchmark Results
|
|
260
|
+
|
|
261
|
+
Results from the PROMETHEUS-EBM v5.0 EXTENDED run (5 models × 324 items × 3 seeds):
|
|
262
|
+
|
|
263
|
+
### Epoch-1 Leaderboard
|
|
264
|
+
|
|
265
|
+
| Rank | Model | ECI | 95% CI | SDA |
|
|
266
|
+
|:---:|-------|:---:|--------|:---:|
|
|
267
|
+
| 🥇 | Claude Sonnet 4.6 | **0.884** | [0.878, 0.888] | 85.4% |
|
|
268
|
+
| 🥈 | Claude Opus 4.6 | 0.869 | [0.864, 0.877] | 84.3% |
|
|
269
|
+
| 🥉 | DeepSeek V3.2 | 0.815 | [0.800, 0.829] | 76.5% |
|
|
270
|
+
| 4 | DeepSeek R1-0528 | 0.785 | [0.774, 0.792] | 68.6% |
|
|
271
|
+
| 5 | Gemini 3.1 Pro | 0.767 | [0.745, 0.787] | 73.1% |
|
|
272
|
+
|
|
273
|
+
### Key Findings
|
|
274
|
+
|
|
275
|
+
1. **Sonnet beats Opus on ECI** (0.884 vs 0.869, statistically significant). The mid-tier model has better epistemic calibration than the top-tier model. Metacognition is not monotonic with scale.
|
|
276
|
+
|
|
277
|
+
2. **Opus leads on adversarial resilience.** Under the multi-stage protocol, Opus improved its accuracy by +13.9% after being challenged with counter-arguments. It correctly revised wrong answers without abandoning right ones.
|
|
278
|
+
|
|
279
|
+
3. **DeepSeek R1 classifies problems differently.** R1's solvability detection (SDA = 68.6%) diverges from all other models, and when acting as a judge its evaluations disagreed with its peers on 16–20% of items. Chain-of-thought reasoning does not inherently improve metacognition.
|
|
280
|
+
|
|
281
|
+
4. **Gemini 3.1 Pro is the most overconfident.** Its stated confidence exceeds actual accuracy by 33 percentage points — the largest gap in the benchmark.
|
|
282
|
+
|
|
283
|
+
---
|
|
284
|
+
|
|
285
|
+
## Project Structure
|
|
286
|
+
|
|
287
|
+
```
|
|
288
|
+
prometheus-ebm-sdk/
|
|
289
|
+
├── prometheus_ebm/
|
|
290
|
+
│ ├── __init__.py # Public API exports
|
|
291
|
+
│ ├── config.py # RunConfig dataclass
|
|
292
|
+
│ ├── taxonomy.py # 4-class solvability taxonomy
|
|
293
|
+
│ ├── scorer.py # ECI, HGI, Brier, D-Prime
|
|
294
|
+
│ ├── runner.py # Benchmark orchestrator
|
|
295
|
+
│ ├── data/ # Bundled dataset (200 problems)
|
|
296
|
+
│ └── providers/
|
|
297
|
+
│ ├── kaggle.py # Kaggle kbench adapter
|
|
298
|
+
│ ├── openrouter.py # OpenRouter API adapter
|
|
299
|
+
│ ├── anthropic.py # Anthropic Claude adapter
|
|
300
|
+
│ └── openai.py # OpenAI adapter
|
|
301
|
+
├── tests/
|
|
302
|
+
│ └── test_scorer.py # Unit tests for scoring engine
|
|
303
|
+
├── examples/
|
|
304
|
+
│ ├── compare_5_models.py # Multi-model comparison example
|
|
305
|
+
│ └── deep_probe_opus.py # Single-model deep probe example
|
|
306
|
+
├── pyproject.toml # Package configuration
|
|
307
|
+
└── LICENSE
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
---
|
|
311
|
+
|
|
312
|
+
## Roadmap
|
|
313
|
+
|
|
314
|
+
| Version | Status | Features |
|
|
315
|
+
|---------|--------|----------|
|
|
316
|
+
| **v0.1.0** | ✅ Current | Scorer (ECI, Brier, D-Prime), Taxonomy, Config, Provider adapters |
|
|
317
|
+
| **v0.2.0** | Planned | Full evaluation loop, stress augmentation engine, export pipeline |
|
|
318
|
+
| **v0.3.0** | Planned | Bootstrap CI, pairwise significance, contamination audit |
|
|
319
|
+
| **v1.0.0** | Planned | 1,000-item dataset, CLI tool, HTML report generator |
|
|
320
|
+
|
|
321
|
+
---
|
|
322
|
+
|
|
323
|
+
## License
|
|
324
|
+
|
|
325
|
+
MIT — See [LICENSE](LICENSE) for details.
|
|
326
|
+
|
|
327
|
+
---
|
|
328
|
+
|
|
329
|
+
## Citation
|
|
330
|
+
|
|
331
|
+
```bibtex
|
|
332
|
+
@misc{alam2026prometheus,
|
|
333
|
+
title = {PROMETHEUS-EBM: Benchmarking Epistemic Metacognition in Frontier AI Models},
|
|
334
|
+
author = {Mushfiqul Alam},
|
|
335
|
+
year = {2026},
|
|
336
|
+
url = {https://github.com/Mushfiqul-Alam-17/prometheus-ebm-sdk}
|
|
337
|
+
}
|
|
338
|
+
```
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# PROMETHEUS-EBM SDK
|
|
2
|
+
|
|
3
|
+
[![Python 3.9+](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://python.org)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
|
|
5
|
+
[]()
|
|
6
|
+
|
|
7
|
+
**Benchmarking Epistemic Metacognition in AI Models**
|
|
8
|
+
|
|
9
|
+
PROMETHEUS-EBM evaluates whether frontier AI models can recognize the *limits of their own knowledge* — not just answer questions, but understand when a question is unanswerable, ambiguous, or self-contradictory.
|
|
10
|
+
|
|
11
|
+
> **Companion to the Kaggle notebook:** [PROMETHEUS-EBM v5.0](https://www.kaggle.com/code/mushfiqulalam007/final-bm-v4) — The full benchmark with live results from 5 frontier models.
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Why This Exists
|
|
16
|
+
|
|
17
|
+
Current benchmarks (MMLU, GPQA, HumanEval) test **what a model knows**.
|
|
18
|
+
PROMETHEUS-EBM tests **whether a model knows what it does not know**.
|
|
19
|
+
|
|
20
|
+
This is a critical safety property. A model deployed in medicine, law, or finance that confidently answers when it *should* refuse is more dangerous than one that gets fewer questions right but knows its boundaries.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## The 4-Class Solvability Taxonomy
|
|
25
|
+
|
|
26
|
+
Every problem is classified into one of four epistemic categories:
|
|
27
|
+
|
|
28
|
+
| Class | Description | Expected Model Behavior |
|
|
29
|
+
|-------|-------------|------------------------|
|
|
30
|
+
| **Determinate** | One clear answer exists | Answer confidently |
|
|
31
|
+
| **Underdetermined** | Multiple valid interpretations | Flag the ambiguity |
|
|
32
|
+
| **Insufficient** | Critical information is missing | Refuse to answer definitively |
|
|
33
|
+
| **Contradictory** | The premises conflict | Detect the contradiction |
|
|
34
|
+
|
|
35
|
+
Models are scored on whether they correctly identify *which category* a problem falls into — not just whether they produce the correct final answer.
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Scoring Framework
|
|
40
|
+
|
|
41
|
+
| Metric | Range | What It Measures |
|
|
42
|
+
|--------|-------|-----------------|
|
|
43
|
+
| **ECI** (Epistemological Calibration Index) | 0–1 | Composite metacognition score |
|
|
44
|
+
| **SDA** (Solvability Detection Accuracy) | 0–1 | Can the model classify the problem type? |
|
|
45
|
+
| **CA** (Conditional Accuracy) | 0–1 | When it commits to an answer, is it correct? |
|
|
46
|
+
| **RP** (Refusal Precision) | 0–1 | When it refuses, was refusal appropriate? |
|
|
47
|
+
| **ECE** (Expected Calibration Error) | 0–1 | Does stated confidence match actual accuracy? |
|
|
48
|
+
| **HGI** (Hysteresis Gap Index) | 0–1 | Internal inconsistency (lower = better) |
|
|
49
|
+
| **Brier Score** | 0–1 | Calibration quality decomposed into Reliability, Resolution, Uncertainty |
|
|
50
|
+
| **Type-2 D-Prime** | -∞ to +∞ | How well the model's confidence signal distinguishes correct from incorrect answers |
|
|
51
|
+
|
|
52
|
+
### ECI Composition
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
ECI = 0.30 × SDA + 0.25 × CA + 0.20 × RP + 0.15 × (1 - ECE) + 0.10 × (1 - HSS)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install prometheus-ebm
|
|
64
|
+
|
|
65
|
+
# With specific provider support:
|
|
66
|
+
pip install "prometheus-ebm[anthropic]" # For Claude API
|
|
67
|
+
pip install "prometheus-ebm[openai]" # For OpenAI API
|
|
68
|
+
pip install "prometheus-ebm[all]" # All providers
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
### Compare Multiple Models
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from prometheus_ebm import PrometheusRunner, RunConfig
|
|
79
|
+
|
|
80
|
+
config = RunConfig(
|
|
81
|
+
mode="compare",
|
|
82
|
+
models=[
|
|
83
|
+
"anthropic/claude-opus-4-6@default",
|
|
84
|
+
"anthropic/claude-sonnet-4-6@default",
|
|
85
|
+
"google/gemini-3.1-pro-preview",
|
|
86
|
+
"deepseek-ai/deepseek-v3.2",
|
|
87
|
+
"deepseek-ai/deepseek-r1-0528",
|
|
88
|
+
],
|
|
89
|
+
provider="kaggle", # No API key needed
|
|
90
|
+
n_items=200, # Standard dataset (200 base problems)
|
|
91
|
+
stress_decision_ratio=0.40, # EXTENDED mode stress
|
|
92
|
+
stress_clarity_ratio=0.20,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
runner = PrometheusRunner(config)
|
|
96
|
+
results = runner.run()
|
|
97
|
+
results.export("comparison.csv")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Deep Probe a Single Model (1,000 Items)
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
config = RunConfig(
|
|
104
|
+
mode="deep_probe",
|
|
105
|
+
models=["anthropic/claude-opus-4-6"],
|
|
106
|
+
provider="anthropic",
|
|
107
|
+
api_key="sk-ant-...",
|
|
108
|
+
n_items=1000,
|
|
109
|
+
stress_decision_ratio=0.30,
|
|
110
|
+
bootstrap_iterations=3000,
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
runner = PrometheusRunner(config)
|
|
114
|
+
results = runner.run()
|
|
115
|
+
results.export("opus_deep_probe.csv")
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Use with OpenRouter (Access 100+ Models)
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
config = RunConfig(
|
|
122
|
+
mode="compare",
|
|
123
|
+
models=["anthropic/claude-opus-4-6", "google/gemini-3.1-pro"],
|
|
124
|
+
provider="openrouter",
|
|
125
|
+
api_key="sk-or-...",
|
|
126
|
+
)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Use with OpenAI
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
config = RunConfig(
|
|
133
|
+
mode="deep_probe",
|
|
134
|
+
models=["gpt-5.4"],
|
|
135
|
+
provider="openai",
|
|
136
|
+
api_key="sk-...",
|
|
137
|
+
n_items=200,
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Scoring Only (Bring Your Own Data)
|
|
142
|
+
|
|
143
|
+
If you already have model responses and just need the ECI/Brier/D-Prime scores:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from prometheus_ebm import ECIScorer, BrierDecomposition, Type2DPrime
|
|
147
|
+
|
|
148
|
+
scorer = ECIScorer()
|
|
149
|
+
|
|
150
|
+
# Compute individual components
|
|
151
|
+
sda = ECIScorer.compute_sda(predicted_classes, true_classes)
|
|
152
|
+
ca = ECIScorer.compute_ca(answers_correct, true_classes)
|
|
153
|
+
rp = ECIScorer.compute_rp(predicted_classes, true_classes)
|
|
154
|
+
ece = ECIScorer.compute_ece(confidences, correctness)
|
|
155
|
+
hss = ECIScorer.compute_hss(predicted_classes, true_classes, answers_given)
|
|
156
|
+
|
|
157
|
+
eci = scorer.compute_eci(sda, ca, rp, ece, hss)
|
|
158
|
+
|
|
159
|
+
# Brier decomposition
|
|
160
|
+
brier = BrierDecomposition.compute(confidences, correctness)
|
|
161
|
+
# → {'brier': 0.18, 'reliability': 0.03, 'resolution': 0.09, 'uncertainty': 0.24}
|
|
162
|
+
|
|
163
|
+
# D-Prime (metacognitive discrimination)
|
|
164
|
+
dprime = Type2DPrime.compute(confidences, correctness, threshold=0.7)
|
|
165
|
+
# → {'d_prime': 1.24, 'hit_rate': 0.85, 'false_alarm_rate': 0.42}
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Supported Providers
|
|
171
|
+
|
|
172
|
+
| Provider | API Key Required | Models Available | Best For |
|
|
173
|
+
|----------|:---:|--------|----------|
|
|
174
|
+
| `kaggle` | No | 26 (Kaggle model pool) | Running inside Kaggle notebooks |
|
|
175
|
+
| `openrouter` | Yes | 100+ | Broadest model access with one key |
|
|
176
|
+
| `anthropic` | Yes | Claude family | Direct Anthropic API access |
|
|
177
|
+
| `openai` | Yes | GPT family | Direct OpenAI API access |
|
|
178
|
+
|
|
179
|
+
**Default behavior:** If no API key is provided, the SDK falls back to the Kaggle provider (which requires no authentication when running inside a Kaggle notebook).
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Configuration Reference
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
RunConfig(
|
|
187
|
+
# ── Mode ──
|
|
188
|
+
mode="compare", # "compare" (multi-model) or "deep_probe" (single-model)
|
|
189
|
+
models=[...], # List of model identifiers
|
|
190
|
+
|
|
191
|
+
# ── Provider ──
|
|
192
|
+
provider="kaggle", # "kaggle", "openrouter", "anthropic", "openai"
|
|
193
|
+
api_key=None, # Required for non-Kaggle providers
|
|
194
|
+
api_base_url=None, # Custom API endpoint (for self-hosted models)
|
|
195
|
+
|
|
196
|
+
# ── Dataset ──
|
|
197
|
+
n_items=200, # Base problem count (200 standard, 1000 for deep probe)
|
|
198
|
+
dataset_path=None, # Path to custom dataset JSON (or None for bundled)
|
|
199
|
+
stress_decision_ratio=0.25, # Fraction with decision-pressure variants
|
|
200
|
+
stress_clarity_ratio=0.10, # Fraction with reduced-clarity variants
|
|
201
|
+
|
|
202
|
+
# ── Statistical ──
|
|
203
|
+
seeds=["s1", "s2"], # Reproducibility seeds for bootstrap
|
|
204
|
+
bootstrap_iterations=2000, # Bootstrap iterations for CIs
|
|
205
|
+
|
|
206
|
+
# ── Time Budget ──
|
|
207
|
+
timeout_per_model=10800, # Max seconds per model (default: 3h)
|
|
208
|
+
total_time_budget=43200, # Total budget (default: 12h)
|
|
209
|
+
time_reserve=3600, # Reserved for analysis (default: 1h)
|
|
210
|
+
|
|
211
|
+
# ── Checkpointing ──
|
|
212
|
+
checkpoint_dir="prometheus_checkpoints",
|
|
213
|
+
resume_from_checkpoint=True,
|
|
214
|
+
|
|
215
|
+
# ── Output ──
|
|
216
|
+
output_dir="prometheus_output",
|
|
217
|
+
|
|
218
|
+
# ── Feature Flags ──
|
|
219
|
+
run_probes=True, # Epoch-2 adversarial probes
|
|
220
|
+
run_multistage=True, # Multi-stage belief revision protocol
|
|
221
|
+
run_statistics=True, # Bootstrap CIs and significance tests
|
|
222
|
+
verbose=True, # Print progress
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## V5 Benchmark Results
|
|
229
|
+
|
|
230
|
+
Results from the PROMETHEUS-EBM v5.0 EXTENDED run (5 models × 324 items × 3 seeds):
|
|
231
|
+
|
|
232
|
+
### Epoch-1 Leaderboard
|
|
233
|
+
|
|
234
|
+
| Rank | Model | ECI | 95% CI | SDA |
|
|
235
|
+
|:---:|-------|:---:|--------|:---:|
|
|
236
|
+
| 🥇 | Claude Sonnet 4.6 | **0.884** | [0.878, 0.888] | 85.4% |
|
|
237
|
+
| 🥈 | Claude Opus 4.6 | 0.869 | [0.864, 0.877] | 84.3% |
|
|
238
|
+
| 🥉 | DeepSeek V3.2 | 0.815 | [0.800, 0.829] | 76.5% |
|
|
239
|
+
| 4 | DeepSeek R1-0528 | 0.785 | [0.774, 0.792] | 68.6% |
|
|
240
|
+
| 5 | Gemini 3.1 Pro | 0.767 | [0.745, 0.787] | 73.1% |
|
|
241
|
+
|
|
242
|
+
### Key Findings
|
|
243
|
+
|
|
244
|
+
1. **Sonnet beats Opus on ECI** (0.884 vs 0.869, statistically significant). The mid-tier model has better epistemic calibration than the top-tier model. Metacognition is not monotonic with scale.
|
|
245
|
+
|
|
246
|
+
2. **Opus leads on adversarial resilience.** Under the multi-stage protocol, Opus improved its accuracy by +13.9% after being challenged with counter-arguments. It correctly revised wrong answers without abandoning right ones.
|
|
247
|
+
|
|
248
|
+
3. **DeepSeek R1 classifies problems differently.** R1's solvability detection (SDA = 68.6%) diverges from all other models, and when acting as a judge its evaluations disagreed with its peers on 16–20% of items. Chain-of-thought reasoning does not inherently improve metacognition.
|
|
249
|
+
|
|
250
|
+
4. **Gemini 3.1 Pro is the most overconfident.** Its stated confidence exceeds actual accuracy by 33 percentage points — the largest gap in the benchmark.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Project Structure
|
|
255
|
+
|
|
256
|
+
```
|
|
257
|
+
prometheus-ebm-sdk/
|
|
258
|
+
├── prometheus_ebm/
|
|
259
|
+
│ ├── __init__.py # Public API exports
|
|
260
|
+
│ ├── config.py # RunConfig dataclass
|
|
261
|
+
│ ├── taxonomy.py # 4-class solvability taxonomy
|
|
262
|
+
│ ├── scorer.py # ECI, HGI, Brier, D-Prime
|
|
263
|
+
│ ├── runner.py # Benchmark orchestrator
|
|
264
|
+
│ ├── data/ # Bundled dataset (200 problems)
|
|
265
|
+
│ └── providers/
|
|
266
|
+
│ ├── kaggle.py # Kaggle kbench adapter
|
|
267
|
+
│ ├── openrouter.py # OpenRouter API adapter
|
|
268
|
+
│ ├── anthropic.py # Anthropic Claude adapter
|
|
269
|
+
│ └── openai.py # OpenAI adapter
|
|
270
|
+
├── tests/
|
|
271
|
+
│ └── test_scorer.py # Unit tests for scoring engine
|
|
272
|
+
├── examples/
|
|
273
|
+
│ ├── compare_5_models.py # Multi-model comparison example
|
|
274
|
+
│ └── deep_probe_opus.py # Single-model deep probe example
|
|
275
|
+
├── pyproject.toml # Package configuration
|
|
276
|
+
└── LICENSE
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## Roadmap
|
|
282
|
+
|
|
283
|
+
| Version | Status | Features |
|
|
284
|
+
|---------|--------|----------|
|
|
285
|
+
| **v0.1.0** | ✅ Current | Scorer (ECI, Brier, D-Prime), Taxonomy, Config, Provider adapters |
|
|
286
|
+
| **v0.2.0** | Planned | Full evaluation loop, stress augmentation engine, export pipeline |
|
|
287
|
+
| **v0.3.0** | Planned | Bootstrap CI, pairwise significance, contamination audit |
|
|
288
|
+
| **v1.0.0** | Planned | 1,000-item dataset, CLI tool, HTML report generator |
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
|
|
292
|
+
## License
|
|
293
|
+
|
|
294
|
+
MIT — See [LICENSE](LICENSE) for details.
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
## Citation
|
|
299
|
+
|
|
300
|
+
```bibtex
|
|
301
|
+
@misc{alam2026prometheus,
|
|
302
|
+
title = {PROMETHEUS-EBM: Benchmarking Epistemic Metacognition in Frontier AI Models},
|
|
303
|
+
author = {Mushfiqul Alam},
|
|
304
|
+
year = {2026},
|
|
305
|
+
url = {https://github.com/Mushfiqul-Alam-17/prometheus-ebm-sdk}
|
|
306
|
+
}
|
|
307
|
+
```
|