crystal-metrics 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crystal_metrics-0.1.0/PKG-INFO +51 -0
- crystal_metrics-0.1.0/README.md +332 -0
- crystal_metrics-0.1.0/pyproject.toml +47 -0
- crystal_metrics-0.1.0/setup.cfg +4 -0
- crystal_metrics-0.1.0/src/crystal_metrics/README.md +25 -0
- crystal_metrics-0.1.0/src/crystal_metrics/__init__.py +43 -0
- crystal_metrics-0.1.0/src/crystal_metrics/accuracy.py +414 -0
- crystal_metrics-0.1.0/src/crystal_metrics/cli.py +145 -0
- crystal_metrics-0.1.0/src/crystal_metrics/judge.py +115 -0
- crystal_metrics-0.1.0/src/crystal_metrics/reasoning.py +447 -0
- crystal_metrics-0.1.0/src/crystal_metrics/similarity.py +157 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/PKG-INFO +51 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/SOURCES.txt +18 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/dependency_links.txt +1 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/entry_points.txt +2 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/requires.txt +11 -0
- crystal_metrics-0.1.0/src/crystal_metrics.egg-info/top_level.txt +1 -0
- crystal_metrics-0.1.0/tests/test_accuracy.py +113 -0
- crystal_metrics-0.1.0/tests/test_parity.py +163 -0
- crystal_metrics-0.1.0/tests/test_reasoning.py +118 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crystal-metrics
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Transparent multimodal reasoning metrics from the CRYSTAL benchmark (Match F1, Ordered Match F1, accuracy).
|
|
5
|
+
Author: Wayner Barrios, SouYoung Jin
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/waybarrios/crystal
|
|
8
|
+
Project-URL: Paper, https://arxiv.org/abs/2603.13099
|
|
9
|
+
Project-URL: Dataset, https://huggingface.co/datasets/waybarrios/CRYSTAL
|
|
10
|
+
Keywords: mllm,vlm,reasoning,evaluation,match-f1,crystal,benchmark
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: numpy
|
|
18
|
+
Requires-Dist: pandas
|
|
19
|
+
Requires-Dist: torch
|
|
20
|
+
Requires-Dist: sentence-transformers
|
|
21
|
+
Requires-Dist: tqdm
|
|
22
|
+
Provides-Extra: judge
|
|
23
|
+
Requires-Dist: openai>=1.0; extra == "judge"
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest; extra == "dev"
|
|
26
|
+
|
|
27
|
+
# crystal-metrics
|
|
28
|
+
|
|
29
|
+
Transparent multimodal reasoning metrics from the **CRYSTAL** benchmark —
|
|
30
|
+
*Match F1*, *Ordered Match F1*, *Precision*, *Recall*, and multi-format
|
|
31
|
+
*Accuracy*.
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install crystal-metrics # core metrics
|
|
35
|
+
pip install crystal-metrics[judge] # + optional LLM judge
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from crystal_metrics import MLLMReasoningEvaluator
|
|
40
|
+
|
|
41
|
+
evaluator = MLLMReasoningEvaluator() # all-distilroberta-v1, tau=0.35 (paper defaults)
|
|
42
|
+
m = evaluator.evaluate_single(
|
|
43
|
+
predicted_steps=["Three objects on a table", "The middle one is smallest", "Answer C"],
|
|
44
|
+
reference_steps=["There are three objects", "Compare their sizes", "Middle is smallest", "Select C"],
|
|
45
|
+
alpha=0.3, # enable Ordered Match F1
|
|
46
|
+
)
|
|
47
|
+
print(m.match_f1, m.precision, m.recall, m.ordered_match_f1)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
See the [docs](https://github.com/waybarrios/crystal/tree/main/docs) for
|
|
51
|
+
installation, quickstart, metric definitions, and the CLI.
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="assets/eccv2026_logo.png" alt="ECCV 2026 — Malmö, Sept 8–12" width="320"/>
|
|
4
|
+
|
|
5
|
+
# CRYSTAL: Beyond Final Answers
|
|
6
|
+
|
|
7
|
+
### Transparent Multimodal Reasoning Evaluation
|
|
8
|
+
|
|
9
|
+
[arXiv:2603.13099](https://arxiv.org/abs/2603.13099)
|
|
10
|
+
[Benchmark](#what-is-crystal)
|
|
11
|
+
[Key Findings](#key-findings)
|
|
12
|
+
[Dataset](https://huggingface.co/datasets/waybarrios/CRYSTAL)
|
|
13
|
+
[License: MIT](#license)
|
|
14
|
+
|
|
15
|
+
**[Wayner Barrios](https://github.com/waybarrios)** · **[SouYoung Jin](https://souyoungjin.github.io/)**
|
|
16
|
+
|
|
17
|
+
*Dartmouth College*
|
|
18
|
+
|
|
19
|
+
[Paper](https://arxiv.org/abs/2603.13099) | [Dataset](https://huggingface.co/datasets/waybarrios/CRYSTAL) | [Results](#key-findings) | [Training](#training-with-causal-process-reward)
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
*Your model gets the right answer. But does it actually **reason**?*
|
|
24
|
+
|
|
25
|
+
</div>
|
|
26
|
+
|
|
27
|
+
## The Problem: Right Answers, Wrong Reasons
|
|
28
|
+
|
|
29
|
+
Current vision-language benchmarks only check final answers. This means a model that **guesses correctly** scores the same as one that **reasons correctly**. CRYSTAL exposes this blind spot.
|
|
30
|
+
|
|
31
|
+
<div align="center">
|
|
32
|
+
<img src="assets/teaser_sample0.jpg" width="280"/>
|
|
33
|
+
|
|
34
|
+
> **Q:** *"Which of the 3 objects is the smallest?"* — A model answers correctly (C: middle console), but its reasoning states the middle console is *larger* than the others while claiming it's the smallest. **Traditional benchmarks give full credit for this lucky guess.** CRYSTAL catches it (Match F1: 0.15).
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
## What is CRYSTAL?
|
|
38
|
+
|
|
39
|
+
**C**lear **R**easoning via **Y**ielded **S**teps, **T**raceability **a**nd **L**ogic — a diagnostic benchmark with **6,372 instances** that evaluates multimodal reasoning through **verifiable intermediate steps**.
|
|
40
|
+
|
|
41
|
+
Instead of just asking *"is the answer correct?"*, CRYSTAL asks:
|
|
42
|
+
- Did the model identify the right visual elements?
|
|
43
|
+
- Did it apply the correct logical steps?
|
|
44
|
+
- Are the steps in a coherent order?
|
|
45
|
+
|
|
46
|
+
### How It Works
|
|
47
|
+
|
|
48
|
+
<div align="center">
|
|
49
|
+
<img src="assets/workflow_dataset.png" width="700"/>
|
|
50
|
+
|
|
51
|
+
*Multi-agent reasoning pipeline. Four independent MLLMs generate candidate steps, which are semantically clustered into consensus sequences, validated by a fifth agent, and verified by human annotators.*
|
|
52
|
+
</div>
|
|
53
|
+
|
|
54
|
+
### Benchmark Statistics
|
|
55
|
+
|
|
56
|
+
| Statistic | Value |
|
|
57
|
+
|:--|:--|
|
|
58
|
+
| Total instances | **6,372** |
|
|
59
|
+
| Avg. reasoning steps per question | **11.6** |
|
|
60
|
+
| Step range | 3 – 42 |
|
|
61
|
+
| Difficulty split | Easy 48.5% / Medium 44.1% / Hard 7.4% |
|
|
62
|
+
| Sources | MathVision, ScienceQA, RealWorldQA, MMVP, PlotQA |
|
|
63
|
+
|
|
64
|
+
### Novel Metrics
|
|
65
|
+
|
|
66
|
+
| Metric | What it measures |
|
|
67
|
+
|:--|:--|
|
|
68
|
+
| **Match F1** | Step-level precision & recall via semantic similarity matching |
|
|
69
|
+
| **Ordered Match F1** | Penalizes disordered reasoning chains (via LIS ratio) |
|
|
70
|
+
|
|
71
|
+
## Metrics Package (`crystal-metrics`)
|
|
72
|
+
|
|
73
|
+
The CRYSTAL metrics ship as a pip-installable package so anyone can score their own models.
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install crystal-metrics # Match F1, Ordered Match F1, Precision, Recall, Accuracy
|
|
77
|
+
pip install "crystal-metrics[judge]" # + optional LLM judge for free-form answers (quoted so zsh does not glob the brackets)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from crystal_metrics import MLLMReasoningEvaluator
|
|
82
|
+
|
|
83
|
+
evaluator = MLLMReasoningEvaluator() # all-distilroberta-v1, τ=0.35 (paper defaults)
|
|
84
|
+
m = evaluator.evaluate_single(predicted_steps, reference_steps, alpha=0.3)
|
|
85
|
+
print(m.match_f1, m.precision, m.recall, m.ordered_match_f1)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or from the command line:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
crystal-metrics evaluate predictions.json references.json --alpha 0.3
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Docs:** [installation](docs/installation.md) · [quickstart](docs/quickstart.md) · [metric definitions](docs/metrics.md) · [CLI](docs/cli.md). Package source lives in [`src/crystal_metrics/`](src/crystal_metrics/).
|
|
95
|
+
|
|
96
|
+
## Examples
|
|
97
|
+
|
|
98
|
+
CRYSTAL spans diverse reasoning scenarios — from spatial understanding to scientific reasoning and mathematical problem-solving:
|
|
99
|
+
|
|
100
|
+
<table>
|
|
101
|
+
<tr>
|
|
102
|
+
<td width="33%" align="center">
|
|
103
|
+
<img src="assets/dataset_example_sample3.jpg" width="280"/><br/>
|
|
104
|
+
<b>Spatial Reasoning</b><br/>
|
|
105
|
+
<sub><i>"Is it safe to turn right?"</i> — Requires identifying traffic signals, pedestrian crossings, and vehicle positions</sub>
|
|
106
|
+
</td>
|
|
107
|
+
<td width="33%" align="center">
|
|
108
|
+
<img src="assets/example_scienceqa_3807.jpg" width="280"/><br/>
|
|
109
|
+
<b>Scientific Reasoning</b><br/>
|
|
110
|
+
<sub><i>"Which organism would have the most energy available?"</i> — Requires tracing food web dependencies</sub>
|
|
111
|
+
</td>
|
|
112
|
+
<td width="33%" align="center">
|
|
113
|
+
<img src="assets/example_mathvision_767.jpg" width="280"/><br/>
|
|
114
|
+
<b>Mathematical Reasoning</b><br/>
|
|
115
|
+
<sub><i>"Which kite is different?"</i> — Requires systematic comparison of geometric properties</sub>
|
|
116
|
+
</td>
|
|
117
|
+
</tr>
|
|
118
|
+
</table>
|
|
119
|
+
|
|
120
|
+
<details>
|
|
121
|
+
<summary><b>See a full reasoning trace example</b></summary>
|
|
122
|
+
|
|
123
|
+
**Image:** Egg placement puzzle (2x3 grid + chicken)
|
|
124
|
+
|
|
125
|
+
<div align="center">
|
|
126
|
+
<img src="assets/conservatism_example_sample818.jpg" width="300"/>
|
|
127
|
+
</div>
|
|
128
|
+
|
|
129
|
+
**Question:** *"How many eggs can you place so no two eggs are in the same row or column?"*
|
|
130
|
+
|
|
131
|
+
**Baseline model (Qwen2.5-VL-3B):**
|
|
132
|
+
> *"The answer is 3."* — Correct answer, but it states only the result without explaining *how* it was reached.
|
|
133
|
+
|
|
134
|
+
**After CPR-Curriculum training:**
|
|
135
|
+
> *"Step 1: The grid is 2 rows x 3 columns. Step 2: Apply checkerboard placement constraint. Step 3: Place eggs at positions (1,1), (2,2) avoiding shared rows/columns. Step 4: Maximum is 2 per constraint... but wait, with 2 rows we can place at most 2. Re-examine: 3 columns allow diagonal placement. Answer: 3."* — Structured reasoning with explicit strategy.
|
|
136
|
+
|
|
137
|
+
</details>
|
|
138
|
+
|
|
139
|
+
## Key Findings
|
|
140
|
+
|
|
141
|
+
We evaluated **20 MLLMs** (16 open-source from 1B to 38B, plus GPT-5, GPT-5-mini, GPT-5.2 Instant, and Gemini 2.5 Flash):
|
|
142
|
+
|
|
143
|
+
### Finding 1: Cherry-Picking is Universal
|
|
144
|
+
|
|
145
|
+
**19 out of 20 models** produce high-precision but low-recall reasoning — they say a few correct things but skip most of the required steps.
|
|
146
|
+
|
|
147
|
+
| Model | Params | Accuracy | Match F1 | Precision | Recall |
|
|
148
|
+
|:--|:--:|:--:|:--:|:--:|:--:|
|
|
149
|
+
| | | ***Commercial*** | | | |
|
|
150
|
+
| GPT-5 | n/a | **57.99%** | 0.612 | 0.925 | 0.479 |
|
|
151
|
+
| GPT-5-mini | n/a | 55.59% | **0.773** | **0.978** | 0.669 |
|
|
152
|
+
| Gemini 2.5 Flash | n/a | 53.95% | 0.673 | 0.701 | **0.765** |
|
|
153
|
+
| GPT-5.2 Instant | n/a | 47.35% | 0.564 | 0.974 | 0.416 |
|
|
154
|
+
| | | ***Qwen Family*** | | | |
|
|
155
|
+
| Qwen3-VL-8B | 8B | **57.66%** | 0.659 | 0.827 | 0.590 |
|
|
156
|
+
| Qwen3-VL-32B | 32B | 49.22% | 0.718 | 0.819 | 0.704 |
|
|
157
|
+
| Qwen2.5-VL-32B | 32B | 47.63% | 0.653 | 0.943 | 0.524 |
|
|
158
|
+
| Qwen2.5-VL-3B | 3B | 39.85% | 0.480 | 0.898 | 0.347 |
|
|
159
|
+
| Qwen3-VL-2B | 2B | 34.15% | 0.595 | 0.726 | 0.535 |
|
|
160
|
+
| Qwen2.5-VL-7B | 7B | 30.43% | 0.475 | 0.765 | 0.365 |
|
|
161
|
+
| | | ***InternVL Family*** | | | |
|
|
162
|
+
| InternVL3.5-8B | 8B | 51.98% | 0.530 | 0.882 | 0.416 |
|
|
163
|
+
| InternVL3.5-38B | 38B | 51.21% | 0.612 | 0.892 | 0.498 |
|
|
164
|
+
| InternVL3.5-4B | 4B | 37.61% | 0.432 | 0.895 | 0.325 |
|
|
165
|
+
| InternVL3.5-2B | 2B | 33.02% | 0.469 | 0.725 | 0.371 |
|
|
166
|
+
| InternVL3.5-1B | 1B | 30.13% | 0.330 | 0.616 | 0.243 |
|
|
167
|
+
| | | ***Other Open-Source*** | | | |
|
|
168
|
+
| Gemma3-12B | 12B | 33.83% | 0.605 | 0.838 | 0.499 |
|
|
169
|
+
| Gemma3-4B | 4B | 28.65% | 0.618 | 0.878 | 0.506 |
|
|
170
|
+
| Llama 3.2-11B | 11B | 24.83% | 0.471 | 0.713 | 0.379 |
|
|
171
|
+
| LLaVA-v1.6-7B | 7B | 24.66% | 0.512 | 0.961 | 0.370 |
|
|
172
|
+
| MiniCPM-v2.6-8B | 8B | 25.54% | 0.215 | 0.709 | 0.134 |
|
|
173
|
+
|
|
174
|
+
> Even GPT-5 (best accuracy: 57.99%) recovers only **47.9% of reference steps**. Models say the right things but omit most of the reasoning.
|
|
175
|
+
|
|
176
|
+
### Finding 2: Accuracy and Reasoning Diverge
|
|
177
|
+
|
|
178
|
+
GPT-5 leads accuracy but ranks **8th** in Match F1. GPT-5-mini leads F1 at lower accuracy. **Gemma3-4B outperforms InternVL3.5-38B** in reasoning despite 9.5x fewer parameters.
|
|
179
|
+
|
|
180
|
+
### Finding 3: No Model Reasons in Order
|
|
181
|
+
|
|
182
|
+
Among competitive models, **no model preserves more than 60% of matched steps in the correct order**. Models retrieve relevant reasoning steps but fail to organize them coherently.
|
|
183
|
+
|
|
184
|
+
### Finding 4: Scaling is Non-Monotonic
|
|
185
|
+
|
|
186
|
+
Bigger models don't uniformly improve: Qwen3-VL-32B achieves better F1 (0.718) but *lower accuracy* (49.22%) than Qwen3-VL-8B (0.659 F1, 57.66% accuracy). Scale can improve reasoning coverage while lowering answer accuracy, so the two must be measured separately.
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
## Training with Causal Process Reward
|
|
190
|
+
|
|
191
|
+
CRYSTAL isn't just a benchmark — it enables a new training paradigm. We propose **Causal Process Reward (CPR)**, a multiplicative reward that couples answer correctness with step-level alignment:
|
|
192
|
+
|
|
193
|
+
$$R_{\text{CPR}} = \begin{cases} a_w + s_w \cdot \text{F1}_{\text{step}} & \text{if answer correct} \\ s_w \cdot \text{F1}_{\text{step}} \cdot \lambda & \text{otherwise} \end{cases}$$
|
|
194
|
+
|
|
195
|
+
Unlike additive rewards (which let models maximize accuracy by guessing while ignoring reasoning), CPR requires both correct answers **and** faithful reasoning.
|
|
196
|
+
|
|
197
|
+
### CPR-Curriculum: +32% Match F1 where others fail
|
|
198
|
+
|
|
199
|
+
| Strategy | Accuracy | Match F1 | Precision | Recall | Ordered F1 |
|
|
200
|
+
|:--|:--:|:--:|:--:|:--:|:--:|
|
|
201
|
+
| Baseline (Qwen2.5-VL-3B) | 39.85% | 0.480 | 0.898 | 0.347 | 0.434 |
|
|
202
|
+
| Composite (additive) | 44.92% | 0.426 | 0.983 | 0.284 | 0.392 |
|
|
203
|
+
| Answer-Only | 44.30% | 0.429 | 0.803 | 0.308 | 0.380 |
|
|
204
|
+
| **CPR** | 41.40% | **0.633** | 0.975 | 0.489 | **0.560** |
|
|
205
|
+
| **CPR-Curriculum** | **47.52%** | **0.633** | 0.963 | **0.493** | **0.560** |
|
|
206
|
+
|
|
207
|
+
> Additive strategies collapse at step 600 and diverge to NaN by step 1,500. CPR trains stably through 2,800 steps.
|
|
208
|
+
|
|
209
|
+
<div align="center">
|
|
210
|
+
<img src="assets/grpo_f1_trajectory.png" width="48%"/> <img src="assets/grpo_accuracy_trajectory.png" width="48%"/>
|
|
211
|
+
|
|
212
|
+
*Left: Match F1 trajectory. Right: Accuracy trajectory. Composite collapses at step 600; CPR variants train stably through 2,800 steps.*
|
|
213
|
+
</div>
|
|
214
|
+
|
|
215
|
+
### Cross-Model Generalization: CPR-Curriculum on InternVL3.5-4B
|
|
216
|
+
|
|
217
|
+
Same two-phase protocol, no architecture-specific tuning. Recall increases roughly **2.5×** (0.325 → 0.811):
|
|
218
|
+
|
|
219
|
+
| Configuration | Accuracy | Match F1 | Precision | Recall | Ordered F1 |
|
|
220
|
+
|:--|:--:|:--:|:--:|:--:|:--:|
|
|
221
|
+
| Baseline (InternVL3.5-4B) | 37.61% | 0.432 | 0.895 | 0.325 | 0.387 |
|
|
222
|
+
| **CPR-Curriculum** | **45.76%** | **0.833** | **0.903** | **0.811** | **0.719** |
|
|
223
|
+
| Δ | +8.15 | +0.401 | +0.008 | +0.486 | +0.332 |
|
|
224
|
+
|
|
225
|
+
> CPR-Curriculum generalizes across architectures: **+93% Match F1** on InternVL3.5-4B without any model-specific tuning.
|
|
226
|
+
|
|
227
|
+
## Ablation Studies
|
|
228
|
+
|
|
229
|
+
### Encoder and Threshold Selection
|
|
230
|
+
|
|
231
|
+
We tested 4 sentence encoders across 5 thresholds (100 experiments). DistilRoBERTa-v1 dominates with 4-8pp advantages across all model families. Model rankings remain stable across all configurations.
|
|
232
|
+
|
|
233
|
+
<div align="center">
|
|
234
|
+
<img src="assets/ablation_encoders.png" width="700"/>
|
|
235
|
+
|
|
236
|
+
*Encoder comparison across thresholds. DistilRoBERTa-v1 consistently outperforms alternatives. Threshold variation yields only 2-3pp swings.*
|
|
237
|
+
</div>
|
|
238
|
+
|
|
239
|
+
### Human Agreement Study
|
|
240
|
+
|
|
241
|
+
On 100 adversarially sampled step pairs, the encoder achieves **84% agreement** with a human annotator, with **100% agreement below the threshold** (zero false matches on semantically unrelated steps).
|
|
242
|
+
|
|
243
|
+
| Similarity Band | Pairs | Agreement | Note |
|
|
244
|
+
|:--|:--:|:--:|:--|
|
|
245
|
+
| < 0.20 | 33 | 100.0% | No false matches |
|
|
246
|
+
| [0.20, 0.35) | 14 | 100.0% | No false matches |
|
|
247
|
+
| [0.35, 0.50) | 19 | 68.4% | Borderline zone |
|
|
248
|
+
| [0.50, 0.70) | 25 | 64.0% | Borderline zone |
|
|
249
|
+
| >= 0.70 | 9 | 88.9% | Clear matches |
|
|
250
|
+
| **Overall** | **100** | **84.0%** | Cohen's kappa = 0.534 |
|
|
251
|
+
|
|
252
|
+
## Supplementary Examples
|
|
253
|
+
|
|
254
|
+
<details>
|
|
255
|
+
<summary><b>CRYSTAL Benchmark Example (MathVision)</b></summary>
|
|
256
|
+
|
|
257
|
+
<div align="center">
|
|
258
|
+
<img src="assets/crystal_ex1_sample1265.jpg" width="400"/>
|
|
259
|
+
</div>
|
|
260
|
+
|
|
261
|
+
**Q:** *"Anna starts in the direction of the arrow. At each crossing she turns either right or left. At the first crossing she turns right, at the next left, then left again, then right, then left and left again. What will she find at the next crossing?"*
|
|
262
|
+
|
|
263
|
+
**Ground Truth:** A
|
|
264
|
+
|
|
265
|
+
**Reference Reasoning Steps (12 steps):**
|
|
266
|
+
1. Anna starts in the direction of an arrow
|
|
267
|
+
2. She turns at each crossing either right or left
|
|
268
|
+
3. First crossing: turns right
|
|
269
|
+
4. Second crossing: left
|
|
270
|
+
5. Third crossing: left again
|
|
271
|
+
6. Fourth crossing: right
|
|
272
|
+
7. Fifth crossing: left
|
|
273
|
+
8. Sixth crossing: left again
|
|
274
|
+
9. Question asks what she finds at the next crossing
|
|
275
|
+
10. Answer format is multiple-choice (A, B, C, D, E)
|
|
276
|
+
11. *[2 more steps for spatial tracing]*
|
|
277
|
+
|
|
278
|
+
</details>
|
|
279
|
+
|
|
280
|
+
<details>
|
|
281
|
+
<summary><b>GRPO Qualitative Improvement</b></summary>
|
|
282
|
+
|
|
283
|
+
<div align="center">
|
|
284
|
+
<img src="assets/grpo_example_sample3.jpg" width="400"/>
|
|
285
|
+
</div>
|
|
286
|
+
|
|
287
|
+
**Q:** *"What color is the traffic light?"* **GT:** Green
|
|
288
|
+
|
|
289
|
+
**Baseline** (2 steps, 33% coverage):
|
|
290
|
+
> 1. Noted the traffic light is green. 2. Identified the traffic light as green. — *Cherry-picks high-confidence steps only.*
|
|
291
|
+
|
|
292
|
+
**After GRPO** (4 steps, 67% coverage):
|
|
293
|
+
> 1. The traffic lights in the scene are green. 2. The traffic lights are positioned at an intersection. 3. The traffic lights are part of the traffic control system. 4. The traffic lights are visible from a distance. — *More comprehensive reasoning (+34pp coverage).*
|
|
294
|
+
|
|
295
|
+
</details>
|
|
296
|
+
|
|
297
|
+
## Dataset
|
|
298
|
+
|
|
299
|
+
The CRYSTAL benchmark is available on HuggingFace:
|
|
300
|
+
|
|
301
|
+
> 🤗 **[huggingface.co/datasets/waybarrios/CRYSTAL](https://huggingface.co/datasets/waybarrios/CRYSTAL)** — 6,372 instances with reference reasoning steps, released under CC-BY-NC-SA-4.0.
|
|
302
|
+
>
|
|
303
|
+
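> *The evaluation metrics are available now via the `crystal-metrics` package (see [Metrics Package](#metrics-package-crystal-metrics)); pre-trained CPR-Curriculum checkpoints are coming soon.*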
|
|
304
|
+
|
|
305
|
+
## Why CRYSTAL Matters
|
|
306
|
+
|
|
307
|
+
| If you are... | CRYSTAL helps you... |
|
|
308
|
+
|:--|:--|
|
|
309
|
+
| **Building MLLMs** | Diagnose *where* reasoning fails (perception vs. logic) |
|
|
310
|
+
| **Evaluating models** | Go beyond accuracy to measure reasoning transparency |
|
|
311
|
+
| **Training with RL** | Use CPR rewards to improve reasoning without manual step annotations |
|
|
312
|
+
| **Researching scaling** | Understand non-monotonic trade-offs between accuracy and reasoning |
|
|
313
|
+
| **Deploying AI systems** | Identify models that guess vs. models that actually reason |
|
|
314
|
+
|
|
315
|
+
## Citation
|
|
316
|
+
|
|
317
|
+
```bibtex
|
|
318
|
+
@misc{barrios2026crystal,
|
|
319
|
+
title = {Beyond Final Answers: CRYSTAL Benchmark for Transparent
|
|
320
|
+
Multimodal Reasoning Evaluation},
|
|
321
|
+
author = {Wayner Barrios and SouYoung Jin},
|
|
322
|
+
year = {2026},
|
|
323
|
+
eprint = {2603.13099},
|
|
324
|
+
archivePrefix = {arXiv},
|
|
325
|
+
primaryClass = {cs.AI},
|
|
326
|
+
url = {https://arxiv.org/abs/2603.13099}
|
|
327
|
+
}
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
## License
|
|
331
|
+
|
|
332
|
+
This project is released under the MIT License.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "crystal-metrics"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Transparent multimodal reasoning metrics from the CRYSTAL benchmark (Match F1, Ordered Match F1, accuracy)."
|
|
9
|
+
readme = "src/crystal_metrics/README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Wayner Barrios" },
|
|
14
|
+
{ name = "SouYoung Jin" },
|
|
15
|
+
]
|
|
16
|
+
keywords = ["mllm", "vlm", "reasoning", "evaluation", "match-f1", "crystal", "benchmark"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"numpy",
|
|
25
|
+
"pandas",
|
|
26
|
+
"torch",
|
|
27
|
+
"sentence-transformers",
|
|
28
|
+
"tqdm",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
judge = ["openai>=1.0"]
|
|
33
|
+
dev = ["pytest"]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/waybarrios/crystal"
|
|
37
|
+
Paper = "https://arxiv.org/abs/2603.13099"
|
|
38
|
+
Dataset = "https://huggingface.co/datasets/waybarrios/CRYSTAL"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
crystal-metrics = "crystal_metrics.cli:main"
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
where = ["src"]
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.package-data]
|
|
47
|
+
crystal_metrics = ["README.md"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# crystal-metrics
|
|
2
|
+
|
|
3
|
+
Transparent multimodal reasoning metrics from the **CRYSTAL** benchmark —
|
|
4
|
+
*Match F1*, *Ordered Match F1*, *Precision*, *Recall*, and multi-format
|
|
5
|
+
*Accuracy*.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install crystal-metrics # core metrics
|
|
9
|
+
pip install "crystal-metrics[judge]" # + optional LLM judge (quoted so zsh does not glob the brackets)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from crystal_metrics import MLLMReasoningEvaluator
|
|
14
|
+
|
|
15
|
+
evaluator = MLLMReasoningEvaluator() # all-distilroberta-v1, tau=0.35 (paper defaults)
|
|
16
|
+
m = evaluator.evaluate_single(
|
|
17
|
+
predicted_steps=["Three objects on a table", "The middle one is smallest", "Answer C"],
|
|
18
|
+
reference_steps=["There are three objects", "Compare their sizes", "Middle is smallest", "Select C"],
|
|
19
|
+
alpha=0.3, # enable Ordered Match F1
|
|
20
|
+
)
|
|
21
|
+
print(m.match_f1, m.precision, m.recall, m.ordered_match_f1)
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
See the [docs](https://github.com/waybarrios/crystal/tree/main/docs) for
|
|
25
|
+
installation, quickstart, metric definitions, and the CLI.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
crystal-metrics: transparent multimodal reasoning metrics from the CRYSTAL benchmark.
|
|
3
|
+
|
|
4
|
+
Core metrics (no LLM required):
|
|
5
|
+
- Match F1, Precision, Recall -> MLLMReasoningEvaluator
|
|
6
|
+
- Ordered Match F1 (Kendall / LIS) -> MLLMReasoningEvaluator.evaluate_single(..., alpha=...)
|
|
7
|
+
- Multi-format Accuracy -> AccuracyCalculator
|
|
8
|
+
|
|
9
|
+
The optional LLM judge lives in ``crystal_metrics.judge`` and needs the
|
|
10
|
+
``[judge]`` extra: ``pip install crystal-metrics[judge]``. It is intentionally
|
|
11
|
+
NOT imported here so the core package stays free of the ``openai`` dependency.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from .accuracy import AccuracyCalculator, AccuracyResult, AnswerNormalizer
|
|
15
|
+
from .reasoning import (
|
|
16
|
+
EvaluationMetrics,
|
|
17
|
+
MLLMReasoningEvaluator,
|
|
18
|
+
load_json_data,
|
|
19
|
+
save_results,
|
|
20
|
+
)
|
|
21
|
+
from .similarity import (
|
|
22
|
+
best_match_f1,
|
|
23
|
+
jaccard_similarity,
|
|
24
|
+
semantic_match_f1,
|
|
25
|
+
word_overlap_similarity,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"MLLMReasoningEvaluator",
|
|
32
|
+
"EvaluationMetrics",
|
|
33
|
+
"AccuracyCalculator",
|
|
34
|
+
"AccuracyResult",
|
|
35
|
+
"AnswerNormalizer",
|
|
36
|
+
"best_match_f1",
|
|
37
|
+
"semantic_match_f1",
|
|
38
|
+
"jaccard_similarity",
|
|
39
|
+
"word_overlap_similarity",
|
|
40
|
+
"load_json_data",
|
|
41
|
+
"save_results",
|
|
42
|
+
"__version__",
|
|
43
|
+
]
|