pearmut 1.0.0.tar.gz → 1.0.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pearmut-1.0.0 → pearmut-1.0.2}/PKG-INFO +87 -16
- {pearmut-1.0.0 → pearmut-1.0.2}/README.md +86 -15
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO +87 -16
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt +3 -2
- {pearmut-1.0.0 → pearmut-1.0.2}/pyproject.toml +1 -1
- {pearmut-1.0.0 → pearmut-1.0.2}/server/app.py +103 -2
- {pearmut-1.0.0 → pearmut-1.0.2}/server/assignment.py +59 -25
- {pearmut-1.0.0 → pearmut-1.0.2}/server/cli.py +241 -150
- pearmut-1.0.2/server/constants.py +93 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/server/results_export.py +1 -1
- pearmut-1.0.2/server/static/annotate.bundle.js +1 -0
- pearmut-1.0.2/server/static/annotate.html +160 -0
- pearmut-1.0.2/server/static/dashboard.bundle.js +1 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/server/static/dashboard.html +6 -1
- {pearmut-1.0.0 → pearmut-1.0.2}/server/static/index.html +1 -1
- {pearmut-1.0.0 → pearmut-1.0.2}/server/static/style.css +8 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/server/utils.py +4 -14
- pearmut-1.0.0/server/static/basic.bundle.js +0 -1
- pearmut-1.0.0/server/static/basic.html +0 -97
- pearmut-1.0.0/server/static/dashboard.bundle.js +0 -1
- {pearmut-1.0.0 → pearmut-1.0.2}/LICENSE +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/dependency_links.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/entry_points.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/requires.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/top_level.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/server/static/favicon.svg +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/server/static/index.bundle.js +0 -0
- {pearmut-1.0.0 → pearmut-1.0.2}/setup.cfg +0 -0

{pearmut-1.0.0 → pearmut-1.0.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 1.0.0
+Version: 1.0.2
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
 
-# Pearmut
+# 🍐Pearmut <br> [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [](https://arxiv.org/abs/2601.02933)
 
 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
 
-[](https://pypi.org/project/pearmut)
-
-[](https://pypi.python.org/pypi/pearmut/)
-
-[](https://pypi.org/project/pearmut/)
-
-[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
 
@@ -52,6 +45,8 @@ Dynamic: license-file
 - [Terminology](#terminology)
 - [Development](#development)
 - [Citation](#citation)
+- [Changelog](#changelog)
+
 
 ## Quick Start
 
@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
 }
 ```
 
-Each item has to have `
+Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+If neither `src` nor `ref` is provided, only the model outputs will be displayed.
 For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
 Any other keys that you add will simply be stored in the logs.
 
@@ -145,6 +142,74 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```
 
+### Showing Model Names
+
+By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "show_model_names": true  # Default: false.
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
+### Custom Score Sliders
+
+For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "sliders": [
+            {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+            {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+        ]
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
+When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+### Textfield for Post-editing/Translation
+
+Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "textfield": "prefilled"  # Options: null, "hidden", "visible", "prefilled"
+    }
+}
+```
+
+**Textfield modes:**
+- `null` or omitted: No textfield (default)
+- `"hidden"`: Textfield hidden by default, shown by clicking a button
+- `"visible"`: Textfield always visible
+- `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
+### Custom Instructions
+
+Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+    }
+}
+```
+
 ### Pre-filled Error Spans (ESA<sup>AI</sup>)
 
 Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +328,7 @@ All items must contain outputs from all models for this assignment type to work
 **How it works:**
 1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
 2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
-3. Contrastive
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
 4. Item prioritization: Items with the least annotations for the selected models are prioritized
 5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
 
@@ -289,6 +354,7 @@ The `users` field accepts:
 }
 ```
 
+
 ### Multimodal Annotations
 
 Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -369,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
 - **Score**: Numeric quality rating (0-100)
 - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
 - **Error Categories**: MQM taxonomy labels for errors
-- **Template**: The annotation interface type. The `
+- **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
@@ -400,7 +466,7 @@ pearmut run
 2. Add build rule to `webpack.config.js`
 3. Reference as `info->template` in campaign JSON
 
-See [web/src/
+See [web/src/annotate.ts](web/src/annotate.ts) for example.
 
 ### Deployment
 
@@ -411,10 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
 If you use this work in your paper, please cite as following.
 ```bibtex
 @misc{zouhar2026pearmut,
-
-
-
+  title={Pearmut: Human Evaluation of Translation Made Trivial},
+  author={Vilém Zouhar and Tom Kocmi},
+  year={2026},
+  eprint={2601.02933},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2601.02933},
 }
 ```
 
 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+See changes in [CHANGELOG.md](CHANGELOG.md).
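
The options introduced in the README sections above (`show_model_names`, `sliders`, `textfield`, `instructions`) all live under the campaign's `info` object. The following is a minimal sketch of a campaign file that combines them, written as Python that emits the JSON; the campaign ID, model name, and example item are placeholders, and whether every combination of these options is supported is not stated in this diff.

```python
import json

# Sketch only: combines the options documented in the 1.0.2 README above.
# "my_campaign", "model-A", and the example item are placeholders.
campaign = {
    "campaign_id": "my_campaign",
    "info": {
        "assignment": "task-based",
        "protocol": "ESA",
        "show_model_names": True,   # default: False
        "textfield": "prefilled",   # None, "hidden", "visible", or "prefilled"
        "instructions": "Rate translation quality on a 0-100 scale.",
        "sliders": [
            {"name": "Fluency", "min": 0, "max": 5, "step": 1},
            {"name": "Adequacy", "min": 0, "max": 100, "step": 1},
        ],
    },
    "data": [
        {
            "item_id": "doc1#0",
            "src": "Der Apfel fällt nicht weit vom Stamm.",
            "tgt": {"model-A": "The apple does not fall far from the tree."},
        }
    ],
}

with open("my_campaign.json", "w") as f:
    json.dump(campaign, f, ensure_ascii=False, indent=2)
```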

{pearmut-1.0.0 → pearmut-1.0.2}/README.md

@@ -1,14 +1,7 @@
-# Pearmut
+# 🍐Pearmut <br> [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [](https://arxiv.org/abs/2601.02933)
 
 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
 
-[](https://pypi.org/project/pearmut)
-
-[](https://pypi.python.org/pypi/pearmut/)
-
-[](https://pypi.org/project/pearmut/)
-
-[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
 
@@ -31,6 +24,8 @@
 - [Terminology](#terminology)
 - [Development](#development)
 - [Citation](#citation)
+- [Changelog](#changelog)
+
 
 ## Quick Start
 
@@ -90,7 +85,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
 }
 ```
 
-Each item has to have `
+Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+If neither `src` nor `ref` is provided, only the model outputs will be displayed.
 For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
 Any other keys that you add will simply be stored in the logs.
 
@@ -124,6 +121,74 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```
 
+### Showing Model Names
+
+By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "show_model_names": true  # Default: false.
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
+### Custom Score Sliders
+
+For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "sliders": [
+            {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+            {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+        ]
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
+When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+### Textfield for Post-editing/Translation
+
+Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "textfield": "prefilled"  # Options: null, "hidden", "visible", "prefilled"
+    }
+}
+```
+
+**Textfield modes:**
+- `null` or omitted: No textfield (default)
+- `"hidden"`: Textfield hidden by default, shown by clicking a button
+- `"visible"`: Textfield always visible
+- `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
+### Custom Instructions
+
+Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+    }
+}
+```
+
 ### Pre-filled Error Spans (ESA<sup>AI</sup>)
 
 Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -242,7 +307,7 @@ All items must contain outputs from all models for this assignment type to work
 **How it works:**
 1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
 2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
-3. Contrastive
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
 4. Item prioritization: Items with the least annotations for the selected models are prioritized
 5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
 
@@ -268,6 +333,7 @@ The `users` field accepts:
 }
 ```
 
+
 ### Multimodal Annotations
 
 Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -348,7 +414,7 @@ Customize the goodbye message shown to users when they complete all annotations
 - **Score**: Numeric quality rating (0-100)
 - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
 - **Error Categories**: MQM taxonomy labels for errors
-- **Template**: The annotation interface type. The `
+- **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
@@ -379,7 +445,7 @@ pearmut run
 2. Add build rule to `webpack.config.js`
 3. Reference as `info->template` in campaign JSON
 
-See [web/src/
+See [web/src/annotate.ts](web/src/annotate.ts) for example.
 
 ### Deployment
 
@@ -390,10 +456,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
 If you use this work in your paper, please cite as following.
 ```bibtex
 @misc{zouhar2026pearmut,
-
-
-
+  title={Pearmut: Human Evaluation of Translation Made Trivial},
+  author={Vilém Zouhar and Tom Kocmi},
+  year={2026},
+  eprint={2601.02933},
+  archivePrefix={arXiv},
+  primaryClass={cs.CL},
+  url={https://arxiv.org/abs/2601.02933},
 }
 ```
 
 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+See changes in [CHANGELOG.md](CHANGELOG.md).
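
The "How it works" list in the diffs above describes the `dynamic` assignment in five steps (initial phase, top-`dynamic_top` selection, contrastive sampling, item prioritization, backoff). The sketch below restates those steps as plain Python to make the control flow concrete; it is an illustration of the described procedure only, not the implementation in `server/assignment.py`, and all names in it are invented for this example.

```python
import random

def pick_next(scores, counts, items, info):
    """Illustrative sketch of the dynamic assignment steps described above.

    scores: model -> list of scores collected so far
    counts: (item_id, model) -> number of annotations so far
    items:  list of item ids
    info:   campaign info holding the dynamic_* parameters
    """
    models = list(scores)
    k = info["dynamic_contrastive_models"]

    # 1. Initial phase: until every model has `dynamic_first` annotations,
    #    pick the contrastive set fully at random.
    if any(len(s) < info["dynamic_first"] for s in scores.values()):
        chosen = random.sample(models, k)
    # 5. Backoff: with probability `dynamic_backoff`, fall back to uniform
    #    random selection to keep exploring weaker models.
    elif random.random() < info["dynamic_backoff"]:
        chosen = random.sample(models, k)
    else:
        # 2. Dynamic phase: rank models by average score and keep the top ones.
        ranked = sorted(models, key=lambda m: sum(scores[m]) / len(scores[m]), reverse=True)
        top = ranked[: info["dynamic_top"]]
        # 3. Contrastive evaluation: sample `dynamic_contrastive_models` of them.
        chosen = random.sample(top, k)

    # 4. Item prioritization: prefer the item with the fewest annotations
    #    for the chosen models.
    item = min(items, key=lambda i: sum(counts.get((i, m), 0) for m in chosen))
    return item, chosen
```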

{pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO (same changes as PKG-INFO above)

{pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt

@@ -10,10 +10,11 @@ pearmut.egg-info/top_level.txt
 server/app.py
 server/assignment.py
 server/cli.py
+server/constants.py
 server/results_export.py
 server/utils.py
-server/static/basic.bundle.js
-server/static/basic.html
+server/static/annotate.bundle.js
+server/static/annotate.html
 server/static/dashboard.bundle.js
 server/static/dashboard.html
 server/static/favicon.svg

{pearmut-1.0.0 → pearmut-1.0.2}/server/app.py

@@ -4,7 +4,7 @@ from typing import Any
 
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response
+from fastapi.responses import FileResponse, JSONResponse, Response
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 
@@ -17,6 +17,7 @@ from .results_export import (
 )
 from .utils import (
     ROOT,
+    TOKEN_MAIN,
     check_validation_threshold,
     load_progress_data,
     save_db_payload,
@@ -192,7 +193,11 @@ async def _dashboard_data(request: DashboardDataRequest):
         progress_new[user_id] = entry
 
     return JSONResponse(
-        content={
+        content={
+            "data": progress_new,
+            "validation_threshold": validation_threshold,
+            "assignment": assignment,
+        },
         status_code=200,
     )
 
@@ -280,6 +285,91 @@ async def _reset_task(request: ResetTaskRequest):
     return response
 
 
+class PurgeCampaignRequest(BaseModel):
+    campaign_id: str
+    token: str
+
+
+@app.post("/purge-campaign")
+async def _purge_campaign(request: PurgeCampaignRequest):
+    global progress_data, tasks_data
+
+    campaign_id = request.campaign_id
+    token = request.token
+
+    if campaign_id not in progress_data:
+        return JSONResponse(content="Unknown campaign ID", status_code=400)
+    if token != tasks_data[campaign_id]["token"]:
+        return JSONResponse(content="Invalid token", status_code=400)
+
+    # Unlink assets if they exist
+    destination = (
+        tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+    )
+    if destination:
+        symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
+        if os.path.islink(symlink_path):
+            os.remove(symlink_path)
+
+    # Remove task file
+    task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
+    if os.path.exists(task_file):
+        os.remove(task_file)
+
+    # Remove output file
+    output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
+    if os.path.exists(output_file):
+        os.remove(output_file)
+
+    # Remove from in-memory data structures
+    del tasks_data[campaign_id]
+    del progress_data[campaign_id]
+
+    # Save updated progress data
+    save_progress_data(progress_data)
+
+    return JSONResponse(content="ok", status_code=200)
+
+
+class AddCampaignRequest(BaseModel):
+    campaign_data: dict[str, Any]
+    token_main: str
+
+
+@app.post("/add-campaign")
+async def _add_campaign(request: AddCampaignRequest):
+    global progress_data, tasks_data
+
+    from .cli import _add_single_campaign
+
+    if request.token_main != TOKEN_MAIN:
+        return JSONResponse(
+            content={"error": "Invalid main token. Use the latest one."},
+            status_code=400,
+        )
+
+    try:
+        server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
+        _add_single_campaign(request.campaign_data, overwrite=False, server=server)
+
+        campaign_id = request.campaign_data["campaign_id"]
+        with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
+            tasks_data[campaign_id] = json.load(f)
+
+        progress_data = load_progress_data(warn=None)
+
+        return JSONResponse(
+            content={
+                "status": "ok",
+                "campaign_id": campaign_id,
+                "token": tasks_data[campaign_id]["token"],
+            },
+            status_code=200,
+        )
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)}, status_code=400)
+
+
 @app.get("/download-annotations")
 async def _download_annotations(
     campaign_id: list[str] = Query(),
@@ -345,6 +435,17 @@ if not os.path.exists(static_dir + "index.html"):
         "Static directory not found. Please build the frontend first."
     )
 
+# Serve HTML files directly without redirect
+@app.get("/annotate")
+async def serve_annotate():
+    return FileResponse(static_dir + "annotate.html")
+
+
+@app.get("/dashboard")
+async def serve_dashboard():
+    return FileResponse(static_dir + "dashboard.html")
+
+
 # Mount user assets from data/assets/
 assets_dir = f"{ROOT}/data/assets"
 os.makedirs(assets_dir, exist_ok=True)
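
The `server/app.py` hunks above add `/add-campaign` and `/purge-campaign` endpoints whose request bodies are the `AddCampaignRequest` (`campaign_data`, `token_main`) and `PurgeCampaignRequest` (`campaign_id`, `token`) models. A minimal client sketch under stated assumptions: the server is reachable on `http://localhost:8001` (the fallback URL used in the hunk), `requests` is installed separately, and how the main token is obtained is not shown in this diff.

```python
import json
import requests

SERVER = "http://localhost:8001"  # assumed; matches the PEARMUT_SERVER_URL fallback above

campaign = json.load(open("my_campaign.json"))  # e.g. the campaign file sketched earlier

# Register a campaign; /add-campaign validates token_main against TOKEN_MAIN.
resp = requests.post(
    f"{SERVER}/add-campaign",
    json={"campaign_data": campaign, "token_main": "<main token>"},
)
resp.raise_for_status()
created = resp.json()  # on success: {"status": "ok", "campaign_id": ..., "token": ...}

# Remove the campaign (task file, output file, in-memory state) via /purge-campaign.
requests.post(
    f"{SERVER}/purge-campaign",
    json={"campaign_id": created["campaign_id"], "token": created["token"]},
)
```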