pearmut 1.0.0.tar.gz → 1.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {pearmut-1.0.0 → pearmut-1.0.2}/PKG-INFO +87 -16
  2. {pearmut-1.0.0 → pearmut-1.0.2}/README.md +86 -15
  3. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO +87 -16
  4. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt +3 -2
  5. {pearmut-1.0.0 → pearmut-1.0.2}/pyproject.toml +1 -1
  6. {pearmut-1.0.0 → pearmut-1.0.2}/server/app.py +103 -2
  7. {pearmut-1.0.0 → pearmut-1.0.2}/server/assignment.py +59 -25
  8. {pearmut-1.0.0 → pearmut-1.0.2}/server/cli.py +241 -150
  9. pearmut-1.0.2/server/constants.py +93 -0
  10. {pearmut-1.0.0 → pearmut-1.0.2}/server/results_export.py +1 -1
  11. pearmut-1.0.2/server/static/annotate.bundle.js +1 -0
  12. pearmut-1.0.2/server/static/annotate.html +160 -0
  13. pearmut-1.0.2/server/static/dashboard.bundle.js +1 -0
  14. {pearmut-1.0.0 → pearmut-1.0.2}/server/static/dashboard.html +6 -1
  15. {pearmut-1.0.0 → pearmut-1.0.2}/server/static/index.html +1 -1
  16. {pearmut-1.0.0 → pearmut-1.0.2}/server/static/style.css +8 -0
  17. {pearmut-1.0.0 → pearmut-1.0.2}/server/utils.py +4 -14
  18. pearmut-1.0.0/server/static/basic.bundle.js +0 -1
  19. pearmut-1.0.0/server/static/basic.html +0 -97
  20. pearmut-1.0.0/server/static/dashboard.bundle.js +0 -1
  21. {pearmut-1.0.0 → pearmut-1.0.2}/LICENSE +0 -0
  22. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/dependency_links.txt +0 -0
  23. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/entry_points.txt +0 -0
  24. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/requires.txt +0 -0
  25. {pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/top_level.txt +0 -0
  26. {pearmut-1.0.0 → pearmut-1.0.2}/server/static/favicon.svg +0 -0
  27. {pearmut-1.0.0 → pearmut-1.0.2}/server/static/index.bundle.js +0 -0
  28. {pearmut-1.0.0 → pearmut-1.0.2}/setup.cfg +0 -0
{pearmut-1.0.0 → pearmut-1.0.2}/PKG-INFO
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pearmut
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: A tool for evaluation of model outputs, primarily MT.
5
5
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
6
6
  License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
19
19
  Requires-Dist: pytest; extra == "dev"
20
20
  Dynamic: license-file
21
21
 
22
- # Pearmut 🍐
22
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)
23
23
 
24
24
  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
25
25
 
26
- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
27
- &nbsp;
28
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
29
- &nbsp;
30
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
31
- &nbsp;
32
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
33
26
 
34
27
  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
35
28
 
@@ -52,6 +45,8 @@ Dynamic: license-file
52
45
  - [Terminology](#terminology)
53
46
  - [Development](#development)
54
47
  - [Citation](#citation)
48
+ - [Changelog](#changelog)
49
+
55
50
 
56
51
  ## Quick Start
57
52
 
@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
111
106
  }
112
107
  ```
113
108
 
114
- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
109
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
110
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
111
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
115
112
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
116
113
  Any other keys that you add will simply be stored in the logs.
117
114
 
@@ -145,6 +142,74 @@ The `shuffle` parameter in campaign `info` controls this behavior:
145
142
  }
146
143
  ```
147
144
 
145
+ ### Showing Model Names
146
+
147
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
148
+ ```python
149
+ {
150
+ "info": {
151
+ "assignment": "task-based",
152
+ "protocol": "ESA",
153
+ "show_model_names": true # Default: false.
154
+ },
155
+ "campaign_id": "my_campaign",
156
+ "data": [...]
157
+ }
158
+ ```
159
+
160
+ ### Custom Score Sliders
161
+
162
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
163
+
164
+ ```python
165
+ {
166
+ "info": {
167
+ "assignment": "task-based",
168
+ "protocol": "ESA",
169
+ "sliders": [
170
+ {"name": "Fluency", "min": 0, "max": 5, "step": 1},
171
+ {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
172
+ ]
173
+ },
174
+ "campaign_id": "my_campaign",
175
+ "data": [...]
176
+ }
177
+ ```
178
+
179
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
180
+
181
+ ### Textfield for Post-editing/Translation
182
+
183
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
184
+
185
+ ```python
186
+ {
187
+ "info": {
188
+ "protocol": "DA",
189
+ "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
190
+ }
191
+ }
192
+ ```
193
+
194
+ **Textfield modes:**
195
+ - `null` or omitted: No textfield (default)
196
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
197
+ - `"visible"`: Textfield always visible
198
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
199
+
200
+ ### Custom Instructions
201
+
202
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
203
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
204
+ ```python
205
+ {
206
+ "info": {
207
+ "protocol": "DA",
208
+ "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
209
+ }
210
+ }
211
+ ```
212
+
148
213
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)
149
214
 
150
215
  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +328,7 @@ All items must contain outputs from all models for this assignment type to work
263
328
  **How it works:**
264
329
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
265
330
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
266
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
331
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
267
332
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
268
333
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
269
334
 
@@ -289,6 +354,7 @@ The `users` field accepts:
289
354
  }
290
355
  ```
291
356
 
357
+
292
358
  ### Multimodal Annotations
293
359
 
294
360
  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -369,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
369
435
  - **Score**: Numeric quality rating (0-100)
370
436
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
371
437
  - **Error Categories**: MQM taxonomy labels for errors
372
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
438
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
373
439
  - **Assignment**: The method for distributing items to users:
374
440
  - **Task-based**: Each user has predefined items
375
441
  - **Single-stream**: Users draw from a shared pool with random assignment
@@ -400,7 +466,7 @@ pearmut run
400
466
  2. Add build rule to `webpack.config.js`
401
467
  3. Reference as `info->template` in campaign JSON
402
468
 
403
- See [web/src/basic.ts](web/src/basic.ts) for example.
469
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.
404
470
 
405
471
  ### Deployment
406
472
 
@@ -411,10 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
411
477
  If you use this work in your paper, please cite as following.
412
478
  ```bibtex
413
479
  @misc{zouhar2026pearmut,
414
- author = {Zouhar, Vilém},
415
- title = {Pearmut: Human Evaluation of Translation Made Trivial},
416
- year = {2026}
480
+ title={Pearmut: Human Evaluation of Translation Made Trivial},
481
+ author={Vilém Zouhar and Tom Kocmi},
482
+ year={2026},
483
+ eprint={2601.02933},
484
+ archivePrefix={arXiv},
485
+ primaryClass={cs.CL},
486
+ url={https://arxiv.org/abs/2601.02933},
417
487
  }
418
488
  ```
419
489
 
420
490
  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
491
+ See changes in [CHANGELOG.md](CHANGELOG.md).
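As an aside, the relaxed item schema described in the hunks above (only `tgt` is required; `src` and `ref` are optional) can be illustrated with a minimal campaign file. This is only a sketch based on the README text in this diff; the concrete item IDs and strings are made up for illustration.

```python
# Sketch of a minimal campaign under the relaxed item schema from this diff:
# `tgt` is required (model name -> output string); `src`/`ref` are optional.
import json

campaign = {
    "campaign_id": "my_campaign",
    "info": {"protocol": "ESA", "assignment": "task-based"},
    "data": [
        # Item with a source text and one model output.
        {"item_id": "doc1#1", "src": "Guten Morgen!", "tgt": {"model_a": "Good morning!"}},
        # Item with only model outputs; nothing besides the outputs is displayed.
        {"item_id": "doc1#2", "tgt": {"model_a": "See you tomorrow."}},
    ],
}

with open("my_campaign.json", "w") as f:
    json.dump(campaign, f, ensure_ascii=False, indent=2)
```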
{pearmut-1.0.0 → pearmut-1.0.2}/README.md
@@ -1,14 +1,7 @@
1
- # Pearmut 🍐
1
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)
2
2
 
3
3
  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
4
4
 
5
- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
6
- &nbsp;
7
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
8
- &nbsp;
9
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
10
- &nbsp;
11
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
12
5
 
13
6
  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
14
7
 
@@ -31,6 +24,8 @@
31
24
  - [Terminology](#terminology)
32
25
  - [Development](#development)
33
26
  - [Citation](#citation)
27
+ - [Changelog](#changelog)
28
+
34
29
 
35
30
  ## Quick Start
36
31
 
@@ -90,7 +85,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
90
85
  }
91
86
  ```
92
87
 
93
- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
88
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
89
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
90
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
94
91
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
95
92
  Any other keys that you add will simply be stored in the logs.
96
93
 
@@ -124,6 +121,74 @@ The `shuffle` parameter in campaign `info` controls this behavior:
124
121
  }
125
122
  ```
126
123
 
124
+ ### Showing Model Names
125
+
126
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
127
+ ```python
128
+ {
129
+ "info": {
130
+ "assignment": "task-based",
131
+ "protocol": "ESA",
132
+ "show_model_names": true # Default: false.
133
+ },
134
+ "campaign_id": "my_campaign",
135
+ "data": [...]
136
+ }
137
+ ```
138
+
139
+ ### Custom Score Sliders
140
+
141
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
142
+
143
+ ```python
144
+ {
145
+ "info": {
146
+ "assignment": "task-based",
147
+ "protocol": "ESA",
148
+ "sliders": [
149
+ {"name": "Fluency", "min": 0, "max": 5, "step": 1},
150
+ {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
151
+ ]
152
+ },
153
+ "campaign_id": "my_campaign",
154
+ "data": [...]
155
+ }
156
+ ```
157
+
158
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
159
+
160
+ ### Textfield for Post-editing/Translation
161
+
162
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
163
+
164
+ ```python
165
+ {
166
+ "info": {
167
+ "protocol": "DA",
168
+ "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
169
+ }
170
+ }
171
+ ```
172
+
173
+ **Textfield modes:**
174
+ - `null` or omitted: No textfield (default)
175
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
176
+ - `"visible"`: Textfield always visible
177
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
178
+
179
+ ### Custom Instructions
180
+
181
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
182
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
183
+ ```python
184
+ {
185
+ "info": {
186
+ "protocol": "DA",
187
+ "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
188
+ }
189
+ }
190
+ ```
191
+
127
192
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)
128
193
 
129
194
  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -242,7 +307,7 @@ All items must contain outputs from all models for this assignment type to work
242
307
  **How it works:**
243
308
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
244
309
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
245
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
310
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
246
311
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
247
312
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
248
313
 
@@ -268,6 +333,7 @@ The `users` field accepts:
268
333
  }
269
334
  ```
270
335
 
336
+
271
337
  ### Multimodal Annotations
272
338
 
273
339
  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -348,7 +414,7 @@ Customize the goodbye message shown to users when they complete all annotations
348
414
  - **Score**: Numeric quality rating (0-100)
349
415
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
350
416
  - **Error Categories**: MQM taxonomy labels for errors
351
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
417
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
352
418
  - **Assignment**: The method for distributing items to users:
353
419
  - **Task-based**: Each user has predefined items
354
420
  - **Single-stream**: Users draw from a shared pool with random assignment
@@ -379,7 +445,7 @@ pearmut run
379
445
  2. Add build rule to `webpack.config.js`
380
446
  3. Reference as `info->template` in campaign JSON
381
447
 
382
- See [web/src/basic.ts](web/src/basic.ts) for example.
448
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.
383
449
 
384
450
  ### Deployment
385
451
 
@@ -390,10 +456,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
390
456
  If you use this work in your paper, please cite as following.
391
457
  ```bibtex
392
458
  @misc{zouhar2026pearmut,
393
- author = {Zouhar, Vilém},
394
- title = {Pearmut: Human Evaluation of Translation Made Trivial},
395
- year = {2026}
459
+ title={Pearmut: Human Evaluation of Translation Made Trivial},
460
+ author={Vilém Zouhar and Tom Kocmi},
461
+ year={2026},
462
+ eprint={2601.02933},
463
+ archivePrefix={arXiv},
464
+ primaryClass={cs.CL},
465
+ url={https://arxiv.org/abs/2601.02933},
396
466
  }
397
467
  ```
398
468
 
399
469
  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
470
+ See changes in [CHANGELOG.md](CHANGELOG.md).
{pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pearmut
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: A tool for evaluation of model outputs, primarily MT.
5
5
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
6
6
  License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
19
19
  Requires-Dist: pytest; extra == "dev"
20
20
  Dynamic: license-file
21
21
 
22
- # Pearmut 🍐
22
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)
23
23
 
24
24
  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
25
25
 
26
- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
27
- &nbsp;
28
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
29
- &nbsp;
30
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
31
- &nbsp;
32
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
33
26
 
34
27
  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
35
28
 
@@ -52,6 +45,8 @@ Dynamic: license-file
52
45
  - [Terminology](#terminology)
53
46
  - [Development](#development)
54
47
  - [Citation](#citation)
48
+ - [Changelog](#changelog)
49
+
55
50
 
56
51
  ## Quick Start
57
52
 
@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
111
106
  }
112
107
  ```
113
108
 
114
- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
109
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
110
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
111
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
115
112
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
116
113
  Any other keys that you add will simply be stored in the logs.
117
114
 
@@ -145,6 +142,74 @@ The `shuffle` parameter in campaign `info` controls this behavior:
145
142
  }
146
143
  ```
147
144
 
145
+ ### Showing Model Names
146
+
147
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
148
+ ```python
149
+ {
150
+ "info": {
151
+ "assignment": "task-based",
152
+ "protocol": "ESA",
153
+ "show_model_names": true # Default: false.
154
+ },
155
+ "campaign_id": "my_campaign",
156
+ "data": [...]
157
+ }
158
+ ```
159
+
160
+ ### Custom Score Sliders
161
+
162
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
163
+
164
+ ```python
165
+ {
166
+ "info": {
167
+ "assignment": "task-based",
168
+ "protocol": "ESA",
169
+ "sliders": [
170
+ {"name": "Fluency", "min": 0, "max": 5, "step": 1},
171
+ {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
172
+ ]
173
+ },
174
+ "campaign_id": "my_campaign",
175
+ "data": [...]
176
+ }
177
+ ```
178
+
179
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
180
+
181
+ ### Textfield for Post-editing/Translation
182
+
183
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
184
+
185
+ ```python
186
+ {
187
+ "info": {
188
+ "protocol": "DA",
189
+ "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
190
+ }
191
+ }
192
+ ```
193
+
194
+ **Textfield modes:**
195
+ - `null` or omitted: No textfield (default)
196
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
197
+ - `"visible"`: Textfield always visible
198
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
199
+
200
+ ### Custom Instructions
201
+
202
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
203
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
204
+ ```python
205
+ {
206
+ "info": {
207
+ "protocol": "DA",
208
+ "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
209
+ }
210
+ }
211
+ ```
212
+
148
213
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)
149
214
 
150
215
  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +328,7 @@ All items must contain outputs from all models for this assignment type to work
263
328
  **How it works:**
264
329
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
265
330
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
266
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
331
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
267
332
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
268
333
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
269
334
 
@@ -289,6 +354,7 @@ The `users` field accepts:
289
354
  }
290
355
  ```
291
356
 
357
+
292
358
  ### Multimodal Annotations
293
359
 
294
360
  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -369,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
369
435
  - **Score**: Numeric quality rating (0-100)
370
436
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
371
437
  - **Error Categories**: MQM taxonomy labels for errors
372
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
438
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
373
439
  - **Assignment**: The method for distributing items to users:
374
440
  - **Task-based**: Each user has predefined items
375
441
  - **Single-stream**: Users draw from a shared pool with random assignment
@@ -400,7 +466,7 @@ pearmut run
400
466
  2. Add build rule to `webpack.config.js`
401
467
  3. Reference as `info->template` in campaign JSON
402
468
 
403
- See [web/src/basic.ts](web/src/basic.ts) for example.
469
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.
404
470
 
405
471
  ### Deployment
406
472
 
@@ -411,10 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
411
477
  If you use this work in your paper, please cite as following.
412
478
  ```bibtex
413
479
  @misc{zouhar2026pearmut,
414
- author = {Zouhar, Vilém},
415
- title = {Pearmut: Human Evaluation of Translation Made Trivial},
416
- year = {2026}
480
+ title={Pearmut: Human Evaluation of Translation Made Trivial},
481
+ author={Vilém Zouhar and Tom Kocmi},
482
+ year={2026},
483
+ eprint={2601.02933},
484
+ archivePrefix={arXiv},
485
+ primaryClass={cs.CL},
486
+ url={https://arxiv.org/abs/2601.02933},
417
487
  }
418
488
  ```
419
489
 
420
490
  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
491
+ See changes in [CHANGELOG.md](CHANGELOG.md).
{pearmut-1.0.0 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt
@@ -10,10 +10,11 @@ pearmut.egg-info/top_level.txt
10
10
  server/app.py
11
11
  server/assignment.py
12
12
  server/cli.py
13
+ server/constants.py
13
14
  server/results_export.py
14
15
  server/utils.py
15
- server/static/basic.bundle.js
16
- server/static/basic.html
16
+ server/static/annotate.bundle.js
17
+ server/static/annotate.html
17
18
  server/static/dashboard.bundle.js
18
19
  server/static/dashboard.html
19
20
  server/static/favicon.svg
{pearmut-1.0.0 → pearmut-1.0.2}/pyproject.toml
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "pearmut"
3
- version = "1.0.0"
3
+ version = "1.0.2"
4
4
  description = "A tool for evaluation of model outputs, primarily MT."
5
5
  readme = "README.md"
6
6
  license = { text = "MIT" }
{pearmut-1.0.0 → pearmut-1.0.2}/server/app.py
@@ -4,7 +4,7 @@ from typing import Any
4
4
 
5
5
  from fastapi import FastAPI, Query
6
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from fastapi.responses import JSONResponse, Response
7
+ from fastapi.responses import FileResponse, JSONResponse, Response
8
8
  from fastapi.staticfiles import StaticFiles
9
9
  from pydantic import BaseModel
10
10
 
@@ -17,6 +17,7 @@ from .results_export import (
17
17
  )
18
18
  from .utils import (
19
19
  ROOT,
20
+ TOKEN_MAIN,
20
21
  check_validation_threshold,
21
22
  load_progress_data,
22
23
  save_db_payload,
@@ -192,7 +193,11 @@ async def _dashboard_data(request: DashboardDataRequest):
192
193
  progress_new[user_id] = entry
193
194
 
194
195
  return JSONResponse(
195
- content={"data": progress_new, "validation_threshold": validation_threshold},
196
+ content={
197
+ "data": progress_new,
198
+ "validation_threshold": validation_threshold,
199
+ "assignment": assignment,
200
+ },
196
201
  status_code=200,
197
202
  )
198
203
 
@@ -280,6 +285,91 @@ async def _reset_task(request: ResetTaskRequest):
280
285
  return response
281
286
 
282
287
 
288
+ class PurgeCampaignRequest(BaseModel):
289
+ campaign_id: str
290
+ token: str
291
+
292
+
293
+ @app.post("/purge-campaign")
294
+ async def _purge_campaign(request: PurgeCampaignRequest):
295
+ global progress_data, tasks_data
296
+
297
+ campaign_id = request.campaign_id
298
+ token = request.token
299
+
300
+ if campaign_id not in progress_data:
301
+ return JSONResponse(content="Unknown campaign ID", status_code=400)
302
+ if token != tasks_data[campaign_id]["token"]:
303
+ return JSONResponse(content="Invalid token", status_code=400)
304
+
305
+ # Unlink assets if they exist
306
+ destination = (
307
+ tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
308
+ )
309
+ if destination:
310
+ symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
311
+ if os.path.islink(symlink_path):
312
+ os.remove(symlink_path)
313
+
314
+ # Remove task file
315
+ task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
316
+ if os.path.exists(task_file):
317
+ os.remove(task_file)
318
+
319
+ # Remove output file
320
+ output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
321
+ if os.path.exists(output_file):
322
+ os.remove(output_file)
323
+
324
+ # Remove from in-memory data structures
325
+ del tasks_data[campaign_id]
326
+ del progress_data[campaign_id]
327
+
328
+ # Save updated progress data
329
+ save_progress_data(progress_data)
330
+
331
+ return JSONResponse(content="ok", status_code=200)
332
+
333
+
334
+ class AddCampaignRequest(BaseModel):
335
+ campaign_data: dict[str, Any]
336
+ token_main: str
337
+
338
+
339
+ @app.post("/add-campaign")
340
+ async def _add_campaign(request: AddCampaignRequest):
341
+ global progress_data, tasks_data
342
+
343
+ from .cli import _add_single_campaign
344
+
345
+ if request.token_main != TOKEN_MAIN:
346
+ return JSONResponse(
347
+ content={"error": "Invalid main token. Use the latest one."},
348
+ status_code=400,
349
+ )
350
+
351
+ try:
352
+ server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
353
+ _add_single_campaign(request.campaign_data, overwrite=False, server=server)
354
+
355
+ campaign_id = request.campaign_data["campaign_id"]
356
+ with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
357
+ tasks_data[campaign_id] = json.load(f)
358
+
359
+ progress_data = load_progress_data(warn=None)
360
+
361
+ return JSONResponse(
362
+ content={
363
+ "status": "ok",
364
+ "campaign_id": campaign_id,
365
+ "token": tasks_data[campaign_id]["token"],
366
+ },
367
+ status_code=200,
368
+ )
369
+ except Exception as e:
370
+ return JSONResponse(content={"error": str(e)}, status_code=400)
371
+
372
+
283
373
  @app.get("/download-annotations")
284
374
  async def _download_annotations(
285
375
  campaign_id: list[str] = Query(),
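For orientation, a hedged sketch of how the two endpoints added in the hunk above might be called from a client, assuming a server at the default `http://localhost:8001` mentioned in the diff and the `AddCampaignRequest`/`PurgeCampaignRequest` models shown there; the token values are placeholders, not real credentials.

```python
# Illustrative client calls for the new /add-campaign and /purge-campaign
# endpoints, based on the request models shown in the hunk above.
import requests

SERVER = "http://localhost:8001"  # assumed default, cf. PEARMUT_SERVER_URL above

# Register a campaign; `token_main` must match the server's main token.
resp = requests.post(
    f"{SERVER}/add-campaign",
    json={
        "campaign_data": {
            "campaign_id": "my_campaign",
            "info": {"protocol": "ESA", "assignment": "task-based"},
            "data": [{"item_id": "doc1#1", "tgt": {"model_a": "Good morning!"}}],
        },
        "token_main": "<main token>",  # placeholder
    },
)
campaign_token = resp.json()["token"]  # handler returns campaign_id and token

# Remove the campaign along with its task file and output file.
requests.post(
    f"{SERVER}/purge-campaign",
    json={"campaign_id": "my_campaign", "token": campaign_token},
)
```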
@@ -345,6 +435,17 @@ if not os.path.exists(static_dir + "index.html"):
345
435
  "Static directory not found. Please build the frontend first."
346
436
  )
347
437
 
438
+ # Serve HTML files directly without redirect
439
+ @app.get("/annotate")
440
+ async def serve_annotate():
441
+ return FileResponse(static_dir + "annotate.html")
442
+
443
+
444
+ @app.get("/dashboard")
445
+ async def serve_dashboard():
446
+ return FileResponse(static_dir + "dashboard.html")
447
+
448
+
348
449
  # Mount user assets from data/assets/
349
450
  assets_dir = f"{ROOT}/data/assets"
350
451
  os.makedirs(assets_dir, exist_ok=True)