pearmut 1.0.1.tar.gz → 1.0.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {pearmut-1.0.1 → pearmut-1.0.2}/PKG-INFO +46 -65
  2. {pearmut-1.0.1 → pearmut-1.0.2}/README.md +45 -64
  3. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO +46 -65
  4. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt +2 -2
  5. {pearmut-1.0.1 → pearmut-1.0.2}/pyproject.toml +1 -1
  6. {pearmut-1.0.1 → pearmut-1.0.2}/server/app.py +48 -20
  7. {pearmut-1.0.1 → pearmut-1.0.2}/server/assignment.py +12 -31
  8. {pearmut-1.0.1 → pearmut-1.0.2}/server/cli.py +45 -27
  9. {pearmut-1.0.1 → pearmut-1.0.2}/server/results_export.py +1 -1
  10. pearmut-1.0.2/server/static/annotate.bundle.js +1 -0
  11. pearmut-1.0.1/server/static/basic.html → pearmut-1.0.2/server/static/annotate.html +30 -3
  12. pearmut-1.0.2/server/static/dashboard.bundle.js +1 -0
  13. {pearmut-1.0.1 → pearmut-1.0.2}/server/static/dashboard.html +6 -1
  14. {pearmut-1.0.1 → pearmut-1.0.2}/server/static/index.html +1 -1
  15. {pearmut-1.0.1 → pearmut-1.0.2}/server/static/style.css +8 -0
  16. {pearmut-1.0.1 → pearmut-1.0.2}/server/utils.py +3 -1
  17. pearmut-1.0.1/server/static/basic.bundle.js +0 -1
  18. pearmut-1.0.1/server/static/dashboard.bundle.js +0 -1
  19. {pearmut-1.0.1 → pearmut-1.0.2}/LICENSE +0 -0
  20. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/dependency_links.txt +0 -0
  21. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/entry_points.txt +0 -0
  22. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/requires.txt +0 -0
  23. {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/top_level.txt +0 -0
  24. {pearmut-1.0.1 → pearmut-1.0.2}/server/constants.py +0 -0
  25. {pearmut-1.0.1 → pearmut-1.0.2}/server/static/favicon.svg +0 -0
  26. {pearmut-1.0.1 → pearmut-1.0.2}/server/static/index.bundle.js +0 -0
  27. {pearmut-1.0.1 → pearmut-1.0.2}/setup.cfg +0 -0
{pearmut-1.0.1 → pearmut-1.0.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pearmut
- Version: 1.0.1
+ Version: 1.0.2
  Summary: A tool for evaluation of model outputs, primarily MT.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
@@ -19,7 +19,7 @@ Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
  Dynamic: license-file

- # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

@@ -142,6 +142,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Showing Model Names
+
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "show_model_names": true # Default: false.
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
  ### Custom Score Sliders

  For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -163,6 +178,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal

  When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.

+ ### Textfield for Post-editing/Translation
+
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+     }
+ }
+ ```
+
+ **Textfield modes:**
+ - `null` or omitted: No textfield (default)
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
+ - `"visible"`: Textfield always visible
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
  ### Custom Instructions

  Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -401,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
  - **Score**: Numeric quality rating (0-100)
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
  - **Error Categories**: MQM taxonomy labels for errors
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
  - **Assignment**: The method for distributing items to users:
    - **Task-based**: Each user has predefined items
    - **Single-stream**: Users draw from a shared pool with random assignment
@@ -432,7 +466,7 @@ pearmut run
  2. Add build rule to `webpack.config.js`
  3. Reference as `info->template` in campaign JSON

- See [web/src/basic.ts](web/src/basic.ts) for example.
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.

  ### Deployment

@@ -443,68 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
  If you use this work in your paper, please cite as following.
  ```bibtex
  @misc{zouhar2026pearmut,
-     author = {Zouhar, Vilém},
-     title = {Pearmut: Human Evaluation of Translation Made Trivial},
-     year = {2026}
+     title={Pearmut: Human Evaluation of Translation Made Trivial},
+     author={Vilém Zouhar and Tom Kocmi},
+     year={2026},
+     eprint={2601.02933},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL},
+     url={https://arxiv.org/abs/2601.02933},
  }
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
- # Changelog
-
- - v1.0.1
-   - Support RTL languages
-   - Add boxes for references
-   - Add custom score sliders for multi-dimensional evaluation
-   - Make instructions customizable and protocol-dependent
-   - Support custom sliders
-   - Purge/reset whole tasks from dashboard
-   - Fix resetting individual users in single-stream/dynamic
-   - Fix notification stacking
-   - Add campaigns from dashboard
- - v0.3.3
-   - Rename `doc_id` to `item_id`
-   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-   - Add dynamic assignment type with contrastive model comparison
-   - Add `instructions_goodbye` field with variable substitution
-   - Add visual anchors at 33% and 66% on sliders
-   - Add German→English ESA tutorial with attention checks
-   - Validate document model consistency before shuffle
-   - Fix UI block on any interaction
- - v0.3.2
-   - Revert seeding of user IDs
-   - Set ESA (Error Span Annotation) as default
-   - Update server IP address configuration
-   - Show approximate alignment by default
-   - Unify pointwise and listwise interfaces into `basic`
-   - Refactor protocol configuration (breaking change)
- - v0.2.11
-   - Add comment field in settings panel
-   - Add `score_gt` validation for listwise comparisons
-   - Add Content-Disposition headers for proper download filenames
-   - Add model results display to dashboard with rankings
-   - Add campaign file structure validation
-   - Purge command now unlinks assets
- - v0.2.6
-   - Add frozen annotation links feature for view-only mode
-   - Add word-level annotation mode toggle for error spans
-   - Add `[missing]` token support
-   - Improve frontend speed and cleanup toolboxes on item load
-   - Host assets via symlinks
-   - Add validation threshold for success/fail tokens
-   - Implement reset masking for annotations
-   - Allow pre-defined user IDs and tokens in campaign data
- - v0.1.1
-   - Set server defaults and add VM launch scripts
-   - Add warning dialog when navigating away with unsaved work
-   - Add tutorial validation support for pointwise and listwise
-   - Add ability to preview existing annotations via progress bar
-   - Add support for ESA<sup>AI</sup> pre-filled error_spans
-   - Rename pairwise to listwise and update layout
-   - Implement single-stream assignment type
- - v0.0.3
-   - Support multimodal inputs and outputs
-   - Add dashboard
-   - Implement ESA (Error Span Annotation) and MQM support
-
+ See changes in [CHANGELOG.md](CHANGELOG.md).
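For orientation, the two options this release adds to the README (`show_model_names` and `textfield`) can be combined in a single campaign file. A minimal sketch assembled only from the snippets in the diff above; `my_campaign` and the `data` list are placeholders:

```python
{
    "info": {
        "assignment": "task-based",
        "protocol": "ESA",
        "show_model_names": true,  # New in 1.0.2; defaults to false.
        "textfield": "prefilled"   # New in 1.0.2; null, "hidden", "visible", or "prefilled".
    },
    "campaign_id": "my_campaign",
    "data": [...]
}
```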
{pearmut-1.0.1 → pearmut-1.0.2}/README.md

@@ -1,4 +1,4 @@
- # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

@@ -121,6 +121,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Showing Model Names
+
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "show_model_names": true # Default: false.
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
  ### Custom Score Sliders

  For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -142,6 +157,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal

  When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.

+ ### Textfield for Post-editing/Translation
+
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+     }
+ }
+ ```
+
+ **Textfield modes:**
+ - `null` or omitted: No textfield (default)
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
+ - `"visible"`: Textfield always visible
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
  ### Custom Instructions

  Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -380,7 +414,7 @@ Customize the goodbye message shown to users when they complete all annotations
  - **Score**: Numeric quality rating (0-100)
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
  - **Error Categories**: MQM taxonomy labels for errors
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
  - **Assignment**: The method for distributing items to users:
    - **Task-based**: Each user has predefined items
    - **Single-stream**: Users draw from a shared pool with random assignment
@@ -411,7 +445,7 @@ pearmut run
  2. Add build rule to `webpack.config.js`
  3. Reference as `info->template` in campaign JSON

- See [web/src/basic.ts](web/src/basic.ts) for example.
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.

  ### Deployment

@@ -422,68 +456,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
  If you use this work in your paper, please cite as following.
  ```bibtex
  @misc{zouhar2026pearmut,
-     author = {Zouhar, Vilém},
-     title = {Pearmut: Human Evaluation of Translation Made Trivial},
-     year = {2026}
+     title={Pearmut: Human Evaluation of Translation Made Trivial},
+     author={Vilém Zouhar and Tom Kocmi},
+     year={2026},
+     eprint={2601.02933},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL},
+     url={https://arxiv.org/abs/2601.02933},
  }
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
- # Changelog
-
- - v1.0.1
-   - Support RTL languages
-   - Add boxes for references
-   - Add custom score sliders for multi-dimensional evaluation
-   - Make instructions customizable and protocol-dependent
-   - Support custom sliders
-   - Purge/reset whole tasks from dashboard
-   - Fix resetting individual users in single-stream/dynamic
-   - Fix notification stacking
-   - Add campaigns from dashboard
- - v0.3.3
-   - Rename `doc_id` to `item_id`
-   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-   - Add dynamic assignment type with contrastive model comparison
-   - Add `instructions_goodbye` field with variable substitution
-   - Add visual anchors at 33% and 66% on sliders
-   - Add German→English ESA tutorial with attention checks
-   - Validate document model consistency before shuffle
-   - Fix UI block on any interaction
- - v0.3.2
-   - Revert seeding of user IDs
-   - Set ESA (Error Span Annotation) as default
-   - Update server IP address configuration
-   - Show approximate alignment by default
-   - Unify pointwise and listwise interfaces into `basic`
-   - Refactor protocol configuration (breaking change)
- - v0.2.11
-   - Add comment field in settings panel
-   - Add `score_gt` validation for listwise comparisons
-   - Add Content-Disposition headers for proper download filenames
-   - Add model results display to dashboard with rankings
-   - Add campaign file structure validation
-   - Purge command now unlinks assets
- - v0.2.6
-   - Add frozen annotation links feature for view-only mode
-   - Add word-level annotation mode toggle for error spans
-   - Add `[missing]` token support
-   - Improve frontend speed and cleanup toolboxes on item load
-   - Host assets via symlinks
-   - Add validation threshold for success/fail tokens
-   - Implement reset masking for annotations
-   - Allow pre-defined user IDs and tokens in campaign data
- - v0.1.1
-   - Set server defaults and add VM launch scripts
-   - Add warning dialog when navigating away with unsaved work
-   - Add tutorial validation support for pointwise and listwise
-   - Add ability to preview existing annotations via progress bar
-   - Add support for ESA<sup>AI</sup> pre-filled error_spans
-   - Rename pairwise to listwise and update layout
-   - Implement single-stream assignment type
- - v0.0.3
-   - Support multimodal inputs and outputs
-   - Add dashboard
-   - Implement ESA (Error Span Annotation) and MQM support
-
+ See changes in [CHANGELOG.md](CHANGELOG.md).
{pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pearmut
- Version: 1.0.1
+ Version: 1.0.2
  Summary: A tool for evaluation of model outputs, primarily MT.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
@@ -19,7 +19,7 @@ Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
  Dynamic: license-file

- # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
+ # 🍐Pearmut <br> [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [![arXiv](https://img.shields.io/badge/arXiv-2601.02933-b31b1b.svg?style=flat)](https://arxiv.org/abs/2601.02933)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

@@ -142,6 +142,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Showing Model Names
+
+ By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "show_model_names": true # Default: false.
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
  ### Custom Score Sliders

  For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -163,6 +178,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal

  When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.

+ ### Textfield for Post-editing/Translation
+
+ Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+     }
+ }
+ ```
+
+ **Textfield modes:**
+ - `null` or omitted: No textfield (default)
+ - `"hidden"`: Textfield hidden by default, shown by clicking a button
+ - `"visible"`: Textfield always visible
+ - `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
  ### Custom Instructions

  Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -401,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
  - **Score**: Numeric quality rating (0-100)
  - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
  - **Error Categories**: MQM taxonomy labels for errors
- - **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+ - **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
  - **Assignment**: The method for distributing items to users:
    - **Task-based**: Each user has predefined items
    - **Single-stream**: Users draw from a shared pool with random assignment
@@ -432,7 +466,7 @@ pearmut run
  2. Add build rule to `webpack.config.js`
  3. Reference as `info->template` in campaign JSON

- See [web/src/basic.ts](web/src/basic.ts) for example.
+ See [web/src/annotate.ts](web/src/annotate.ts) for example.

  ### Deployment

@@ -443,68 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
  If you use this work in your paper, please cite as following.
  ```bibtex
  @misc{zouhar2026pearmut,
-     author = {Zouhar, Vilém},
-     title = {Pearmut: Human Evaluation of Translation Made Trivial},
-     year = {2026}
+     title={Pearmut: Human Evaluation of Translation Made Trivial},
+     author={Vilém Zouhar and Tom Kocmi},
+     year={2026},
+     eprint={2601.02933},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL},
+     url={https://arxiv.org/abs/2601.02933},
  }
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
- # Changelog
-
- - v1.0.1
-   - Support RTL languages
-   - Add boxes for references
-   - Add custom score sliders for multi-dimensional evaluation
-   - Make instructions customizable and protocol-dependent
-   - Support custom sliders
-   - Purge/reset whole tasks from dashboard
-   - Fix resetting individual users in single-stream/dynamic
-   - Fix notification stacking
-   - Add campaigns from dashboard
- - v0.3.3
-   - Rename `doc_id` to `item_id`
-   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-   - Add dynamic assignment type with contrastive model comparison
-   - Add `instructions_goodbye` field with variable substitution
-   - Add visual anchors at 33% and 66% on sliders
-   - Add German→English ESA tutorial with attention checks
-   - Validate document model consistency before shuffle
-   - Fix UI block on any interaction
- - v0.3.2
-   - Revert seeding of user IDs
-   - Set ESA (Error Span Annotation) as default
-   - Update server IP address configuration
-   - Show approximate alignment by default
-   - Unify pointwise and listwise interfaces into `basic`
-   - Refactor protocol configuration (breaking change)
- - v0.2.11
-   - Add comment field in settings panel
-   - Add `score_gt` validation for listwise comparisons
-   - Add Content-Disposition headers for proper download filenames
-   - Add model results display to dashboard with rankings
-   - Add campaign file structure validation
-   - Purge command now unlinks assets
- - v0.2.6
-   - Add frozen annotation links feature for view-only mode
-   - Add word-level annotation mode toggle for error spans
-   - Add `[missing]` token support
-   - Improve frontend speed and cleanup toolboxes on item load
-   - Host assets via symlinks
-   - Add validation threshold for success/fail tokens
-   - Implement reset masking for annotations
-   - Allow pre-defined user IDs and tokens in campaign data
- - v0.1.1
-   - Set server defaults and add VM launch scripts
-   - Add warning dialog when navigating away with unsaved work
-   - Add tutorial validation support for pointwise and listwise
-   - Add ability to preview existing annotations via progress bar
-   - Add support for ESA<sup>AI</sup> pre-filled error_spans
-   - Rename pairwise to listwise and update layout
-   - Implement single-stream assignment type
- - v0.0.3
-   - Support multimodal inputs and outputs
-   - Add dashboard
-   - Implement ESA (Error Span Annotation) and MQM support
-
+ See changes in [CHANGELOG.md](CHANGELOG.md).
{pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt

@@ -13,8 +13,8 @@ server/cli.py
  server/constants.py
  server/results_export.py
  server/utils.py
- server/static/basic.bundle.js
- server/static/basic.html
+ server/static/annotate.bundle.js
+ server/static/annotate.html
  server/static/dashboard.bundle.js
  server/static/dashboard.html
  server/static/favicon.svg
{pearmut-1.0.1 → pearmut-1.0.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "pearmut"
- version = "1.0.1"
+ version = "1.0.2"
  description = "A tool for evaluation of model outputs, primarily MT."
  readme = "README.md"
  license = { text = "MIT" }
{pearmut-1.0.1 → pearmut-1.0.2}/server/app.py

@@ -4,7 +4,7 @@ from typing import Any

  from fastapi import FastAPI, Query
  from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import JSONResponse, Response
+ from fastapi.responses import FileResponse, JSONResponse, Response
  from fastapi.staticfiles import StaticFiles
  from pydantic import BaseModel

@@ -17,6 +17,7 @@ from .results_export import (
  )
  from .utils import (
      ROOT,
+     TOKEN_MAIN,
      check_validation_threshold,
      load_progress_data,
      save_db_payload,
@@ -192,7 +193,11 @@ async def _dashboard_data(request: DashboardDataRequest):
          progress_new[user_id] = entry

      return JSONResponse(
-         content={"data": progress_new, "validation_threshold": validation_threshold},
+         content={
+             "data": progress_new,
+             "validation_threshold": validation_threshold,
+             "assignment": assignment,
+         },
          status_code=200,
      )

@@ -288,7 +293,7 @@ class PurgeCampaignRequest(BaseModel):
  @app.post("/purge-campaign")
  async def _purge_campaign(request: PurgeCampaignRequest):
      global progress_data, tasks_data
-
+
      campaign_id = request.campaign_id
      token = request.token

@@ -298,57 +303,69 @@ async def _purge_campaign(request: PurgeCampaignRequest):
          return JSONResponse(content="Invalid token", status_code=400)

      # Unlink assets if they exist
-     destination = tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+     destination = (
+         tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+     )
      if destination:
          symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
          if os.path.islink(symlink_path):
              os.remove(symlink_path)
-
+
      # Remove task file
      task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
      if os.path.exists(task_file):
          os.remove(task_file)
-
+
      # Remove output file
      output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
      if os.path.exists(output_file):
          os.remove(output_file)
-
+
      # Remove from in-memory data structures
      del tasks_data[campaign_id]
      del progress_data[campaign_id]
-
+
      # Save updated progress data
      save_progress_data(progress_data)
-
+
      return JSONResponse(content="ok", status_code=200)


  class AddCampaignRequest(BaseModel):
      campaign_data: dict[str, Any]
+     token_main: str


  @app.post("/add-campaign")
  async def _add_campaign(request: AddCampaignRequest):
      global progress_data, tasks_data
-
+
      from .cli import _add_single_campaign
-
+
+     if request.token_main != TOKEN_MAIN:
+         return JSONResponse(
+             content={"error": "Invalid main token. Use the latest one."},
+             status_code=400,
+         )
+
      try:
          server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
          _add_single_campaign(request.campaign_data, overwrite=False, server=server)
-
-         campaign_id = request.campaign_data['campaign_id']
+
+         campaign_id = request.campaign_data["campaign_id"]
          with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
              tasks_data[campaign_id] = json.load(f)
-
+
          progress_data = load_progress_data(warn=None)
-
-         return JSONResponse(content={
-             "status": "ok",
-             "campaign_id": campaign_id,
-             "token": tasks_data[campaign_id]["token"]
-         }, status_code=200)
+
+         return JSONResponse(
+             content={
+                 "status": "ok",
+                 "campaign_id": campaign_id,
+                 "token": tasks_data[campaign_id]["token"],
+             },
+             status_code=200,
+         )
      except Exception as e:
          return JSONResponse(content={"error": str(e)}, status_code=400)

@@ -418,6 +435,17 @@ if not os.path.exists(static_dir + "index.html"):
          "Static directory not found. Please build the frontend first."
      )

+ # Serve HTML files directly without redirect
+ @app.get("/annotate")
+ async def serve_annotate():
+     return FileResponse(static_dir + "annotate.html")
+
+
+ @app.get("/dashboard")
+ async def serve_dashboard():
+     return FileResponse(static_dir + "dashboard.html")
+
+
  # Mount user assets from data/assets/
  assets_dir = f"{ROOT}/data/assets"
  os.makedirs(assets_dir, exist_ok=True)
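Pieced together, the `server/app.py` hunks above amount to three behavioral changes: the dashboard payload now reports the campaign's `assignment` type, `/add-campaign` is gated behind the main token, and `annotate.html`/`dashboard.html` are served directly instead of via redirect. A minimal sketch of the new request flow, assuming the surrounding module provides `static_dir` and `TOKEN_MAIN` as in the diff (the placeholder values here are illustrative):

```python
from typing import Any

from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel

app = FastAPI()
static_dir = "server/static/"  # In the real module this is derived from ROOT.
TOKEN_MAIN = "change-me"       # In the real module this is imported from .utils.


class AddCampaignRequest(BaseModel):
    campaign_data: dict[str, Any]
    token_main: str  # New in 1.0.2: callers must present the main token.


@app.post("/add-campaign")
async def _add_campaign(request: AddCampaignRequest):
    # Reject campaign creation when the presented token does not match.
    if request.token_main != TOKEN_MAIN:
        return JSONResponse(
            content={"error": "Invalid main token. Use the latest one."},
            status_code=400,
        )
    ...  # Load and register the campaign as in the diff above.


# Serve the renamed annotation UI and the dashboard without a redirect.
@app.get("/annotate")
async def serve_annotate():
    return FileResponse(static_dir + "annotate.html")


@app.get("/dashboard")
async def serve_dashboard():
    return FileResponse(static_dir + "dashboard.html")
```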