pearmut 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pearmut-1.0.1 → pearmut-1.0.2}/PKG-INFO +46 -65
- {pearmut-1.0.1 → pearmut-1.0.2}/README.md +45 -64
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO +46 -65
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt +2 -2
- {pearmut-1.0.1 → pearmut-1.0.2}/pyproject.toml +1 -1
- {pearmut-1.0.1 → pearmut-1.0.2}/server/app.py +48 -20
- {pearmut-1.0.1 → pearmut-1.0.2}/server/assignment.py +12 -31
- {pearmut-1.0.1 → pearmut-1.0.2}/server/cli.py +45 -27
- {pearmut-1.0.1 → pearmut-1.0.2}/server/results_export.py +1 -1
- pearmut-1.0.2/server/static/annotate.bundle.js +1 -0
- pearmut-1.0.1/server/static/basic.html → pearmut-1.0.2/server/static/annotate.html +30 -3
- pearmut-1.0.2/server/static/dashboard.bundle.js +1 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/server/static/dashboard.html +6 -1
- {pearmut-1.0.1 → pearmut-1.0.2}/server/static/index.html +1 -1
- {pearmut-1.0.1 → pearmut-1.0.2}/server/static/style.css +8 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/server/utils.py +3 -1
- pearmut-1.0.1/server/static/basic.bundle.js +0 -1
- pearmut-1.0.1/server/static/dashboard.bundle.js +0 -1
- {pearmut-1.0.1 → pearmut-1.0.2}/LICENSE +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/dependency_links.txt +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/entry_points.txt +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/requires.txt +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/top_level.txt +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/server/constants.py +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/server/static/favicon.svg +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/server/static/index.bundle.js +0 -0
- {pearmut-1.0.1 → pearmut-1.0.2}/setup.cfg +0 -0

{pearmut-1.0.1 → pearmut-1.0.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 1.0.1
+Version: 1.0.2
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -19,7 +19,7 @@ Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
 
-# 🍐Pearmut
+# 🍐Pearmut <br> [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [](https://arxiv.org/abs/2601.02933)
 
 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
 
@@ -142,6 +142,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```
 
+### Showing Model Names
+
+By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "show_model_names": true # Default: false.
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
 ### Custom Score Sliders
 
 For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -163,6 +178,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal
 
 When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
 
+### Textfield for Post-editing/Translation
+
+Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+    }
+}
+```
+
+**Textfield modes:**
+- `null` or omitted: No textfield (default)
+- `"hidden"`: Textfield hidden by default, shown by clicking a button
+- `"visible"`: Textfield always visible
+- `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
 ### Custom Instructions
 
 Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -401,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
 - **Score**: Numeric quality rating (0-100)
 - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
 - **Error Categories**: MQM taxonomy labels for errors
-- **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+- **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
@@ -432,7 +466,7 @@ pearmut run
 2. Add build rule to `webpack.config.js`
 3. Reference as `info->template` in campaign JSON
 
-See [web/src/basic.ts](web/src/basic.ts) for example.
+See [web/src/annotate.ts](web/src/annotate.ts) for example.
 
 ### Deployment
 
@@ -443,68 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
 If you use this work in your paper, please cite as following.
 ```bibtex
 @misc{zouhar2026pearmut,
-
-
-
+    title={Pearmut: Human Evaluation of Translation Made Trivial},
+    author={Vilém Zouhar and Tom Kocmi},
+    year={2026},
+    eprint={2601.02933},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2601.02933},
 }
 ```
 
 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
-# Changelog
-
-- v1.0.1
-  - Support RTL languages
-  - Add boxes for references
-  - Add custom score sliders for multi-dimensional evaluation
-  - Make instructions customizable and protocol-dependent
-  - Support custom sliders
-  - Purge/reset whole tasks from dashboard
-  - Fix resetting individual users in single-stream/dynamic
-  - Fix notification stacking
-  - Add campaigns from dashboard
-- v0.3.3
-  - Rename `doc_id` to `item_id`
-  - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-  - Add dynamic assignment type with contrastive model comparison
-  - Add `instructions_goodbye` field with variable substitution
-  - Add visual anchors at 33% and 66% on sliders
-  - Add German→English ESA tutorial with attention checks
-  - Validate document model consistency before shuffle
-  - Fix UI block on any interaction
-- v0.3.2
-  - Revert seeding of user IDs
-  - Set ESA (Error Span Annotation) as default
-  - Update server IP address configuration
-  - Show approximate alignment by default
-  - Unify pointwise and listwise interfaces into `basic`
-  - Refactor protocol configuration (breaking change)
-- v0.2.11
-  - Add comment field in settings panel
-  - Add `score_gt` validation for listwise comparisons
-  - Add Content-Disposition headers for proper download filenames
-  - Add model results display to dashboard with rankings
-  - Add campaign file structure validation
-  - Purge command now unlinks assets
-- v0.2.6
-  - Add frozen annotation links feature for view-only mode
-  - Add word-level annotation mode toggle for error spans
-  - Add `[missing]` token support
-  - Improve frontend speed and cleanup toolboxes on item load
-  - Host assets via symlinks
-  - Add validation threshold for success/fail tokens
-  - Implement reset masking for annotations
-  - Allow pre-defined user IDs and tokens in campaign data
-- v0.1.1
-  - Set server defaults and add VM launch scripts
-  - Add warning dialog when navigating away with unsaved work
-  - Add tutorial validation support for pointwise and listwise
-  - Add ability to preview existing annotations via progress bar
-  - Add support for ESA<sup>AI</sup> pre-filled error_spans
-  - Rename pairwise to listwise and update layout
-  - Implement single-stream assignment type
-- v0.0.3
-  - Support multimodal inputs and outputs
-  - Add dashboard
-  - Implement ESA (Error Span Annotation) and MQM support
-
+See changes in [CHANGELOG.md](CHANGELOG.md).
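
The two options introduced above, `show_model_names` and `textfield`, live in the same campaign `info` block, so they compose. A minimal sketch of writing a campaign file that uses both, following the README snippets in the diff; the campaign ID and the empty `data` list are illustrative placeholders, not values from this release:

```python
import json

# Campaign file combining the two options documented in the 1.0.2 diff above.
# The structure mirrors the README snippets; all concrete values are placeholders.
campaign = {
    "info": {
        "assignment": "task-based",
        "protocol": "ESA",
        "show_model_names": True,   # show model names above each output block
        "textfield": "prefilled",   # or None / "hidden" / "visible"
    },
    "campaign_id": "my_campaign",
    "data": [],  # annotation items would go here
}

with open("my_campaign.json", "w", encoding="utf-8") as f:
    json.dump(campaign, f, ensure_ascii=False, indent=2)
```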

{pearmut-1.0.1 → pearmut-1.0.2}/README.md

@@ -1,4 +1,4 @@
-# 🍐Pearmut
+# 🍐Pearmut <br> [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [](https://arxiv.org/abs/2601.02933)
 
 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
 
@@ -121,6 +121,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```
 
+### Showing Model Names
+
+By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "show_model_names": true # Default: false.
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
 ### Custom Score Sliders
 
 For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -142,6 +157,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal
 
 When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
 
+### Textfield for Post-editing/Translation
+
+Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+    }
+}
+```
+
+**Textfield modes:**
+- `null` or omitted: No textfield (default)
+- `"hidden"`: Textfield hidden by default, shown by clicking a button
+- `"visible"`: Textfield always visible
+- `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
 ### Custom Instructions
 
 Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -380,7 +414,7 @@ Customize the goodbye message shown to users when they complete all annotations
 - **Score**: Numeric quality rating (0-100)
 - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
 - **Error Categories**: MQM taxonomy labels for errors
-- **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+- **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
@@ -411,7 +445,7 @@ pearmut run
 2. Add build rule to `webpack.config.js`
 3. Reference as `info->template` in campaign JSON
 
-See [web/src/basic.ts](web/src/basic.ts) for example.
+See [web/src/annotate.ts](web/src/annotate.ts) for example.
 
 ### Deployment
 
@@ -422,68 +456,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
 If you use this work in your paper, please cite as following.
 ```bibtex
 @misc{zouhar2026pearmut,
-
-
-
+    title={Pearmut: Human Evaluation of Translation Made Trivial},
+    author={Vilém Zouhar and Tom Kocmi},
+    year={2026},
+    eprint={2601.02933},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2601.02933},
 }
 ```
 
 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
-# Changelog
-
-- v1.0.1
-  - Support RTL languages
-  - Add boxes for references
-  - Add custom score sliders for multi-dimensional evaluation
-  - Make instructions customizable and protocol-dependent
-  - Support custom sliders
-  - Purge/reset whole tasks from dashboard
-  - Fix resetting individual users in single-stream/dynamic
-  - Fix notification stacking
-  - Add campaigns from dashboard
-- v0.3.3
-  - Rename `doc_id` to `item_id`
-  - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-  - Add dynamic assignment type with contrastive model comparison
-  - Add `instructions_goodbye` field with variable substitution
-  - Add visual anchors at 33% and 66% on sliders
-  - Add German→English ESA tutorial with attention checks
-  - Validate document model consistency before shuffle
-  - Fix UI block on any interaction
-- v0.3.2
-  - Revert seeding of user IDs
-  - Set ESA (Error Span Annotation) as default
-  - Update server IP address configuration
-  - Show approximate alignment by default
-  - Unify pointwise and listwise interfaces into `basic`
-  - Refactor protocol configuration (breaking change)
-- v0.2.11
-  - Add comment field in settings panel
-  - Add `score_gt` validation for listwise comparisons
-  - Add Content-Disposition headers for proper download filenames
-  - Add model results display to dashboard with rankings
-  - Add campaign file structure validation
-  - Purge command now unlinks assets
-- v0.2.6
-  - Add frozen annotation links feature for view-only mode
-  - Add word-level annotation mode toggle for error spans
-  - Add `[missing]` token support
-  - Improve frontend speed and cleanup toolboxes on item load
-  - Host assets via symlinks
-  - Add validation threshold for success/fail tokens
-  - Implement reset masking for annotations
-  - Allow pre-defined user IDs and tokens in campaign data
-- v0.1.1
-  - Set server defaults and add VM launch scripts
-  - Add warning dialog when navigating away with unsaved work
-  - Add tutorial validation support for pointwise and listwise
-  - Add ability to preview existing annotations via progress bar
-  - Add support for ESA<sup>AI</sup> pre-filled error_spans
-  - Rename pairwise to listwise and update layout
-  - Implement single-stream assignment type
-- v0.0.3
-  - Support multimodal inputs and outputs
-  - Add dashboard
-  - Implement ESA (Error Span Annotation) and MQM support
-
+See changes in [CHANGELOG.md](CHANGELOG.md).

{pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 1.0.1
+Version: 1.0.2
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -19,7 +19,7 @@ Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
 
-# 🍐Pearmut
+# 🍐Pearmut <br> [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml) [](https://arxiv.org/abs/2601.02933)
 
 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).
 
@@ -142,6 +142,21 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```
 
+### Showing Model Names
+
+By default, model names are hidden to avoid biasing annotators. To display model names on top of each output block, set `show_model_names` to `true`:
+```python
+{
+    "info": {
+        "assignment": "task-based",
+        "protocol": "ESA",
+        "show_model_names": true # Default: false.
+    },
+    "campaign_id": "my_campaign",
+    "data": [...]
+}
+```
+
 ### Custom Score Sliders
 
 For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
@@ -163,6 +178,25 @@ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scal
 
 When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
 
+### Textfield for Post-editing/Translation
+
+Enable a textfield for post-editing or translation tasks using the `textfield` parameter in `info`. The textfield content is stored in annotations alongside scores and error spans.
+
+```python
+{
+    "info": {
+        "protocol": "DA",
+        "textfield": "prefilled" # Options: null, "hidden", "visible", "prefilled"
+    }
+}
+```
+
+**Textfield modes:**
+- `null` or omitted: No textfield (default)
+- `"hidden"`: Textfield hidden by default, shown by clicking a button
+- `"visible"`: Textfield always visible
+- `"prefilled"`: Textfield visible and pre-filled with model output for post-editing
+
 ### Custom Instructions
 
 Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
@@ -401,7 +435,7 @@ Customize the goodbye message shown to users when they complete all annotations
 - **Score**: Numeric quality rating (0-100)
 - **Error Spans**: Text highlights marking errors with severity (`minor`, `major`)
 - **Error Categories**: MQM taxonomy labels for errors
-- **Template**: The annotation interface type. The `basic` template supports comparing multiple outputs simultaneously.
+- **Template**: The annotation interface type. The `annotate` template supports comparing multiple outputs simultaneously.
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
@@ -432,7 +466,7 @@ pearmut run
 2. Add build rule to `webpack.config.js`
 3. Reference as `info->template` in campaign JSON
 
-See [web/src/basic.ts](web/src/basic.ts) for example.
+See [web/src/annotate.ts](web/src/annotate.ts) for example.
 
 ### Deployment
 
@@ -443,68 +477,15 @@ Run on public server or tunnel local port to public IP/domain and run locally.
 If you use this work in your paper, please cite as following.
 ```bibtex
 @misc{zouhar2026pearmut,
-
-
-
+    title={Pearmut: Human Evaluation of Translation Made Trivial},
+    author={Vilém Zouhar and Tom Kocmi},
+    year={2026},
+    eprint={2601.02933},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+    url={https://arxiv.org/abs/2601.02933},
 }
 ```
 
 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
-
-# Changelog
-
-- v1.0.1
-  - Support RTL languages
-  - Add boxes for references
-  - Add custom score sliders for multi-dimensional evaluation
-  - Make instructions customizable and protocol-dependent
-  - Support custom sliders
-  - Purge/reset whole tasks from dashboard
-  - Fix resetting individual users in single-stream/dynamic
-  - Fix notification stacking
-  - Add campaigns from dashboard
-- v0.3.3
-  - Rename `doc_id` to `item_id`
-  - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
-  - Add dynamic assignment type with contrastive model comparison
-  - Add `instructions_goodbye` field with variable substitution
-  - Add visual anchors at 33% and 66% on sliders
-  - Add German→English ESA tutorial with attention checks
-  - Validate document model consistency before shuffle
-  - Fix UI block on any interaction
-- v0.3.2
-  - Revert seeding of user IDs
-  - Set ESA (Error Span Annotation) as default
-  - Update server IP address configuration
-  - Show approximate alignment by default
-  - Unify pointwise and listwise interfaces into `basic`
-  - Refactor protocol configuration (breaking change)
-- v0.2.11
-  - Add comment field in settings panel
-  - Add `score_gt` validation for listwise comparisons
-  - Add Content-Disposition headers for proper download filenames
-  - Add model results display to dashboard with rankings
-  - Add campaign file structure validation
-  - Purge command now unlinks assets
-- v0.2.6
-  - Add frozen annotation links feature for view-only mode
-  - Add word-level annotation mode toggle for error spans
-  - Add `[missing]` token support
-  - Improve frontend speed and cleanup toolboxes on item load
-  - Host assets via symlinks
-  - Add validation threshold for success/fail tokens
-  - Implement reset masking for annotations
-  - Allow pre-defined user IDs and tokens in campaign data
-- v0.1.1
-  - Set server defaults and add VM launch scripts
-  - Add warning dialog when navigating away with unsaved work
-  - Add tutorial validation support for pointwise and listwise
-  - Add ability to preview existing annotations via progress bar
-  - Add support for ESA<sup>AI</sup> pre-filled error_spans
-  - Rename pairwise to listwise and update layout
-  - Implement single-stream assignment type
-- v0.0.3
-  - Support multimodal inputs and outputs
-  - Add dashboard
-  - Implement ESA (Error Span Annotation) and MQM support
-
+See changes in [CHANGELOG.md](CHANGELOG.md).

{pearmut-1.0.1 → pearmut-1.0.2}/pearmut.egg-info/SOURCES.txt

@@ -13,8 +13,8 @@ server/cli.py
 server/constants.py
 server/results_export.py
 server/utils.py
-server/static/basic.bundle.js
-server/static/basic.html
+server/static/annotate.bundle.js
+server/static/annotate.html
 server/static/dashboard.bundle.js
 server/static/dashboard.html
 server/static/favicon.svg

{pearmut-1.0.1 → pearmut-1.0.2}/server/app.py

@@ -4,7 +4,7 @@ from typing import Any
 
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, Response
+from fastapi.responses import FileResponse, JSONResponse, Response
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 
@@ -17,6 +17,7 @@ from .results_export import (
 )
 from .utils import (
     ROOT,
+    TOKEN_MAIN,
     check_validation_threshold,
     load_progress_data,
     save_db_payload,
@@ -192,7 +193,11 @@ async def _dashboard_data(request: DashboardDataRequest):
         progress_new[user_id] = entry
 
     return JSONResponse(
-        content={
+        content={
+            "data": progress_new,
+            "validation_threshold": validation_threshold,
+            "assignment": assignment,
+        },
         status_code=200,
     )
 
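
The `_dashboard_data` hunk above changes the response from a bare progress mapping to an object with three keys. A sketch of a client reading the new payload; the three response keys come from the hunk, while the route path, request body, and port are assumptions (only the handler body is visible in this diff):

```python
import json
import urllib.request

# Hypothetical call: URL and request fields are assumptions; the response
# keys ("data", "validation_threshold", "assignment") are from the diff.
req = urllib.request.Request(
    "http://localhost:8001/dashboard-data",
    data=json.dumps({"campaign_id": "my_campaign", "token": "..."}).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    payload = json.load(resp)

progress = payload["data"]                   # per-user progress entries
threshold = payload["validation_threshold"]  # new in 1.0.2
assignment = payload["assignment"]           # new in 1.0.2
```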
@@ -288,7 +293,7 @@ class PurgeCampaignRequest(BaseModel):
 @app.post("/purge-campaign")
 async def _purge_campaign(request: PurgeCampaignRequest):
     global progress_data, tasks_data
-
+
     campaign_id = request.campaign_id
     token = request.token
 
@@ -298,57 +303,69 @@ async def _purge_campaign(request: PurgeCampaignRequest):
         return JSONResponse(content="Invalid token", status_code=400)
 
     # Unlink assets if they exist
-    destination =
+    destination = (
+        tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+    )
     if destination:
         symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
         if os.path.islink(symlink_path):
             os.remove(symlink_path)
-
+
     # Remove task file
     task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
     if os.path.exists(task_file):
         os.remove(task_file)
-
+
     # Remove output file
     output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
    if os.path.exists(output_file):
         os.remove(output_file)
-
+
     # Remove from in-memory data structures
     del tasks_data[campaign_id]
     del progress_data[campaign_id]
-
+
     # Save updated progress data
     save_progress_data(progress_data)
-
+
     return JSONResponse(content="ok", status_code=200)
 
 
 class AddCampaignRequest(BaseModel):
     campaign_data: dict[str, Any]
+    token_main: str
 
 
 @app.post("/add-campaign")
 async def _add_campaign(request: AddCampaignRequest):
     global progress_data, tasks_data
-
+
     from .cli import _add_single_campaign
-
+
+    if request.token_main != TOKEN_MAIN:
+        return JSONResponse(
+            content={"error": "Invalid main token. Use the latest one."},
+            status_code=400,
+        )
+
     try:
         server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
         _add_single_campaign(request.campaign_data, overwrite=False, server=server)
-
-        campaign_id = request.campaign_data[
+
+        campaign_id = request.campaign_data["campaign_id"]
         with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
             tasks_data[campaign_id] = json.load(f)
-
+
         progress_data = load_progress_data(warn=None)
-
-        return JSONResponse(
-
-
-
-
+
+        return JSONResponse(
+            content={
+                "status": "ok",
+                "campaign_id": campaign_id,
+                "token": tasks_data[campaign_id]["token"],
+            },
+            status_code=200,
+        )
     except Exception as e:
         return JSONResponse(content={"error": str(e)}, status_code=400)
 
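
With the new `token_main` field, `/add-campaign` now rejects requests that do not present the server's main token. A minimal client sketch; the endpoint, request fields, and response keys are taken from the hunk above, the port mirrors the `PEARMUT_SERVER_URL` default visible in it, and the campaign payload and token value are illustrative stubs:

```python
import json
import urllib.error
import urllib.request

# Illustrative campaign stub; a real one follows the README campaign format.
campaign = {"campaign_id": "my_campaign", "info": {"protocol": "ESA"}, "data": []}

req = urllib.request.Request(
    "http://localhost:8001/add-campaign",
    data=json.dumps({
        "campaign_data": campaign,
        "token_main": "...",  # must match the server's TOKEN_MAIN
    }).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
try:
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
        print(body["campaign_id"], body["token"])  # keys from the success response
except urllib.error.HTTPError as e:
    print(json.load(e)["error"])  # 400 body per the handler above
```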
@@ -418,6 +435,17 @@ if not os.path.exists(static_dir + "index.html"):
         "Static directory not found. Please build the frontend first."
     )
 
+# Serve HTML files directly without redirect
+@app.get("/annotate")
+async def serve_annotate():
+    return FileResponse(static_dir + "annotate.html")
+
+
+@app.get("/dashboard")
+async def serve_dashboard():
+    return FileResponse(static_dir + "dashboard.html")
+
+
 # Mount user assets from data/assets/
 assets_dir = f"{ROOT}/data/assets"
 os.makedirs(assets_dir, exist_ok=True)