pearmut-1.0.0.tar.gz → pearmut-1.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {pearmut-1.0.0 → pearmut-1.0.1}/PKG-INFO +101 -11
  2. {pearmut-1.0.0 → pearmut-1.0.1}/README.md +100 -10
  3. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/PKG-INFO +101 -11
  4. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/SOURCES.txt +1 -0
  5. {pearmut-1.0.0 → pearmut-1.0.1}/pyproject.toml +1 -1
  6. {pearmut-1.0.0 → pearmut-1.0.1}/server/app.py +73 -0
  7. {pearmut-1.0.0 → pearmut-1.0.1}/server/assignment.py +70 -17
  8. {pearmut-1.0.0 → pearmut-1.0.1}/server/cli.py +209 -136
  9. pearmut-1.0.1/server/constants.py +93 -0
  10. pearmut-1.0.1/server/static/basic.bundle.js +1 -0
  11. pearmut-1.0.1/server/static/basic.html +133 -0
  12. pearmut-1.0.1/server/static/dashboard.bundle.js +1 -0
  13. {pearmut-1.0.0 → pearmut-1.0.1}/server/static/dashboard.html +1 -1
  14. {pearmut-1.0.0 → pearmut-1.0.1}/server/static/index.html +1 -1
  15. {pearmut-1.0.0 → pearmut-1.0.1}/server/utils.py +1 -13
  16. pearmut-1.0.0/server/static/basic.bundle.js +0 -1
  17. pearmut-1.0.0/server/static/basic.html +0 -97
  18. pearmut-1.0.0/server/static/dashboard.bundle.js +0 -1
  19. {pearmut-1.0.0 → pearmut-1.0.1}/LICENSE +0 -0
  20. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/dependency_links.txt +0 -0
  21. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/entry_points.txt +0 -0
  22. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/requires.txt +0 -0
  23. {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/top_level.txt +0 -0
  24. {pearmut-1.0.0 → pearmut-1.0.1}/server/results_export.py +0 -0
  25. {pearmut-1.0.0 → pearmut-1.0.1}/server/static/favicon.svg +0 -0
  26. {pearmut-1.0.0 → pearmut-1.0.1}/server/static/index.bundle.js +0 -0
  27. {pearmut-1.0.0 → pearmut-1.0.1}/server/static/style.css +0 -0
  28. {pearmut-1.0.0 → pearmut-1.0.1}/setup.cfg +0 -0
{pearmut-1.0.0 → pearmut-1.0.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pearmut
- Version: 1.0.0
+ Version: 1.0.1
  Summary: A tool for evaluation of model outputs, primarily MT.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
  Dynamic: license-file

- # Pearmut 🍐
+ # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
- &nbsp;
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
- &nbsp;
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
- &nbsp;
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />

@@ -52,6 +45,8 @@ Dynamic: license-file
  - [Terminology](#terminology)
  - [Development](#development)
  - [Citation](#citation)
+ - [Changelog](#changelog)
+

  ## Quick Start

@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
  }
  ```

- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
  Any other keys that you add will simply be stored in the logs.

@@ -145,6 +142,40 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Custom Score Sliders
+
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "sliders": [
+             {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+             {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+         ]
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+ ### Custom Instructions
+
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+     }
+ }
+ ```
+
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)

  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +294,7 @@ All items must contain outputs from all models for this assignment type to work
  **How it works:**
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration

@@ -289,6 +320,7 @@ The `users` field accepts:
  }
  ```

+
  ### Multimodal Annotations

  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -418,3 +450,61 @@ If you use this work in your paper, please cite as following.
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+
+ # Changelog
+
+ - v1.0.1
+   - Support RTL languages
+   - Add boxes for references
+   - Add custom score sliders for multi-dimensional evaluation
+   - Make instructions customizable and protocol-dependent
+   - Support custom sliders
+   - Purge/reset whole tasks from dashboard
+   - Fix resetting individual users in single-stream/dynamic
+   - Fix notification stacking
+   - Add campaigns from dashboard
+ - v0.3.3
+   - Rename `doc_id` to `item_id`
+   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
+   - Add dynamic assignment type with contrastive model comparison
+   - Add `instructions_goodbye` field with variable substitution
+   - Add visual anchors at 33% and 66% on sliders
+   - Add German→English ESA tutorial with attention checks
+   - Validate document model consistency before shuffle
+   - Fix UI block on any interaction
+ - v0.3.2
+   - Revert seeding of user IDs
+   - Set ESA (Error Span Annotation) as default
+   - Update server IP address configuration
+   - Show approximate alignment by default
+   - Unify pointwise and listwise interfaces into `basic`
+   - Refactor protocol configuration (breaking change)
+ - v0.2.11
+   - Add comment field in settings panel
+   - Add `score_gt` validation for listwise comparisons
+   - Add Content-Disposition headers for proper download filenames
+   - Add model results display to dashboard with rankings
+   - Add campaign file structure validation
+   - Purge command now unlinks assets
+ - v0.2.6
+   - Add frozen annotation links feature for view-only mode
+   - Add word-level annotation mode toggle for error spans
+   - Add `[missing]` token support
+   - Improve frontend speed and cleanup toolboxes on item load
+   - Host assets via symlinks
+   - Add validation threshold for success/fail tokens
+   - Implement reset masking for annotations
+   - Allow pre-defined user IDs and tokens in campaign data
+ - v0.1.1
+   - Set server defaults and add VM launch scripts
+   - Add warning dialog when navigating away with unsaved work
+   - Add tutorial validation support for pointwise and listwise
+   - Add ability to preview existing annotations via progress bar
+   - Add support for ESA<sup>AI</sup> pre-filled error_spans
+   - Rename pairwise to listwise and update layout
+   - Implement single-stream assignment type
+ - v0.0.3
+   - Support multimodal inputs and outputs
+   - Add dashboard
+   - Implement ESA (Error Span Annotation) and MQM support
+

{pearmut-1.0.0 → pearmut-1.0.1}/README.md
@@ -1,14 +1,7 @@
- # Pearmut 🍐
+ # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
- &nbsp;
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
- &nbsp;
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
- &nbsp;
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />

@@ -31,6 +24,8 @@
  - [Terminology](#terminology)
  - [Development](#development)
  - [Citation](#citation)
+ - [Changelog](#changelog)
+

  ## Quick Start

@@ -90,7 +85,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
  }
  ```

- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
  Any other keys that you add will simply be stored in the logs.

@@ -124,6 +121,40 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Custom Score Sliders
+
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "sliders": [
+             {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+             {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+         ]
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+ ### Custom Instructions
+
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+     }
+ }
+ ```
+
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)

  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -242,7 +273,7 @@ All items must contain outputs from all models for this assignment type to work
  **How it works:**
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration

@@ -268,6 +299,7 @@ The `users` field accepts:
  }
  ```

+
  ### Multimodal Annotations

  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -397,3 +429,61 @@ If you use this work in your paper, please cite as following.
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+
+ # Changelog
+
+ - v1.0.1
+   - Support RTL languages
+   - Add boxes for references
+   - Add custom score sliders for multi-dimensional evaluation
+   - Make instructions customizable and protocol-dependent
+   - Support custom sliders
+   - Purge/reset whole tasks from dashboard
+   - Fix resetting individual users in single-stream/dynamic
+   - Fix notification stacking
+   - Add campaigns from dashboard
+ - v0.3.3
+   - Rename `doc_id` to `item_id`
+   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
+   - Add dynamic assignment type with contrastive model comparison
+   - Add `instructions_goodbye` field with variable substitution
+   - Add visual anchors at 33% and 66% on sliders
+   - Add German→English ESA tutorial with attention checks
+   - Validate document model consistency before shuffle
+   - Fix UI block on any interaction
+ - v0.3.2
+   - Revert seeding of user IDs
+   - Set ESA (Error Span Annotation) as default
+   - Update server IP address configuration
+   - Show approximate alignment by default
+   - Unify pointwise and listwise interfaces into `basic`
+   - Refactor protocol configuration (breaking change)
+ - v0.2.11
+   - Add comment field in settings panel
+   - Add `score_gt` validation for listwise comparisons
+   - Add Content-Disposition headers for proper download filenames
+   - Add model results display to dashboard with rankings
+   - Add campaign file structure validation
+   - Purge command now unlinks assets
+ - v0.2.6
+   - Add frozen annotation links feature for view-only mode
+   - Add word-level annotation mode toggle for error spans
+   - Add `[missing]` token support
+   - Improve frontend speed and cleanup toolboxes on item load
+   - Host assets via symlinks
+   - Add validation threshold for success/fail tokens
+   - Implement reset masking for annotations
+   - Allow pre-defined user IDs and tokens in campaign data
+ - v0.1.1
+   - Set server defaults and add VM launch scripts
+   - Add warning dialog when navigating away with unsaved work
+   - Add tutorial validation support for pointwise and listwise
+   - Add ability to preview existing annotations via progress bar
+   - Add support for ESA<sup>AI</sup> pre-filled error_spans
+   - Rename pairwise to listwise and update layout
+   - Implement single-stream assignment type
+ - v0.0.3
+   - Support multimodal inputs and outputs
+   - Add dashboard
+   - Implement ESA (Error Span Annotation) and MQM support
+

{pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pearmut
- Version: 1.0.0
+ Version: 1.0.1
  Summary: A tool for evaluation of model outputs, primarily MT.
  Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
  License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
  Requires-Dist: pytest; extra == "dev"
  Dynamic: license-file

- # Pearmut 🍐
+ # 🍐Pearmut &nbsp; &nbsp; [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut) [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/) [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/) [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

- [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
- &nbsp;
- [![PyPI download/month](https://img.shields.io/pypi/dm/pearmut.svg)](https://pypi.python.org/pypi/pearmut/)
- &nbsp;
- [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
- &nbsp;
- [![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

  <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />

@@ -52,6 +45,8 @@ Dynamic: license-file
  - [Terminology](#terminology)
  - [Development](#development)
  - [Citation](#citation)
+ - [Changelog](#changelog)
+

  ## Quick Start

@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
  }
  ```

- Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+ Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+ If neither `src` nor `ref` is provided, only the model outputs will be displayed.
  For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
  Any other keys that you add will simply be stored in the logs.

@@ -145,6 +142,40 @@ The `shuffle` parameter in campaign `info` controls this behavior:
  }
  ```

+ ### Custom Score Sliders
+
+ For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+ ```python
+ {
+     "info": {
+         "assignment": "task-based",
+         "protocol": "ESA",
+         "sliders": [
+             {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+             {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+         ]
+     },
+     "campaign_id": "my_campaign",
+     "data": [...]
+ }
+ ```
+
+ When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+ ### Custom Instructions
+
+ Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+ Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+ ```python
+ {
+     "info": {
+         "protocol": "DA",
+         "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+     }
+ }
+ ```
+
  ### Pre-filled Error Spans (ESA<sup>AI</sup>)

  Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +294,7 @@ All items must contain outputs from all models for this assignment type to work
  **How it works:**
  1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
  2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
- 3. Contrastive evaluatoin: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+ 3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
  4. Item prioritization: Items with the least annotations for the selected models are prioritized
  5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration

@@ -289,6 +320,7 @@ The `users` field accepts:
  }
  ```

+
  ### Multimodal Annotations

  Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -418,3 +450,61 @@ If you use this work in your paper, please cite as following.
  ```

  Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+
+ # Changelog
+
+ - v1.0.1
+   - Support RTL languages
+   - Add boxes for references
+   - Add custom score sliders for multi-dimensional evaluation
+   - Make instructions customizable and protocol-dependent
+   - Support custom sliders
+   - Purge/reset whole tasks from dashboard
+   - Fix resetting individual users in single-stream/dynamic
+   - Fix notification stacking
+   - Add campaigns from dashboard
+ - v0.3.3
+   - Rename `doc_id` to `item_id`
+   - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
+   - Add dynamic assignment type with contrastive model comparison
+   - Add `instructions_goodbye` field with variable substitution
+   - Add visual anchors at 33% and 66% on sliders
+   - Add German→English ESA tutorial with attention checks
+   - Validate document model consistency before shuffle
+   - Fix UI block on any interaction
+ - v0.3.2
+   - Revert seeding of user IDs
+   - Set ESA (Error Span Annotation) as default
+   - Update server IP address configuration
+   - Show approximate alignment by default
+   - Unify pointwise and listwise interfaces into `basic`
+   - Refactor protocol configuration (breaking change)
+ - v0.2.11
+   - Add comment field in settings panel
+   - Add `score_gt` validation for listwise comparisons
+   - Add Content-Disposition headers for proper download filenames
+   - Add model results display to dashboard with rankings
+   - Add campaign file structure validation
+   - Purge command now unlinks assets
+ - v0.2.6
+   - Add frozen annotation links feature for view-only mode
+   - Add word-level annotation mode toggle for error spans
+   - Add `[missing]` token support
+   - Improve frontend speed and cleanup toolboxes on item load
+   - Host assets via symlinks
+   - Add validation threshold for success/fail tokens
+   - Implement reset masking for annotations
+   - Allow pre-defined user IDs and tokens in campaign data
+ - v0.1.1
+   - Set server defaults and add VM launch scripts
+   - Add warning dialog when navigating away with unsaved work
+   - Add tutorial validation support for pointwise and listwise
+   - Add ability to preview existing annotations via progress bar
+   - Add support for ESA<sup>AI</sup> pre-filled error_spans
+   - Rename pairwise to listwise and update layout
+   - Implement single-stream assignment type
+ - v0.0.3
+   - Support multimodal inputs and outputs
+   - Add dashboard
+   - Implement ESA (Error Span Annotation) and MQM support
+

{pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/SOURCES.txt
@@ -10,6 +10,7 @@ pearmut.egg-info/top_level.txt
  server/app.py
  server/assignment.py
  server/cli.py
+ server/constants.py
  server/results_export.py
  server/utils.py
  server/static/basic.bundle.js

{pearmut-1.0.0 → pearmut-1.0.1}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "pearmut"
- version = "1.0.0"
+ version = "1.0.1"
  description = "A tool for evaluation of model outputs, primarily MT."
  readme = "README.md"
  license = { text = "MIT" }

{pearmut-1.0.0 → pearmut-1.0.1}/server/app.py
@@ -280,6 +280,79 @@ async def _reset_task(request: ResetTaskRequest):
      return response


+ class PurgeCampaignRequest(BaseModel):
+     campaign_id: str
+     token: str
+
+
+ @app.post("/purge-campaign")
+ async def _purge_campaign(request: PurgeCampaignRequest):
+     global progress_data, tasks_data
+
+     campaign_id = request.campaign_id
+     token = request.token
+
+     if campaign_id not in progress_data:
+         return JSONResponse(content="Unknown campaign ID", status_code=400)
+     if token != tasks_data[campaign_id]["token"]:
+         return JSONResponse(content="Invalid token", status_code=400)
+
+     # Unlink assets if they exist
+     destination = tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+     if destination:
+         symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
+         if os.path.islink(symlink_path):
+             os.remove(symlink_path)
+
+     # Remove task file
+     task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
+     if os.path.exists(task_file):
+         os.remove(task_file)
+
+     # Remove output file
+     output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
+     if os.path.exists(output_file):
+         os.remove(output_file)
+
+     # Remove from in-memory data structures
+     del tasks_data[campaign_id]
+     del progress_data[campaign_id]
+
+     # Save updated progress data
+     save_progress_data(progress_data)
+
+     return JSONResponse(content="ok", status_code=200)
+
+
+ class AddCampaignRequest(BaseModel):
+     campaign_data: dict[str, Any]
+
+
+ @app.post("/add-campaign")
+ async def _add_campaign(request: AddCampaignRequest):
+     global progress_data, tasks_data
+
+     from .cli import _add_single_campaign
+
+     try:
+         server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
+         _add_single_campaign(request.campaign_data, overwrite=False, server=server)
+
+         campaign_id = request.campaign_data['campaign_id']
+         with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
+             tasks_data[campaign_id] = json.load(f)
+
+         progress_data = load_progress_data(warn=None)
+
+         return JSONResponse(content={
+             "status": "ok",
+             "campaign_id": campaign_id,
+             "token": tasks_data[campaign_id]["token"]
+         }, status_code=200)
+     except Exception as e:
+         return JSONResponse(content={"error": str(e)}, status_code=400)
+
+
  @app.get("/download-annotations")
  async def _download_annotations(
      campaign_id: list[str] = Query(),
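
The two endpoints added in the server/app.py hunk above (`/add-campaign` and `/purge-campaign`) can also be exercised from a client. Below is a minimal, hypothetical sketch using only the Python standard library; it assumes a Pearmut server running on `http://localhost:8001` (the `PEARMUT_SERVER_URL` default visible in the diff) and a throwaway campaign whose fields mirror the README examples. It is not part of the package.

```python
# Hypothetical client sketch for the /add-campaign and /purge-campaign endpoints
# shown in the server/app.py diff above. Assumes a local Pearmut server on port 8001.
import json
import urllib.request

SERVER = "http://localhost:8001"


def post_json(path, payload):
    # POST a JSON body to the server and decode the JSON response.
    req = urllib.request.Request(
        f"{SERVER}{path}",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode())


# Register a small campaign; /add-campaign returns its dashboard token on success.
campaign = {
    "campaign_id": "my_campaign",
    "info": {"protocol": "ESA", "assignment": "task-based"},
    "data": [{"item_id": "1", "src": "Hello.", "tgt": {"modelA": "Hallo."}}],
}
created = post_json("/add-campaign", {"campaign_data": campaign})
print(created)  # e.g. {"status": "ok", "campaign_id": "my_campaign", "token": "..."}

# Remove the campaign again; the returned token authorizes the purge.
print(post_json("/purge-campaign",
                {"campaign_id": created["campaign_id"], "token": created["token"]}))
```

If the submitted campaign is rejected, `/add-campaign` responds with status 400 and an error message, matching the `except` branch in the diff.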