pearmut 0.1.2__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {pearmut-0.1.2 → pearmut-0.2.0}/PKG-INFO +85 -10
  2. {pearmut-0.1.2 → pearmut-0.2.0}/README.md +84 -9
  3. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/PKG-INFO +85 -10
  4. {pearmut-0.1.2 → pearmut-0.2.0}/pyproject.toml +2 -1
  5. {pearmut-0.1.2 → pearmut-0.2.0}/server/app.py +50 -8
  6. {pearmut-0.1.2 → pearmut-0.2.0}/server/assignment.py +135 -11
  7. {pearmut-0.1.2 → pearmut-0.2.0}/server/cli.py +1 -1
  8. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/assets/style.css +7 -0
  9. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/dashboard.bundle.js +1 -1
  10. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/dashboard.html +1 -1
  11. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/listwise.bundle.js +1 -1
  12. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/listwise.html +2 -2
  13. pearmut-0.2.0/server/static/pointwise.bundle.js +1 -0
  14. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/pointwise.html +1 -1
  15. pearmut-0.2.0/server/utils.py +101 -0
  16. pearmut-0.1.2/server/static/pointwise.bundle.js +0 -1
  17. pearmut-0.1.2/server/utils.py +0 -48
  18. {pearmut-0.1.2 → pearmut-0.2.0}/LICENSE +0 -0
  19. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/SOURCES.txt +0 -0
  20. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/dependency_links.txt +0 -0
  21. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/entry_points.txt +0 -0
  22. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/requires.txt +0 -0
  23. {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/top_level.txt +0 -0
  24. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/assets/favicon.svg +0 -0
  25. {pearmut-0.1.2 → pearmut-0.2.0}/server/static/index.html +0 -0
  26. {pearmut-0.1.2 → pearmut-0.2.0}/setup.cfg +0 -0
--- pearmut-0.1.2/PKG-INFO
+++ pearmut-0.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 0.1.2
+Version: 0.2.0
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: apache-2.0
@@ -23,7 +23,7 @@ Dynamic: license-file
 
 Pearmut is a **Platform for Evaluation and Reviewing of Multilingual Tasks**.
 It evaluates model outputs, primarily translation but also various other NLP tasks.
-Supports multimodality (text, video, audio, images) and a variety of annotation protocols (DA, ESA, MQM, paired ESA, etc).
+Supports multimodality (text, video, audio, images) and a variety of annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), paired ESA, etc).
 
 [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
 &nbsp;
@@ -31,7 +31,7 @@ Supports multimodality (text, video, audio, images) and a variety of annotation
 &nbsp;
 [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
 &nbsp;
-[![build status](https://github.com/zouharvi/pearmut/actions/workflows/ci.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/ci.yml)
+[![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/f14c91a5-44d7-4248-ada9-387e95ca59d0" />
 
@@ -42,11 +42,11 @@ You do not need to clone this repository. Simply install with pip and run locall
 # install the package
 pip install pearmut
 # download two campaign definitions
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/wmt25_%23_en-cs_CZ.json
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/wmt25_%23_cs-de_DE.json
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/esa_encs.json
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/da_enuk.json
 # load them into pearmut
-pearmut add wmt25_#_en-cs_CZ.json
-pearmut add wmt25_#_cs-de_DE.json
+pearmut add esa_encs.json
+pearmut add da_enuk.json
 # start pearmut (will show management links)
 pearmut run
 ```
@@ -115,6 +115,62 @@ For the standard ones (ESA, DA, MQM), we expect each item to be a dictionary (co
 ... # definition of another item (document)
 ```
 
+## Pre-filled Error Spans (ESA<sup>AI</sup> Support)
+
+For workflows where you want to provide pre-filled error annotations (e.g., ESA<sup>AI</sup>), you can include an `error_spans` key in each item.
+These spans will be loaded into the interface as existing annotations that users can review, modify, or delete.
+
+```python
+{
+    "src": "The quick brown fox jumps over the lazy dog.",
+    "tgt": "Rychlá hnědá liška skáče přes líného psa.",
+    "error_spans": [
+        {
+            "start_i": 0,         # character index start (inclusive)
+            "end_i": 5,           # character index end (inclusive)
+            "severity": "minor",  # "minor", "major", "neutral", or null
+            "category": null      # MQM category string or null
+        },
+        {
+            "start_i": 27,
+            "end_i": 32,
+            "severity": "major",
+            "category": null
+        }
+    ]
+}
+```
+
+For the **listwise** template, `error_spans` is a 2D array where each inner array corresponds to the error spans of that candidate.
+
+See [examples/esaai_prefilled.json](examples/esaai_prefilled.json) for a complete example.
+
+## Tutorial and Attention Checks
+
+You can add validation rules to items for tutorials or attention checks. Items with a `validation` field will be checked before submission:
+
+```python
+{
+    "src": "The quick brown fox jumps.",
+    "tgt": "Rychlá hnědá liška skáče.",
+    "validation": {
+        "warning": "Please set score between 70-80.",  # shown on failure (omit for silent logging)
+        "score": [70, 80],  # required score range [min, max]
+        "error_spans": [{"start_i": [0, 2], "end_i": [4, 8], "severity": "minor"}],  # expected spans
+        "allow_skip": true  # show "skip tutorial" button
+    }
+}
+```
+
+- Tutorial items: Include `allow_skip: true` and `warning` to let users skip after seeing the feedback
+- Loud attention checks: Include `warning` without `allow_skip` to force users to retry
+- Silent attention checks: Omit `warning` to silently log failures without user notification (useful for quality control with bad translations)
+For the listwise template, `validation` is an array where each element corresponds to a candidate.
+The dashboard shows failed/total validation checks per user.
+See [examples/tutorial_pointwise.json](examples/tutorial_pointwise.json) and [examples/tutorial_listwise.json](examples/tutorial_listwise.json) for complete examples.
+
+## Single-stream Assignment
+
 We also support a simple allocation where all annotators draw from the same pool (`single-stream`). Items are randomly assigned to annotators from the pool of unfinished items:
 ```python
 {
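The two schemas added above can also be generated programmatically. Below is a minimal sketch, assuming only the `error_spans` and `validation` field names documented in this diff; the `make_tutorial_item` helper is hypothetical and not part of pearmut.

```python
import json

# Hypothetical helper (not part of pearmut): builds one campaign item with a
# pre-filled error span and a tutorial validation rule, following the field
# names shown in the diff above. json.dumps renders True/None as true/null.
def make_tutorial_item(src: str, tgt: str) -> dict:
    return {
        "src": src,
        "tgt": tgt,
        # pre-filled annotation that the annotator can review, modify, or delete
        "error_spans": [
            {"start_i": 0, "end_i": 5, "severity": "minor", "category": None},
        ],
        # tutorial check: score must land in [70, 80]; skipping is allowed
        "validation": {
            "warning": "Please set score between 70-80.",
            "score": [70, 80],
            "allow_skip": True,
        },
    }

item = make_tutorial_item("The quick brown fox jumps.", "Rychlá hnědá liška skáče.")
print(json.dumps(item, ensure_ascii=False, indent=2))
```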
@@ -138,7 +194,7 @@ We also support dynamic allocation of annotations (`dynamic`, not yet ⚠️), w
   "campaign_id": "my campaign 6",
   "info": {
     "assignment": "dynamic",
-    "template": "kway",
+    "template": "listwise",
     "protocol_k": 5,
     "num_users": 50,
   },
@@ -154,6 +210,25 @@ pearmut add my_campaign_4.json
 pearmut run
 ```
 
+## Campaign options
+
+In summary, you can select from the assignment types
+
+- `task-based`: each user has a predefined set of items
+- `single-stream`: all users annotate the same set of items together
+- `dynamic`: WIP ⚠️
+
+and, independently of that, select your protocol template:
+
+- `pointwise`: evaluate a single output given a single input
+  - `protocol_score`: ask for a score from 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+- `listwise`: evaluate multiple outputs at the same time given a single input ⚠️
+  - `protocol_score`: ask for a score from 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+
 ## Campaign management
 
 When adding new campaigns or launching pearmut, a management link is shown that gives an overview of annotator progress, as well as easy access to the annotation links and to resetting task progress (no data will be lost).
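To make these options concrete, here is a hedged sketch of a complete campaign definition. The `campaign_id` and the `info.assignment`/`info.template` keys are taken from this diff; the top-level `items` key and its contents are an assumption modeled on the bundled example files, not a confirmed schema.

```python
import json

# Sketch of a full campaign definition combining the options listed above.
# "campaign_id" and "info" come from this diff; "items" is an assumption.
campaign = {
    "campaign_id": "my campaign 7",
    "info": {
        "assignment": "single-stream",  # or "task-based" / "dynamic" (WIP)
        "template": "pointwise",        # or "listwise"
    },
    "items": [
        {"src": "Hello world.", "tgt": "Ahoj světe."},
    ],
}

with open("my_campaign_7.json", "w", encoding="utf-8") as f:
    json.dump(campaign, f, ensure_ascii=False, indent=2)
# then: pearmut add my_campaign_7.json; pearmut run
```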
@@ -170,7 +245,7 @@ An intentionally incorrect token can be shown if the annotations don't pass qual
 
 We also support anything HTML-compatible both on the input and on the output.
 This includes embedded YouTube videos, or even simple `<video ` tags that point to some resource somewhere.
-For an example, try [examples/mock_multimodal.json](examples/mock_multimodal.json).
+For an example, try [examples/multimodal.json](examples/multimodal.json).
 Tip: make sure the elements are already appropriately styled.
 
 <img width="800" alt="Preview of multimodal elements in Pearmut" src="https://github.com/user-attachments/assets/f34a1a3e-ad95-4114-95ee-8a49e8003faf" />
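As an illustration of the HTML pass-through described in this hunk, a hypothetical multimodal item might look as follows; only `src`/`tgt` come from the documented item schema, and the video URL is a placeholder.

```python
# Hypothetical multimodal item: the HTML in "src" is passed through to the
# interface as described above; the URL is a placeholder, not a real resource.
item = {
    "src": '<video controls width="400" src="https://example.com/clip.mp4"></video>',
    "tgt": "Candidate translation of the clip's speech, to be judged.",
}
```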
@@ -209,7 +284,7 @@ If you use this work in your paper, please cite as:
 ```bibtex
 @misc{zouhar2025pearmut,
     author={Vilém Zouhar},
-    title={Pearmut🍐 Platform for Evaluation and Reviewing of Multilingual Tasks},
+    title={Pearmut: Platform for Evaluation and Reviewing of Multilingual Tasks},
     url={https://github.com/zouharvi/pearmut/},
     year={2025},
 }
--- pearmut-0.1.2/README.md
+++ pearmut-0.2.0/README.md
@@ -2,7 +2,7 @@
 
 Pearmut is a **Platform for Evaluation and Reviewing of Multilingual Tasks**.
 It evaluates model outputs, primarily translation but also various other NLP tasks.
-Supports multimodality (text, video, audio, images) and a variety of annotation protocols (DA, ESA, MQM, paired ESA, etc).
+Supports multimodality (text, video, audio, images) and a variety of annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), paired ESA, etc).
 
 [![PyPi version](https://badgen.net/pypi/v/pearmut/)](https://pypi.org/project/pearmut)
 &nbsp;
@@ -10,7 +10,7 @@ Supports multimodality (text, video, audio, images) and a variety of annotation
 &nbsp;
 [![PyPi license](https://badgen.net/pypi/license/pearmut/)](https://pypi.org/project/pearmut/)
 &nbsp;
-[![build status](https://github.com/zouharvi/pearmut/actions/workflows/ci.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/ci.yml)
+[![build status](https://github.com/zouharvi/pearmut/actions/workflows/test.yml/badge.svg)](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/f14c91a5-44d7-4248-ada9-387e95ca59d0" />
 
@@ -21,11 +21,11 @@ You do not need to clone this repository. Simply install with pip and run locall
 # install the package
 pip install pearmut
 # download two campaign definitions
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/wmt25_%23_en-cs_CZ.json
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/wmt25_%23_cs-de_DE.json
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/esa_encs.json
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/da_enuk.json
 # load them into pearmut
-pearmut add wmt25_#_en-cs_CZ.json
-pearmut add wmt25_#_cs-de_DE.json
+pearmut add esa_encs.json
+pearmut add da_enuk.json
 # start pearmut (will show management links)
 pearmut run
 ```
@@ -94,6 +94,62 @@ For the standard ones (ESA, DA, MQM), we expect each item to be a dictionary (co
 ... # definition of another item (document)
 ```
 
+## Pre-filled Error Spans (ESA<sup>AI</sup> Support)
+
+For workflows where you want to provide pre-filled error annotations (e.g., ESA<sup>AI</sup>), you can include an `error_spans` key in each item.
+These spans will be loaded into the interface as existing annotations that users can review, modify, or delete.
+
+```python
+{
+    "src": "The quick brown fox jumps over the lazy dog.",
+    "tgt": "Rychlá hnědá liška skáče přes líného psa.",
+    "error_spans": [
+        {
+            "start_i": 0,         # character index start (inclusive)
+            "end_i": 5,           # character index end (inclusive)
+            "severity": "minor",  # "minor", "major", "neutral", or null
+            "category": null      # MQM category string or null
+        },
+        {
+            "start_i": 27,
+            "end_i": 32,
+            "severity": "major",
+            "category": null
+        }
+    ]
+}
+```
+
+For the **listwise** template, `error_spans` is a 2D array where each inner array corresponds to the error spans of that candidate.
+
+See [examples/esaai_prefilled.json](examples/esaai_prefilled.json) for a complete example.
+
+## Tutorial and Attention Checks
+
+You can add validation rules to items for tutorials or attention checks. Items with a `validation` field will be checked before submission:
+
+```python
+{
+    "src": "The quick brown fox jumps.",
+    "tgt": "Rychlá hnědá liška skáče.",
+    "validation": {
+        "warning": "Please set score between 70-80.",  # shown on failure (omit for silent logging)
+        "score": [70, 80],  # required score range [min, max]
+        "error_spans": [{"start_i": [0, 2], "end_i": [4, 8], "severity": "minor"}],  # expected spans
+        "allow_skip": true  # show "skip tutorial" button
+    }
+}
+```
+
+- Tutorial items: Include `allow_skip: true` and `warning` to let users skip after seeing the feedback
+- Loud attention checks: Include `warning` without `allow_skip` to force users to retry
+- Silent attention checks: Omit `warning` to silently log failures without user notification (useful for quality control with bad translations)
+For the listwise template, `validation` is an array where each element corresponds to a candidate.
+The dashboard shows failed/total validation checks per user.
+See [examples/tutorial_pointwise.json](examples/tutorial_pointwise.json) and [examples/tutorial_listwise.json](examples/tutorial_listwise.json) for complete examples.
+
+## Single-stream Assignment
+
 We also support a simple allocation where all annotators draw from the same pool (`single-stream`). Items are randomly assigned to annotators from the pool of unfinished items:
 ```python
 {
@@ -117,7 +173,7 @@ We also support dynamic allocation of annotations (`dynamic`, not yet ⚠️), w
   "campaign_id": "my campaign 6",
   "info": {
     "assignment": "dynamic",
-    "template": "kway",
+    "template": "listwise",
     "protocol_k": 5,
     "num_users": 50,
   },
@@ -133,6 +189,25 @@ pearmut add my_campaign_4.json
 pearmut run
 ```
 
+## Campaign options
+
+In summary, you can select from the assignment types
+
+- `task-based`: each user has a predefined set of items
+- `single-stream`: all users annotate the same set of items together
+- `dynamic`: WIP ⚠️
+
+and, independently of that, select your protocol template:
+
+- `pointwise`: evaluate a single output given a single input
+  - `protocol_score`: ask for a score from 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+- `listwise`: evaluate multiple outputs at the same time given a single input ⚠️
+  - `protocol_score`: ask for a score from 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+
 ## Campaign management
 
 When adding new campaigns or launching pearmut, a management link is shown that gives an overview of annotator progress, as well as easy access to the annotation links and to resetting task progress (no data will be lost).
@@ -149,7 +224,7 @@ An intentionally incorrect token can be shown if the annotations don't pass qual
 
 We also support anything HTML-compatible both on the input and on the output.
 This includes embedded YouTube videos, or even simple `<video ` tags that point to some resource somewhere.
-For an example, try [examples/mock_multimodal.json](examples/mock_multimodal.json).
+For an example, try [examples/multimodal.json](examples/multimodal.json).
 Tip: make sure the elements are already appropriately styled.
 
 <img width="800" alt="Preview of multimodal elements in Pearmut" src="https://github.com/user-attachments/assets/f34a1a3e-ad95-4114-95ee-8a49e8003faf" />
@@ -188,7 +263,7 @@ If you use this work in your paper, please cite as:
 ```bibtex
 @misc{zouhar2025pearmut,
     author={Vilém Zouhar},
-    title={Pearmut🍐 Platform for Evaluation and Reviewing of Multilingual Tasks},
+    title={Pearmut: Platform for Evaluation and Reviewing of Multilingual Tasks},
     url={https://github.com/zouharvi/pearmut/},
     year={2025},
 }
--- pearmut-0.1.2/pearmut.egg-info/PKG-INFO
+++ pearmut-0.2.0/pearmut.egg-info/PKG-INFO
(The changes here are byte-for-byte identical to the PKG-INFO diff at the top; setuptools regenerates this file from the same metadata.)
--- pearmut-0.1.2/pyproject.toml
+++ pearmut-0.2.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pearmut"
-version = "0.1.2"
+version = "0.2.0"
 description = "A tool for evaluation of model outputs, primarily MT."
 readme = "README.md"
 license = { text = "apache-2.0" }
@@ -40,6 +40,7 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.package-data]
 "pearmut" = ["static/**"]
 
+# managed by .github/workflows/publish.yml now but can still be built and pushed locally
 # rm -rf server/static/; npm install web/ --prefix web/; npm run build --prefix web/
 # rm -rf {build,dist,pearmut.egg-info}/*; python3 -m build
 # python3 -m twine upload dist/* -u __token__
--- pearmut-0.1.2/server/app.py
+++ pearmut-0.2.0/server/app.py
@@ -8,8 +8,8 @@ from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 
-from .assignment import get_next_item, reset_task, update_progress
-from .utils import ROOT, load_progress_data, save_progress_data
+from .assignment import get_i_item, get_next_item, reset_task, update_progress
+from .utils import ROOT, load_progress_data, save_db_payload, save_progress_data
 
 os.makedirs(f"{ROOT}/data/outputs", exist_ok=True)
 
@@ -36,7 +36,7 @@ class LogResponseRequest(BaseModel):
     campaign_id: str
     user_id: str
     item_i: int
-    payload: Any
+    payload: dict[str, Any]
 
 
 @app.post("/log-response")
@@ -45,6 +45,7 @@ async def _log_response(request: LogResponseRequest):
 
     campaign_id = request.campaign_id
     user_id = request.user_id
+    item_i = request.item_i
 
     if campaign_id not in progress_data:
         return JSONResponse(content={"error": "Unknown campaign ID"}, status_code=400)
@@ -52,8 +53,8 @@ async def _log_response(request: LogResponseRequest):
         return JSONResponse(content={"error": "Unknown user ID"}, status_code=400)
 
     # append response to the output log
-    with open(f"{ROOT}/data/outputs/{campaign_id}.jsonl", "a") as log_file:
-        log_file.write(json.dumps(request.payload, ensure_ascii=False) + "\n")
+    save_db_payload(campaign_id, request.payload | {
+        "user_id": user_id, "item_i": item_i})
 
     # if actions were submitted, we can log time data
     if "actions" in request.payload:
@@ -68,7 +69,16 @@ async def _log_response(request: LogResponseRequest):
         for a, b in zip(times, times[1:])
     ])
 
-    update_progress(campaign_id, user_id, tasks_data, progress_data, request.item_i, request.payload)
+    # Initialize validation_checks if it doesn't exist
+    print(request.payload.keys())
+    if "validations" in request.payload:
+        if "validations" not in progress_data[campaign_id][user_id]:
+            progress_data[campaign_id][user_id]["validations"] = {}
+
+        progress_data[campaign_id][user_id]["validations"][request.item_i] = request.payload["validations"]
+
+    update_progress(campaign_id, user_id, tasks_data,
+                    progress_data, request.item_i, request.payload)
     save_progress_data(progress_data)
 
     return JSONResponse(content={"status": "ok"}, status_code=200)
@@ -97,6 +107,32 @@ async def _get_next_item(request: NextItemRequest):
     )
 
 
+class GetItemRequest(BaseModel):
+    campaign_id: str
+    user_id: str
+    item_i: int
+
+
+@app.post("/get-i-item")
+async def _get_i_item(request: GetItemRequest):
+    campaign_id = request.campaign_id
+    user_id = request.user_id
+    item_i = request.item_i
+
+    if campaign_id not in progress_data:
+        return JSONResponse(content={"error": "Unknown campaign ID"}, status_code=400)
+    if user_id not in progress_data[campaign_id]:
+        return JSONResponse(content={"error": "Unknown user ID"}, status_code=400)
+
+    return get_i_item(
+        campaign_id,
+        user_id,
+        tasks_data,
+        progress_data,
+        item_i,
+    )
+
+
 class DashboardDataRequest(BaseModel):
     campaign_id: str
     token: str | None = None
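A minimal client sketch for the `/get-i-item` endpoint added above and the updated `/log-response`; the request fields mirror the `GetItemRequest` and `LogResponseRequest` models in this diff, while the host, port, and campaign/user IDs are placeholders assumed for illustration.

```python
import requests

BASE = "http://localhost:8000"  # assumed local `pearmut run` instance

# fetch a specific item by index via the new /get-i-item endpoint
item = requests.post(f"{BASE}/get-i-item", json={
    "campaign_id": "my campaign 6",
    "user_id": "user_00",
    "item_i": 0,
}).json()

# submit a response; payload must now be a dict (see LogResponseRequest above);
# its exact contents are protocol-dependent, this score is just an example
ack = requests.post(f"{BASE}/log-response", json={
    "campaign_id": "my campaign 6",
    "user_id": "user_00",
    "item_i": 0,
    "payload": {"score": 75, "validations": [True]},
}).json()
print(item, ack)
```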
@@ -119,6 +155,11 @@ async def _dashboard_data(request: DashboardDataRequest):
     for user_id, user_val in progress_data[campaign_id].items():
         # shallow copy
         entry = dict(user_val)
+        entry["validations"] = [
+            all(v)
+            for v in list(entry.get("validations", {}).values())
+        ]
+
 
         if not is_privileged:
             entry["token_correct"] = None
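A standalone illustration of what the aggregation added above computes: each stored value is a list of per-check booleans for one item, collapsed to a single pass/fail flag shown on the dashboard.

```python
# Each stored value is a list of per-check booleans for one item;
# the dashboard keeps one pass/fail flag per item.
validations = {"3": [True, True], "7": [True, False]}
print([all(v) for v in validations.values()])  # -> [True, False]
```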
@@ -203,10 +244,11 @@ async def _download_progress(
 
 static_dir = f"{os.path.dirname(os.path.abspath(__file__))}/static/"
 if not os.path.exists(static_dir + "index.html"):
-    raise FileNotFoundError("Static directory not found. Please build the frontend first.")
+    raise FileNotFoundError(
+        "Static directory not found. Please build the frontend first.")
 
 app.mount(
     "/",
     StaticFiles(directory=static_dir, html=True, follow_symlink=True),
     name="static",
-)
+)