pearmut 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pearmut-0.1.2 → pearmut-0.2.0}/PKG-INFO +85 -10
- {pearmut-0.1.2 → pearmut-0.2.0}/README.md +84 -9
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/PKG-INFO +85 -10
- {pearmut-0.1.2 → pearmut-0.2.0}/pyproject.toml +2 -1
- {pearmut-0.1.2 → pearmut-0.2.0}/server/app.py +50 -8
- {pearmut-0.1.2 → pearmut-0.2.0}/server/assignment.py +135 -11
- {pearmut-0.1.2 → pearmut-0.2.0}/server/cli.py +1 -1
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/assets/style.css +7 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/dashboard.bundle.js +1 -1
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/dashboard.html +1 -1
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/listwise.bundle.js +1 -1
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/listwise.html +2 -2
- pearmut-0.2.0/server/static/pointwise.bundle.js +1 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/pointwise.html +1 -1
- pearmut-0.2.0/server/utils.py +101 -0
- pearmut-0.1.2/server/static/pointwise.bundle.js +0 -1
- pearmut-0.1.2/server/utils.py +0 -48
- {pearmut-0.1.2 → pearmut-0.2.0}/LICENSE +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/SOURCES.txt +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/dependency_links.txt +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/entry_points.txt +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/requires.txt +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/top_level.txt +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/assets/favicon.svg +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/server/static/index.html +0 -0
- {pearmut-0.1.2 → pearmut-0.2.0}/setup.cfg +0 -0
{pearmut-0.1.2 → pearmut-0.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 0.1.2
+Version: 0.2.0
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: apache-2.0
@@ -23,7 +23,7 @@ Dynamic: license-file

 Pearmut is a **Platform for Evaluation and Reviewing of Multilingual Tasks**.
 It evaluates model outputs, primarily translation but also various other NLP tasks.
-Supports multimodality (text, video, audio, images) and a variety of annotation protocols (DA, ESA, MQM, paired ESA, etc).
+Supports multimodality (text, video, audio, images) and a variety of annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), paired ESA, etc).

 [](https://pypi.org/project/pearmut)

@@ -31,7 +31,7 @@ Supports multimodality (text, video, audio, images) and a variety of annotation

 [](https://pypi.org/project/pearmut/)

-[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
+[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/f14c91a5-44d7-4248-ada9-387e95ca59d0" />

@@ -42,11 +42,11 @@ You do not need to clone this repository. Simply install with pip and run locall
 # install the package
 pip install pearmut
 # download two campaign definitions
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/
-wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/esa_encs.json
+wget https://raw.githubusercontent.com/zouharvi/pearmut/refs/heads/main/examples/da_enuk.json
 # load them into pearmut
-pearmut add
-pearmut add
+pearmut add esa_encs.json
+pearmut add da_enuk.json
 # start pearmut (will show management links)
 pearmut run
 ```
@@ -115,6 +115,62 @@ For the standard ones (ESA, DA, MQM), we expect each item to be a dictionary (co
     ... # definition of another item (document)
 ```

+## Pre-filled Error Spans (ESA<sup>AI</sup> Support)
+
+For workflows where you want to provide pre-filled error annotations (e.g., ESA<sup>AI</sup>), you can include an `error_spans` key in each item.
+These spans will be loaded into the interface as existing annotations that users can review, modify, or delete.
+
+```python
+{
+    "src": "The quick brown fox jumps over the lazy dog.",
+    "tgt": "Rychlá hnědá liška skáče přes líného psa.",
+    "error_spans": [
+        {
+            "start_i": 0, # character index start (inclusive)
+            "end_i": 5, # character index end (inclusive)
+            "severity": "minor", # "minor", "major", "neutral", or null
+            "category": null # MQM category string or null
+        },
+        {
+            "start_i": 27,
+            "end_i": 32,
+            "severity": "major",
+            "category": null
+        }
+    ]
+}
+```
+
+For **listwise** template, `error_spans` is a 2D array where each inner array corresponds to error spans for that candidate.
+
+See [examples/esaai_prefilled.json](examples/esaai_prefilled.json) for a complete example.
+
+## Tutorial and Attention Checks
+
+You can add validation rules to items for tutorials or attention checks. Items with `validation` field will be checked before submission:
+
+```python
+{
+    "src": "The quick brown fox jumps.",
+    "tgt": "Rychlá hnědá liška skáče.",
+    "validation": {
+        "warning": "Please set score between 70-80.", # shown on failure (omit for silent logging)
+        "score": [70, 80], # required score range [min, max]
+        "error_spans": [{"start_i": [0, 2], "end_i": [4, 8], "severity": "minor"}], # expected spans
+        "allow_skip": true # show "skip tutorial" button
+    }
+}
+```
+
+- Tutorial items: Include `allow_skip: true` and `warning` to let users skip after seeing the feedback
+- Loud attention checks: Include `warning` without `allow_skip` to force users to retry
+- Silent attention checks: Omit `warning` to silently log failures without user notification (useful for quality control with bad translations)
+For listwise template, `validation` is an array where each element corresponds to a candidate.
+The dashboard shows failed/total validation checks per user.
+See [examples/tutorial_pointwise.json](examples/tutorial_pointwise.json) and [examples/tutorial_listwise.json](examples/tutorial_listwise.json) for complete examples.
+
+## Single-stream Assignment
+
 We also support a simple allocation where all annotators draw from the same pool (`single-stream`). Items are randomly assigned to annotators from the pool of unfinished items:
 ```python
 {
@@ -138,7 +194,7 @@ We also support dynamic allocation of annotations (`dynamic`, not yet ⚠️), w
     "campaign_id": "my campaign 6",
     "info": {
         "assignment": "dynamic",
-        "template": "
+        "template": "listwise",
         "protocol_k": 5,
         "num_users": 50,
     },
@@ -154,6 +210,25 @@ pearmut add my_campaign_4.json
 pearmut run
 ```

+## Campaign options
+
+In summary, you can select from the assignment types
+
+- `task-based`: each user has a predefined set of items
+- `single-stream`: all users are annotating together the same set of items
+- `dynamic`: WIP ⚠️
+
+and independently of that select your protocol template:
+
+- `pointwise`: evaluate a single output given a single output
+  - `protocol_score`: ask for score 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+- `listwise`: evaluate multiple outputs at the same time given a single output ⚠️
+  - `protocol_score`: ask for score 0 to 100
+  - `protocol_error_spans`: ask for highlighting error spans
+  - `protocol_error_categories`: ask for highlighting error categories
+
 ## Campaign management

 When adding new campaigns or launching pearmut, a management link is shown that gives an overview of annotator progress but also an easy access to the annotation links or resetting the task progress (no data will be lost).
@@ -170,7 +245,7 @@ An intentionally incorrect token can be shown if the annotations don't pass qual

 We also support anything HTML-compatible both on the input and on the output.
 This includes embedded YouTube videos, or even simple `<video ` tags that point to some resource somewhere.
-For an example, try [examples/
+For an example, try [examples/multimodal.json](examples/multimodal.json).
 Tip: make sure the elements are already appropriately styled.

 <img width="800" alt="Preview of multimodal elements in Pearmut" src="https://github.com/user-attachments/assets/f34a1a3e-ad95-4114-95ee-8a49e8003faf" />
@@ -209,7 +284,7 @@ If you use this work in your paper, please cite as:
 ```bibtex
 @misc{zouhar2025pearmut,
   author={Vilém Zouhar},
-  title={Pearmut
+  title={Pearmut: Platform for Evaluating and Reviewing of Multilingual Tasks},
   url={https://github.com/zouharvi/pearmut/},
   year={2025},
 }
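The new README sections above document the item-level `error_spans` and `validation` fields. For orientation, a minimal sketch of two such items in Python follows; the texts and the output filename are made-up placeholders, and dumping a bare JSON list is an assumption for illustration only (see the `examples/` files referenced in the README for the authoritative campaign layout):

```python
import json

# Two hypothetical items following the fields documented in the diff above.
prefilled_item = {
    "src": "Good morning.",
    "tgt": "Dobré ráno.",
    # a pre-filled ESA^AI span over characters 0..4 (inclusive), severity "minor"
    "error_spans": [
        {"start_i": 0, "end_i": 4, "severity": "minor", "category": None},
    ],
}

tutorial_item = {
    "src": "Good night.",
    "tgt": "Dobrou noc.",
    # checked before submission; "allow_skip" adds a "skip tutorial" button
    "validation": {
        "warning": "Please set score between 70-80.",
        "score": [70, 80],
        "allow_skip": True,
    },
}

# Placeholder filename; json.dump renders None as null and True as true.
with open("my_items.json", "w", encoding="utf-8") as f:
    json.dump([prefilled_item, tutorial_item], f, ensure_ascii=False, indent=2)
```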
{pearmut-0.1.2 → pearmut-0.2.0}/README.md

(Same README-body changes as shown in the PKG-INFO diff above, offset by the 21-line metadata header and without the Version hunk; hunks @@ -2,7 +2,7 @@, @@ -10,7 +10,7 @@, @@ -21,11 +21,11 @@, @@ -94,6 +94,62 @@, @@ -117,7 +173,7 @@, @@ -133,6 +189,25 @@, @@ -149,7 +224,7 @@, and @@ -188,7 +263,7 @@.)
{pearmut-0.1.2 → pearmut-0.2.0}/pearmut.egg-info/PKG-INFO

(Identical to the PKG-INFO diff above, hunk for hunk; the egg-info copy is generated from the same package metadata.)
{pearmut-0.1.2 → pearmut-0.2.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "pearmut"
-version = "0.1.2"
+version = "0.2.0"
 description = "A tool for evaluation of model outputs, primarily MT."
 readme = "README.md"
 license = { text = "apache-2.0" }
@@ -40,6 +40,7 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.package-data]
 "pearmut" = ["static/**"]

+# managed by .github/workflows/publish.yml now but still can be built and pushed lcoally
 # rm -rf server/static/; npm install web/ --prefix web/; npm run build --prefix web/
 # rm -rf {build,dist,pearmut.egg-info}/*; python3 -m build
 # python3 -m twine upload dist/* -u __token__
{pearmut-0.1.2 → pearmut-0.2.0}/server/app.py

@@ -8,8 +8,8 @@ from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel

-from .assignment import get_next_item, reset_task, update_progress
-from .utils import ROOT, load_progress_data, save_progress_data
+from .assignment import get_i_item, get_next_item, reset_task, update_progress
+from .utils import ROOT, load_progress_data, save_db_payload, save_progress_data

 os.makedirs(f"{ROOT}/data/outputs", exist_ok=True)

@@ -36,7 +36,7 @@ class LogResponseRequest(BaseModel):
     campaign_id: str
     user_id: str
     item_i: int
-    payload: Any
+    payload: dict[str, Any]


 @app.post("/log-response")
@@ -45,6 +45,7 @@ async def _log_response(request: LogResponseRequest):

     campaign_id = request.campaign_id
     user_id = request.user_id
+    item_i = request.item_i

     if campaign_id not in progress_data:
         return JSONResponse(content={"error": "Unknown campaign ID"}, status_code=400)
@@ -52,8 +53,8 @@
         return JSONResponse(content={"error": "Unknown user ID"}, status_code=400)

     # append response to the output log
-
-
+    save_db_payload(campaign_id, request.payload | {
+        "user_id": user_id, "item_i": item_i})

     # if actions were submitted, we can log time data
     if "actions" in request.payload:
@@ -68,7 +69,16 @@
             for a, b in zip(times, times[1:])
         ])

-
+    # Initialize validation_checks if it doesn't exist
+    print(request.payload.keys())
+    if "validations" in request.payload:
+        if "validations" not in progress_data[campaign_id][user_id]:
+            progress_data[campaign_id][user_id]["validations"] = {}
+
+        progress_data[campaign_id][user_id]["validations"][request.item_i] = request.payload["validations"]
+
+    update_progress(campaign_id, user_id, tasks_data,
+                    progress_data, request.item_i, request.payload)
     save_progress_data(progress_data)

     return JSONResponse(content={"status": "ok"}, status_code=200)
@@ -97,6 +107,32 @@ async def _get_next_item(request: NextItemRequest):
     )


+class GetItemRequest(BaseModel):
+    campaign_id: str
+    user_id: str
+    item_i: int
+
+
+@app.post("/get-i-item")
+async def _get_i_item(request: GetItemRequest):
+    campaign_id = request.campaign_id
+    user_id = request.user_id
+    item_i = request.item_i
+
+    if campaign_id not in progress_data:
+        return JSONResponse(content={"error": "Unknown campaign ID"}, status_code=400)
+    if user_id not in progress_data[campaign_id]:
+        return JSONResponse(content={"error": "Unknown user ID"}, status_code=400)
+
+    return get_i_item(
+        campaign_id,
+        user_id,
+        tasks_data,
+        progress_data,
+        item_i,
+    )
+
+
 class DashboardDataRequest(BaseModel):
     campaign_id: str
     token: str | None = None
@@ -119,6 +155,11 @@ async def _dashboard_data(request: DashboardDataRequest):
     for user_id, user_val in progress_data[campaign_id].items():
         # shallow copy
         entry = dict(user_val)
+        entry["validations"] = [
+            all(v)
+            for v in list(entry.get("validations", {}).values())
+        ]
+

         if not is_privileged:
             entry["token_correct"] = None
@@ -203,10 +244,11 @@ async def _download_progress(

 static_dir = f"{os.path.dirname(os.path.abspath(__file__))}/static/"
 if not os.path.exists(static_dir + "index.html"):
-    raise FileNotFoundError(
+    raise FileNotFoundError(
+        "Static directory not found. Please build the frontend first.")

 app.mount(
     "/",
     StaticFiles(directory=static_dir, html=True, follow_symlink=True),
     name="static",
-)
+)