pearmut 0.3.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pearmut-0.3.2 → pearmut-1.0.0}/PKG-INFO +56 -27
- {pearmut-0.3.2 → pearmut-1.0.0}/README.md +54 -26
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/PKG-INFO +56 -27
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/SOURCES.txt +2 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/requires.txt +1 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/pyproject.toml +2 -1
- {pearmut-0.3.2 → pearmut-1.0.0}/server/app.py +52 -29
- pearmut-1.0.0/server/assignment.py +552 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/server/cli.py +104 -25
- pearmut-1.0.0/server/results_export.py +210 -0
- pearmut-1.0.0/server/static/basic.bundle.js +1 -0
- pearmut-1.0.0/server/static/basic.html +97 -0
- pearmut-1.0.0/server/static/dashboard.bundle.js +1 -0
- pearmut-1.0.0/server/static/dashboard.html +96 -0
- pearmut-1.0.0/server/static/index.bundle.js +1 -0
- pearmut-1.0.0/server/static/index.html +1 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/server/static/style.css +1 -1
- {pearmut-0.3.2 → pearmut-1.0.0}/server/utils.py +16 -2
- pearmut-0.3.2/server/assignment.py +0 -342
- pearmut-0.3.2/server/static/basic.bundle.js +0 -1
- pearmut-0.3.2/server/static/basic.html +0 -74
- pearmut-0.3.2/server/static/dashboard.bundle.js +0 -1
- pearmut-0.3.2/server/static/dashboard.html +0 -81
- pearmut-0.3.2/server/static/index.html +0 -1
- {pearmut-0.3.2 → pearmut-1.0.0}/LICENSE +0 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/dependency_links.txt +0 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/entry_points.txt +0 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/pearmut.egg-info/top_level.txt +0 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/server/static/favicon.svg +0 -0
- {pearmut-0.3.2 → pearmut-1.0.0}/setup.cfg +0 -0
--- pearmut-0.3.2/PKG-INFO
+++ pearmut-1.0.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 0.3.2
+Version: 1.0.0
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -14,6 +14,7 @@ Requires-Dist: fastapi>=0.110.0
 Requires-Dist: uvicorn>=0.29.0
 Requires-Dist: wonderwords>=3.0.0
 Requires-Dist: psutil>=7.1.0
+Requires-Dist: typst>=0.14.4
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
@@ -30,7 +31,8 @@ Dynamic: license-file
 
 [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
-<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/
+<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
+
 
 ## Table of Contents
 
@@ -45,6 +47,7 @@ Dynamic: license-file
 - [Multimodal Annotations](#multimodal-annotations)
 - [Hosting Assets](#hosting-assets)
 - [Campaign Management](#campaign-management)
+- [Custom Completion Messages](#custom-completion-messages)
 - [CLI Commands](#cli-commands)
 - [Terminology](#terminology)
 - [Development](#development)
@@ -86,11 +89,13 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
     {
       "instructions": "Evaluate translation from en to cs_CZ", # message to show to users above the first item
       "src": "This will be the year that Guinness loses its cool. Cheers to that!",
-      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."}
+      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."},
+      "item_id": "first item in first document"
     },
     {
       "src": "I'm not sure I can remember exactly when I sensed it. Maybe it was when some...",
-      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"}
+      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"},
+      "item_id": "second item in first document"
     }
     ...
   ],
@@ -105,20 +110,10 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
   ]
 }
 ```
-
-
-
-
-      "src": "A najednou se všechna tato voda naplnila dalšími lidmi a dalšími věcmi.", # required
-      "tgt": {"modelA": "And suddenly all the water became full of other people and other people."} # required (dict)
-    },
-    {
-      "src": "toto je pokračování stejného dokumentu",
-      "tgt": {"modelA": "this is a continuation of the same document"}
-      # Additional keys stored for analysis
-    }
-  ]
-```
+
+Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
+Any other keys that you add will simply be stored in the logs.
 
 Load campaigns and start the server:
 ```bash
@@ -130,7 +125,7 @@ pearmut run
 
 - **`task-based`**: Each user has predefined items
 - **`single-stream`**: All users draw from a shared pool (random assignment)
-- **`dynamic`**:
+- **`dynamic`**: Items are dynamically assigned based on current model performance (see [Dynamic Assignment](#dynamic-assignment))
 
 ## Advanced Features
 
@@ -223,7 +218,8 @@ The `validation` field is an array (one per candidate). Dashboard shows ✅/❌
 }
 ```
 The `score_greaterthan` field specifies the index of the candidate that must have a lower score than the current candidate.
-See [examples/
+See [examples/tutorial/esa_deen.json](examples/tutorial/esa_deen.json) for a mock campaign with a fully prepared ESA tutorial.
+To use it, simply extract the `data` attribute and prefix it to each task in your campaign.
 
 ### Single-stream Assignment
 
@@ -243,6 +239,36 @@ All annotators draw from a shared pool with random assignment:
 }
 ```
 
+### Dynamic Assignment
+
+The `dynamic` assignment type intelligently selects items based on current model performance to focus annotation effort on top-performing models using contrastive comparisons.
+All items must contain outputs from all models for this assignment type to work properly.
+
+```python
+{
+    "campaign_id": "my dynamic campaign",
+    "info": {
+        "assignment": "dynamic",
+        "protocol": "ESA",
+        "users": 10, # number of annotators
+        "dynamic_top": 3, # how many top models to consider (required)
+        "dynamic_contrastive_models": 2, # how many models to compare per item (optional, default: 1)
+        "dynamic_first": 5, # annotations per model before dynamic kicks in (optional, default: 5)
+        "dynamic_backoff": 0.1, # probability of uniform sampling (optional, default: 0)
+    },
+    "data": [...], # list of all items (shared among all annotators)
+}
+```
+
+**How it works:**
+1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
+2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+4. Item prioritization: Items with the fewest annotations for the selected models are prioritized
+5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
+
+This approach efficiently focuses annotation resources on distinguishing between the best-performing models while ensuring all models get adequate baseline coverage. The contrastive evaluation allows for direct comparison of multiple models simultaneously.
+For an example, see [examples/dynamic.json](examples/dynamic.json).
 
 ### Pre-defined User IDs and Tokens
 
@@ -316,6 +342,10 @@ Completion tokens are shown at annotation end for verification (download correct
 
 When tokens are supplied, the dashboard will try to show model rankings based on the names in the dictionaries.
 
+### Custom Completion Messages
+
+Customize the goodbye message shown to users when they complete all annotations using the `instructions_goodbye` field in campaign info. Supports arbitrary HTML for styling and formatting with variable replacement: `${TOKEN}` (completion token) and `${USER_ID}` (user ID). Default: `"If someone asks you for a token of completion, show them: ${TOKEN}"`.
+
 ## Terminology
 
 - **Campaign**: An annotation project that contains configuration, data, and user assignments. Each campaign has a unique identifier and is defined in a JSON file.
@@ -343,7 +373,7 @@ When tokens are supplied, the dashboard will try to show model rankings based on
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
-  - **Dynamic**:
+  - **Dynamic**: Items are intelligently assigned based on model performance to focus on top models
 
 ## Development
 
@@ -376,15 +406,14 @@ See [web/src/basic.ts](web/src/basic.ts) for example.
 
 Run on public server or tunnel local port to public IP/domain and run locally.
 
-## 
+## Citation
 
 If you use this work in your paper, please cite as following.
 ```bibtex
-@misc{
-
-
-
-  year={2026},
+@misc{zouhar2026pearmut,
+  author = {Zouhar, Vilém},
+  title = {Pearmut: Human Evaluation of Translation Made Trivial},
+  year = {2026}
 }
 ```
 
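The new Dynamic Assignment section above describes the selection procedure only in prose. Below is a minimal sketch of that loop for illustration; the function name, data layout, and helpers are hypothetical and are not taken from `server/assignment.py`, which holds the actual implementation.

```python
import random
import statistics


def pick_next_item(items, annotations, dynamic_top, dynamic_contrastive_models=1,
                   dynamic_first=5, dynamic_backoff=0.0):
    """Hypothetical sketch of the described phases: return (item index, models to show).

    `annotations` maps model name -> item index -> list of scores collected so far.
    """
    models = list(items[0]["tgt"].keys())

    # Backoff: with probability dynamic_backoff fall back to uniform sampling.
    if random.random() < dynamic_backoff:
        chosen = random.sample(models, min(dynamic_contrastive_models, len(models)))
        return random.randrange(len(items)), chosen

    # Initial phase: every model first collects dynamic_first annotations.
    pool = [
        m for m in models
        if sum(len(s) for s in annotations.get(m, {}).values()) < dynamic_first
    ]
    if not pool:
        # Dynamic phase: keep only the top dynamic_top models by average score.
        avg = {
            m: statistics.mean(x for scores in annotations[m].values() for x in scores)
            for m in models
        }
        pool = sorted(models, key=avg.get, reverse=True)[:dynamic_top]

    # Contrastive evaluation: compare dynamic_contrastive_models models on one item.
    chosen = random.sample(pool, min(dynamic_contrastive_models, len(pool)))

    # Item prioritization: prefer the item with the fewest annotations for the chosen models.
    def coverage(i):
        return sum(len(annotations.get(m, {}).get(i, [])) for m in chosen)

    return min(range(len(items)), key=coverage), chosen


# Tiny demo with no annotations yet: all models are still in the initial phase.
items = [{"src": "src text", "tgt": {"modelA": "a", "modelB": "b", "modelC": "c"}}] * 4
print(pick_next_item(items, {}, dynamic_top=2, dynamic_contrastive_models=2))
```

Prioritizing the least-covered item keeps per-model annotation counts balanced among the models that are still in contention.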
--- pearmut-0.3.2/README.md
+++ pearmut-1.0.0/README.md
@@ -10,7 +10,8 @@
 
 [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
-<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/
+<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
+
 
 ## Table of Contents
 
@@ -25,6 +26,7 @@
 - [Multimodal Annotations](#multimodal-annotations)
 - [Hosting Assets](#hosting-assets)
 - [Campaign Management](#campaign-management)
+- [Custom Completion Messages](#custom-completion-messages)
 - [CLI Commands](#cli-commands)
 - [Terminology](#terminology)
 - [Development](#development)
@@ -66,11 +68,13 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
     {
       "instructions": "Evaluate translation from en to cs_CZ", # message to show to users above the first item
       "src": "This will be the year that Guinness loses its cool. Cheers to that!",
-      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."}
+      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."},
+      "item_id": "first item in first document"
     },
     {
       "src": "I'm not sure I can remember exactly when I sensed it. Maybe it was when some...",
-      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"}
+      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"},
+      "item_id": "second item in first document"
     }
     ...
   ],
@@ -85,20 +89,10 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
   ]
 }
 ```
-
-
-
-
-      "src": "A najednou se všechna tato voda naplnila dalšími lidmi a dalšími věcmi.", # required
-      "tgt": {"modelA": "And suddenly all the water became full of other people and other people."} # required (dict)
-    },
-    {
-      "src": "toto je pokračování stejného dokumentu",
-      "tgt": {"modelA": "this is a continuation of the same document"}
-      # Additional keys stored for analysis
-    }
-  ]
-```
+
+Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
+Any other keys that you add will simply be stored in the logs.
 
 Load campaigns and start the server:
 ```bash
@@ -110,7 +104,7 @@ pearmut run
 
 - **`task-based`**: Each user has predefined items
 - **`single-stream`**: All users draw from a shared pool (random assignment)
-- **`dynamic`**:
+- **`dynamic`**: Items are dynamically assigned based on current model performance (see [Dynamic Assignment](#dynamic-assignment))
 
 ## Advanced Features
 
@@ -203,7 +197,8 @@ The `validation` field is an array (one per candidate). Dashboard shows ✅/❌
 }
 ```
 The `score_greaterthan` field specifies the index of the candidate that must have a lower score than the current candidate.
-See [examples/
+See [examples/tutorial/esa_deen.json](examples/tutorial/esa_deen.json) for a mock campaign with a fully prepared ESA tutorial.
+To use it, simply extract the `data` attribute and prefix it to each task in your campaign.
 
 ### Single-stream Assignment
 
@@ -223,6 +218,36 @@ All annotators draw from a shared pool with random assignment:
 }
 ```
 
+### Dynamic Assignment
+
+The `dynamic` assignment type intelligently selects items based on current model performance to focus annotation effort on top-performing models using contrastive comparisons.
+All items must contain outputs from all models for this assignment type to work properly.
+
+```python
+{
+    "campaign_id": "my dynamic campaign",
+    "info": {
+        "assignment": "dynamic",
+        "protocol": "ESA",
+        "users": 10, # number of annotators
+        "dynamic_top": 3, # how many top models to consider (required)
+        "dynamic_contrastive_models": 2, # how many models to compare per item (optional, default: 1)
+        "dynamic_first": 5, # annotations per model before dynamic kicks in (optional, default: 5)
+        "dynamic_backoff": 0.1, # probability of uniform sampling (optional, default: 0)
+    },
+    "data": [...], # list of all items (shared among all annotators)
+}
+```
+
+**How it works:**
+1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
+2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+4. Item prioritization: Items with the fewest annotations for the selected models are prioritized
+5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
+
+This approach efficiently focuses annotation resources on distinguishing between the best-performing models while ensuring all models get adequate baseline coverage. The contrastive evaluation allows for direct comparison of multiple models simultaneously.
+For an example, see [examples/dynamic.json](examples/dynamic.json).
 
 ### Pre-defined User IDs and Tokens
 
@@ -296,6 +321,10 @@ Completion tokens are shown at annotation end for verification (download correct
 
 When tokens are supplied, the dashboard will try to show model rankings based on the names in the dictionaries.
 
+### Custom Completion Messages
+
+Customize the goodbye message shown to users when they complete all annotations using the `instructions_goodbye` field in campaign info. Supports arbitrary HTML for styling and formatting with variable replacement: `${TOKEN}` (completion token) and `${USER_ID}` (user ID). Default: `"If someone asks you for a token of completion, show them: ${TOKEN}"`.
+
 ## Terminology
 
 - **Campaign**: An annotation project that contains configuration, data, and user assignments. Each campaign has a unique identifier and is defined in a JSON file.
@@ -323,7 +352,7 @@ When tokens are supplied, the dashboard will try to show model rankings based on
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
-  - **Dynamic**:
+  - **Dynamic**: Items are intelligently assigned based on model performance to focus on top models
 
 ## Development
 
@@ -356,15 +385,14 @@ See [web/src/basic.ts](web/src/basic.ts) for example.
 
 Run on public server or tunnel local port to public IP/domain and run locally.
 
-## 
+## Citation
 
 If you use this work in your paper, please cite as following.
 ```bibtex
-@misc{
-
-
-
-  year={2026},
+@misc{zouhar2026pearmut,
+  author = {Zouhar, Vilém},
+  title = {Pearmut: Human Evaluation of Translation Made Trivial},
+  year = {2026}
 }
 ```
 
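The new Custom Completion Messages section mentions the `${TOKEN}` and `${USER_ID}` placeholders. The snippet below only illustrates that placeholder syntax with Python's `string.Template` on a made-up campaign fragment; it is not the server's own rendering code, and the token and user ID values are invented.

```python
from string import Template

# Made-up campaign "info" fragment; `instructions_goodbye` may contain arbitrary HTML.
info = {
    "instructions_goodbye": (
        "<b>Thank you!</b> Your completion token is <code>${TOKEN}</code> "
        "(annotator ${USER_ID})."
    ),
}

# Illustrative substitution only -- the server performs the actual replacement.
message = Template(info["instructions_goodbye"]).safe_substitute(
    TOKEN="mellow-walrus-318",
    USER_ID="annotator-07",
)
print(message)
```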
--- pearmut-0.3.2/pearmut.egg-info/PKG-INFO
+++ pearmut-1.0.0/pearmut.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 0.3.2
+Version: 1.0.0
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -14,6 +14,7 @@ Requires-Dist: fastapi>=0.110.0
 Requires-Dist: uvicorn>=0.29.0
 Requires-Dist: wonderwords>=3.0.0
 Requires-Dist: psutil>=7.1.0
+Requires-Dist: typst>=0.14.4
 Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file
@@ -30,7 +31,8 @@ Dynamic: license-file
 
 [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)
 
-<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/
+<img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />
+
 
 ## Table of Contents
 
@@ -45,6 +47,7 @@ Dynamic: license-file
 - [Multimodal Annotations](#multimodal-annotations)
 - [Hosting Assets](#hosting-assets)
 - [Campaign Management](#campaign-management)
+- [Custom Completion Messages](#custom-completion-messages)
 - [CLI Commands](#cli-commands)
 - [Terminology](#terminology)
 - [Development](#development)
@@ -86,11 +89,13 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
     {
       "instructions": "Evaluate translation from en to cs_CZ", # message to show to users above the first item
       "src": "This will be the year that Guinness loses its cool. Cheers to that!",
-      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."}
+      "tgt": {"modelA": "Nevím přesně, kdy jsem to poprvé zaznamenal. Možná to bylo ve chvíli, ..."},
+      "item_id": "first item in first document"
     },
     {
       "src": "I'm not sure I can remember exactly when I sensed it. Maybe it was when some...",
-      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"}
+      "tgt": {"modelA": "Tohle bude rok, kdy Guinness přijde o svůj „cool“ faktor. Na zdraví!"},
+      "item_id": "second item in first document"
     }
     ...
   ],
@@ -105,20 +110,10 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
   ]
 }
 ```
-
-
-
-
-      "src": "A najednou se všechna tato voda naplnila dalšími lidmi a dalšími věcmi.", # required
-      "tgt": {"modelA": "And suddenly all the water became full of other people and other people."} # required (dict)
-    },
-    {
-      "src": "toto je pokračování stejného dokumentu",
-      "tgt": {"modelA": "this is a continuation of the same document"}
-      # Additional keys stored for analysis
-    }
-  ]
-```
+
+Each item has to have `src` (string) and `tgt` (dictionary from model names to strings, even for a single model evaluation).
+For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
+Any other keys that you add will simply be stored in the logs.
 
 Load campaigns and start the server:
 ```bash
@@ -130,7 +125,7 @@ pearmut run
 
 - **`task-based`**: Each user has predefined items
 - **`single-stream`**: All users draw from a shared pool (random assignment)
-- **`dynamic`**:
+- **`dynamic`**: Items are dynamically assigned based on current model performance (see [Dynamic Assignment](#dynamic-assignment))
 
 ## Advanced Features
 
@@ -223,7 +218,8 @@ The `validation` field is an array (one per candidate). Dashboard shows ✅/❌
 }
 ```
 The `score_greaterthan` field specifies the index of the candidate that must have a lower score than the current candidate.
-See [examples/
+See [examples/tutorial/esa_deen.json](examples/tutorial/esa_deen.json) for a mock campaign with a fully prepared ESA tutorial.
+To use it, simply extract the `data` attribute and prefix it to each task in your campaign.
 
 ### Single-stream Assignment
 
@@ -243,6 +239,36 @@ All annotators draw from a shared pool with random assignment:
 }
 ```
 
+### Dynamic Assignment
+
+The `dynamic` assignment type intelligently selects items based on current model performance to focus annotation effort on top-performing models using contrastive comparisons.
+All items must contain outputs from all models for this assignment type to work properly.
+
+```python
+{
+    "campaign_id": "my dynamic campaign",
+    "info": {
+        "assignment": "dynamic",
+        "protocol": "ESA",
+        "users": 10, # number of annotators
+        "dynamic_top": 3, # how many top models to consider (required)
+        "dynamic_contrastive_models": 2, # how many models to compare per item (optional, default: 1)
+        "dynamic_first": 5, # annotations per model before dynamic kicks in (optional, default: 5)
+        "dynamic_backoff": 0.1, # probability of uniform sampling (optional, default: 0)
+    },
+    "data": [...], # list of all items (shared among all annotators)
+}
+```
+
+**How it works:**
+1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
+2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
+4. Item prioritization: Items with the fewest annotations for the selected models are prioritized
+5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration
+
+This approach efficiently focuses annotation resources on distinguishing between the best-performing models while ensuring all models get adequate baseline coverage. The contrastive evaluation allows for direct comparison of multiple models simultaneously.
+For an example, see [examples/dynamic.json](examples/dynamic.json).
 
 ### Pre-defined User IDs and Tokens
 
@@ -316,6 +342,10 @@ Completion tokens are shown at annotation end for verification (download correct
 
 When tokens are supplied, the dashboard will try to show model rankings based on the names in the dictionaries.
 
+### Custom Completion Messages
+
+Customize the goodbye message shown to users when they complete all annotations using the `instructions_goodbye` field in campaign info. Supports arbitrary HTML for styling and formatting with variable replacement: `${TOKEN}` (completion token) and `${USER_ID}` (user ID). Default: `"If someone asks you for a token of completion, show them: ${TOKEN}"`.
+
 ## Terminology
 
 - **Campaign**: An annotation project that contains configuration, data, and user assignments. Each campaign has a unique identifier and is defined in a JSON file.
@@ -343,7 +373,7 @@ When tokens are supplied, the dashboard will try to show model rankings based on
 - **Assignment**: The method for distributing items to users:
   - **Task-based**: Each user has predefined items
   - **Single-stream**: Users draw from a shared pool with random assignment
-  - **Dynamic**:
+  - **Dynamic**: Items are intelligently assigned based on model performance to focus on top models
 
 ## Development
 
@@ -376,15 +406,14 @@ See [web/src/basic.ts](web/src/basic.ts) for example.
 
 Run on public server or tunnel local port to public IP/domain and run locally.
 
-## 
+## Citation
 
 If you use this work in your paper, please cite as following.
 ```bibtex
-@misc{
-
-
-
-  year={2026},
+@misc{zouhar2026pearmut,
+  author = {Zouhar, Vilém},
+  title = {Pearmut: Human Evaluation of Translation Made Trivial},
+  year = {2026}
 }
 ```
 
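The campaign format described in the README changes above requires `src` and `tgt` on every item, with `item_id` recommended for the automatic statistical analysis. A small pre-flight check along those lines might look like the following; the file path is a placeholder and the handling of `data` (either a flat item list or a list of per-task item lists) is an assumption, not a statement about Pearmut's own loader.

```python
import json

# Illustrative pre-flight check for a campaign file, not part of Pearmut itself.
with open("campaign.json") as f:  # placeholder path
    campaign = json.load(f)

for task in campaign["data"]:
    # Tolerate both a flat list of items and a list of per-task item lists (assumption).
    items = task if isinstance(task, list) else [task]
    for item in items:
        assert isinstance(item.get("src"), str), "every item needs a string `src`"
        assert isinstance(item.get("tgt"), dict) and item["tgt"], \
            "every item needs a `tgt` dict mapping model names to outputs"
        if "item_id" not in item:
            print("note: item without item_id:", item["src"][:40])
```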
--- pearmut-0.3.2/pearmut.egg-info/SOURCES.txt
+++ pearmut-1.0.0/pearmut.egg-info/SOURCES.txt
@@ -10,11 +10,13 @@ pearmut.egg-info/top_level.txt
 server/app.py
 server/assignment.py
 server/cli.py
+server/results_export.py
 server/utils.py
 server/static/basic.bundle.js
 server/static/basic.html
 server/static/dashboard.bundle.js
 server/static/dashboard.html
 server/static/favicon.svg
+server/static/index.bundle.js
 server/static/index.html
 server/static/style.css
--- pearmut-0.3.2/pyproject.toml
+++ pearmut-1.0.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pearmut"
-version = "0.3.2"
+version = "1.0.0"
 description = "A tool for evaluation of model outputs, primarily MT."
 readme = "README.md"
 license = { text = "MIT" }
@@ -17,6 +17,7 @@ dependencies = [
     "uvicorn >= 0.29.0",
     "wonderwords >= 3.0.0",
     "psutil >= 7.1.0",
+    "typst >= 0.14.4",
 ]
 
 [project.optional-dependencies]
--- pearmut-0.3.2/server/app.py
+++ pearmut-1.0.0/server/app.py
@@ -1,20 +1,23 @@
-import collections
 import json
 import os
-import statistics
 from typing import Any
 
 from fastapi import FastAPI, Query
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, Response
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 
 from .assignment import get_i_item, get_next_item, reset_task, update_progress
+from .results_export import (
+    compute_model_scores,
+    generate_latex_table,
+    generate_pdf,
+    generate_typst_table,
+)
 from .utils import (
     ROOT,
     check_validation_threshold,
-    get_db_log,
     load_progress_data,
     save_db_payload,
     save_progress_data,
@@ -159,7 +162,7 @@ async def _dashboard_data(request: DashboardDataRequest):
 
     progress_new = {}
     assignment = tasks_data[campaign_id]["info"]["assignment"]
-    if assignment not in ["task-based", "single-stream"]:
+    if assignment not in ["task-based", "single-stream", "dynamic"]:
        return JSONResponse(
            content="Unsupported campaign assignment type", status_code=400
        )
@@ -211,31 +214,47 @@ async def _dashboard_results(request: DashboardResultsRequest):
     if token != tasks_data[campaign_id]["token"]:
         return JSONResponse(content="Invalid token", status_code=400)
 
-
-    model_scores = collections.defaultdict(dict)
-
-    # Iterate through all tasks to find items with 'models' field (basic template)
-    log = get_db_log(campaign_id)
-    for entry in log:
-        if "item" not in entry or "annotation" not in entry:
-            continue
-        for item, annotation in zip(entry["item"], entry["annotation"]):
-            for model, annotation in annotation.items():
-                if "score" in annotation:
-                    model_scores[model][json.dumps(item)] = annotation["score"]
-
-    results = [
-        {
-            "model": model,
-            "score": statistics.mean(scores.values()),
-            "count": len(scores),
-        }
-        for model, scores in model_scores.items()
-    ]
-    results.sort(key=lambda x: x["score"], reverse=True)
+    results = compute_model_scores(campaign_id)
     return JSONResponse(content=results, status_code=200)
 
 
+@app.get("/export-results")
+async def _export_results(
+    campaign_id: str = Query(),
+    token: str = Query(),
+    format: str = Query(),
+):
+    if campaign_id not in progress_data:
+        return JSONResponse(content="Unknown campaign ID", status_code=400)
+
+    # Check if token is valid
+    if token != tasks_data[campaign_id]["token"]:
+        return JSONResponse(content="Invalid token", status_code=400)
+
+    results = compute_model_scores(campaign_id)
+
+    if format == "typst":
+        content = generate_typst_table(results)
+        return Response(
+            content=content,
+            media_type="text/plain",
+        )
+    elif format == "latex":
+        content = generate_latex_table(results)
+        return Response(
+            content=content,
+            media_type="text/plain",
+        )
+    elif format == "pdf":
+        pdf_bytes = generate_pdf(results, campaign_id)
+        return Response(
+            content=pdf_bytes,
+            media_type="application/pdf",
+        )
+    else:
+        return JSONResponse(content="Invalid export format", status_code=400)
+
+
 class ResetTaskRequest(BaseModel):
     campaign_id: str
     user_id: str
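The inline scoring logic removed from `_dashboard_results` above is what moved into the new `server/results_export.py` (alongside the table and PDF generators). The sketch below reconstructs that logic from the deleted lines only, so it shows what `compute_model_scores` presumably computes rather than the new module's actual code; it takes the log as an argument to stay self-contained, whereas the real function takes a campaign ID.

```python
import collections
import json
import statistics


def compute_model_scores_sketch(log):
    """Reconstruction of the deleted inline logic; the real compute_model_scores
    in server/results_export.py takes a campaign_id and reads the log itself."""
    model_scores = collections.defaultdict(dict)
    for entry in log:
        if "item" not in entry or "annotation" not in entry:
            continue
        for item, annotations in zip(entry["item"], entry["annotation"]):
            for model, annotation in annotations.items():
                if "score" in annotation:
                    # One score per (model, item), keyed by the serialized item.
                    model_scores[model][json.dumps(item)] = annotation["score"]

    results = [
        {"model": model, "score": statistics.mean(scores.values()), "count": len(scores)}
        for model, scores in model_scores.items()
    ]
    results.sort(key=lambda x: x["score"], reverse=True)
    return results


# Tiny demo log entry with one item annotated for one model.
example_log = [{"item": [{"src": "x"}], "annotation": [{"modelA": {"score": 80}}]}]
print(compute_model_scores_sketch(example_log))
```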
@@ -284,7 +303,9 @@ async def _download_annotations(
     return JSONResponse(
         content=output,
         status_code=200,
-        headers={
+        headers={
+            "Content-Disposition": 'attachment; filename="annotations.json"',
+        },
     )
 
 
@@ -312,7 +333,9 @@ async def _download_progress(
     return JSONResponse(
         content=output,
         status_code=200,
-        headers={
+        headers={
+            "Content-Disposition": 'attachment; filename="progress.json"',
+        },
     )
 
 
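For reference, the `/export-results` endpoint added in this version takes `campaign_id`, `token`, and `format` (`typst`, `latex`, or `pdf`) as query parameters. A client-side example is sketched below; the host, port, campaign ID, and token are placeholders, and the `requests` library is assumed to be installed.

```python
import requests

# Placeholder server address and credentials; only the parameter names come from the diff above.
resp = requests.get(
    "http://localhost:8000/export-results",
    params={
        "campaign_id": "my dynamic campaign",
        "token": "DASHBOARD-TOKEN",
        "format": "pdf",  # or "typst" / "latex" for a plain-text table
    },
)
resp.raise_for_status()

with open("results.pdf", "wb") as f:
    f.write(resp.content)
```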