pearmut 1.0.0.tar.gz → 1.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pearmut-1.0.0 → pearmut-1.0.1}/PKG-INFO +101 -11
- {pearmut-1.0.0 → pearmut-1.0.1}/README.md +100 -10
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/PKG-INFO +101 -11
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/SOURCES.txt +1 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/pyproject.toml +1 -1
- {pearmut-1.0.0 → pearmut-1.0.1}/server/app.py +73 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/assignment.py +70 -17
- {pearmut-1.0.0 → pearmut-1.0.1}/server/cli.py +209 -136
- pearmut-1.0.1/server/constants.py +93 -0
- pearmut-1.0.1/server/static/basic.bundle.js +1 -0
- pearmut-1.0.1/server/static/basic.html +133 -0
- pearmut-1.0.1/server/static/dashboard.bundle.js +1 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/static/dashboard.html +1 -1
- {pearmut-1.0.0 → pearmut-1.0.1}/server/static/index.html +1 -1
- {pearmut-1.0.0 → pearmut-1.0.1}/server/utils.py +1 -13
- pearmut-1.0.0/server/static/basic.bundle.js +0 -1
- pearmut-1.0.0/server/static/basic.html +0 -97
- pearmut-1.0.0/server/static/dashboard.bundle.js +0 -1
- {pearmut-1.0.0 → pearmut-1.0.1}/LICENSE +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/dependency_links.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/entry_points.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/requires.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/top_level.txt +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/results_export.py +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/static/favicon.svg +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/static/index.bundle.js +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/server/static/style.css +0 -0
- {pearmut-1.0.0 → pearmut-1.0.1}/setup.cfg +0 -0

{pearmut-1.0.0 → pearmut-1.0.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pearmut
-Version: 1.0.0
+Version: 1.0.1
 Summary: A tool for evaluation of model outputs, primarily MT.
 Author-email: Vilém Zouhar <vilem.zouhar@gmail.com>
 License: MIT
@@ -19,17 +19,10 @@ Provides-Extra: dev
 Requires-Dist: pytest; extra == "dev"
 Dynamic: license-file

-# Pearmut
+# 🍐Pearmut [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

-[](https://pypi.org/project/pearmut)
-
-[](https://pypi.python.org/pypi/pearmut/)
-
-[](https://pypi.org/project/pearmut/)
-
-[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />

@@ -52,6 +45,8 @@ Dynamic: license-file
 - [Terminology](#terminology)
 - [Development](#development)
 - [Citation](#citation)
+- [Changelog](#changelog)
+

 ## Quick Start

@@ -111,7 +106,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
 }
 ```

-Each item has to have `
+Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+If neither `src` nor `ref` is provided, only the model outputs will be displayed.
 For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
 Any other keys that you add will simply be stored in the logs.

@@ -145,6 +142,40 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```

+### Custom Score Sliders
+
+For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+```python
+{
+  "info": {
+    "assignment": "task-based",
+    "protocol": "ESA",
+    "sliders": [
+      {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+      {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+    ]
+  },
+  "campaign_id": "my_campaign",
+  "data": [...]
+}
+```
+
+When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+### Custom Instructions
+
+Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+```python
+{
+  "info": {
+    "protocol": "DA",
+    "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+  }
+}
+```
+
 ### Pre-filled Error Spans (ESA<sup>AI</sup>)

 Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -263,7 +294,7 @@ All items must contain outputs from all models for this assignment type to work
 **How it works:**
 1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
 2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
-3. Contrastive
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
 4. Item prioritization: Items with the least annotations for the selected models are prioritized
 5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration

@@ -289,6 +320,7 @@ The `users` field accepts:
 }
 ```

+
 ### Multimodal Annotations

 Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -418,3 +450,61 @@ If you use this work in your paper, please cite as following.
 ```

 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+
+# Changelog
+
+- v1.0.1
+  - Support RTL languages
+  - Add boxes for references
+  - Add custom score sliders for multi-dimensional evaluation
+  - Make instructions customizable and protocol-dependent
+  - Support custom sliders
+  - Purge/reset whole tasks from dashboard
+  - Fix resetting individual users in single-stream/dynamic
+  - Fix notification stacking
+  - Add campaigns from dashboard
+- v0.3.3
+  - Rename `doc_id` to `item_id`
+  - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
+  - Add dynamic assignment type with contrastive model comparison
+  - Add `instructions_goodbye` field with variable substitution
+  - Add visual anchors at 33% and 66% on sliders
+  - Add German→English ESA tutorial with attention checks
+  - Validate document model consistency before shuffle
+  - Fix UI block on any interaction
+- v0.3.2
+  - Revert seeding of user IDs
+  - Set ESA (Error Span Annotation) as default
+  - Update server IP address configuration
+  - Show approximate alignment by default
+  - Unify pointwise and listwise interfaces into `basic`
+  - Refactor protocol configuration (breaking change)
+- v0.2.11
+  - Add comment field in settings panel
+  - Add `score_gt` validation for listwise comparisons
+  - Add Content-Disposition headers for proper download filenames
+  - Add model results display to dashboard with rankings
+  - Add campaign file structure validation
+  - Purge command now unlinks assets
+- v0.2.6
+  - Add frozen annotation links feature for view-only mode
+  - Add word-level annotation mode toggle for error spans
+  - Add `[missing]` token support
+  - Improve frontend speed and cleanup toolboxes on item load
+  - Host assets via symlinks
+  - Add validation threshold for success/fail tokens
+  - Implement reset masking for annotations
+  - Allow pre-defined user IDs and tokens in campaign data
+- v0.1.1
+  - Set server defaults and add VM launch scripts
+  - Add warning dialog when navigating away with unsaved work
+  - Add tutorial validation support for pointwise and listwise
+  - Add ability to preview existing annotations via progress bar
+  - Add support for ESA<sup>AI</sup> pre-filled error_spans
+  - Rename pairwise to listwise and update layout
+  - Implement single-stream assignment type
+- v0.0.3
+  - Support multimodal inputs and outputs
+  - Add dashboard
+  - Implement ESA (Error Span Annotation) and MQM support
+

{pearmut-1.0.0 → pearmut-1.0.1}/README.md

@@ -1,14 +1,7 @@
-# Pearmut
+# 🍐Pearmut [](https://pypi.org/project/pearmut) [](https://pypi.python.org/pypi/pearmut/) [](https://pypi.org/project/pearmut/) [](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

 **Platform for Evaluation and Reviewing of Multilingual Tasks**: Evaluate model outputs for translation and NLP tasks with support for multimodal data (text, video, audio, images) and multiple annotation protocols ([DA](https://aclanthology.org/N15-1124/), [ESA](https://aclanthology.org/2024.wmt-1.131/), [ESA<sup>AI</sup>](https://aclanthology.org/2025.naacl-long.255/), [MQM](https://doi.org/10.1162/tacl_a_00437), and more!).

-[](https://pypi.org/project/pearmut)
-
-[](https://pypi.python.org/pypi/pearmut/)
-
-[](https://pypi.org/project/pearmut/)
-
-[](https://github.com/zouharvi/pearmut/actions/workflows/test.yml)

 <img width="1000" alt="Screenshot of ESA/MQM interface" src="https://github.com/user-attachments/assets/71334238-300b-4ffc-b777-7f3c242b1630" />

@@ -31,6 +24,8 @@
 - [Terminology](#terminology)
 - [Development](#development)
 - [Citation](#citation)
+- [Changelog](#changelog)
+

 ## Quick Start

@@ -90,7 +85,9 @@ Campaigns are defined in JSON files (see [examples/](examples/)). The simplest c
 }
 ```

-Each item has to have `
+Each item has to have `tgt` (dictionary from model names to strings, even for a single model evaluation).
+Optionally, you can also include `src` (source string) and/or `ref` (reference string).
+If neither `src` nor `ref` is provided, only the model outputs will be displayed.
 For full Pearmut functionality (e.g. automatic statistical analysis), add `item_id` as well.
 Any other keys that you add will simply be stored in the logs.

@@ -124,6 +121,40 @@ The `shuffle` parameter in campaign `info` controls this behavior:
 }
 ```

+### Custom Score Sliders
+
+For multi-dimensional evaluation tasks (e.g., assessing fluency on a Likert scale), you can define custom sliders with specific ranges and steps:
+
+```python
+{
+  "info": {
+    "assignment": "task-based",
+    "protocol": "ESA",
+    "sliders": [
+      {"name": "Fluency", "min": 0, "max": 5, "step": 1},
+      {"name": "Adequacy", "min": 0, "max": 100, "step": 1}
+    ]
+  },
+  "campaign_id": "my_campaign",
+  "data": [...]
+}
+```
+
+When `sliders` is specified, only the custom sliders are shown. Each slider must have `name`, `min`, `max`, and `step` properties. All sliders must be answered before proceeding.
+
+### Custom Instructions
+
+Set campaign-level instructions using the `instructions` field in `info` (supports HTML).
+Instructions default to protocol-specific ones (DA: scoring, ESA: error spans + scoring, MQM: error spans + categories + scoring).
+```python
+{
+  "info": {
+    "protocol": "DA",
+    "instructions": "Rate translation quality on a 0-100 scale.<br>Pay special attention to document-level phenomena."
+  }
+}
+```
+
 ### Pre-filled Error Spans (ESA<sup>AI</sup>)

 Include `error_spans` to pre-fill annotations that users can review, modify, or delete:
@@ -242,7 +273,7 @@ All items must contain outputs from all models for this assignment type to work
 **How it works:**
 1. Initial phase: Each model gets `dynamic_first` annotations with fully random contrastive evaluation
 2. Dynamic phase: After the initial phase, top `dynamic_top` models (by average score) are identified
-3. Contrastive
+3. Contrastive evaluation: From the top N models, `dynamic_contrastive_models` models are randomly selected for each item
 4. Item prioritization: Items with the least annotations for the selected models are prioritized
 5. Backoff: With probability `dynamic_backoff`, uniform random selection is used instead to maintain exploration

@@ -268,6 +299,7 @@ The `users` field accepts:
 }
 ```

+
 ### Multimodal Annotations

 Support for HTML-compatible elements (YouTube embeds, `<video>` tags, images). Ensure elements are pre-styled. See [examples/multimodal.json](examples/multimodal.json).
@@ -397,3 +429,61 @@ If you use this work in your paper, please cite as following.
 ```

 Contributions are welcome! Please reach out to [Vilém Zouhar](mailto:vilem.zouhar@gmail.com).
+
+# Changelog
+
+- v1.0.1
+  - Support RTL languages
+  - Add boxes for references
+  - Add custom score sliders for multi-dimensional evaluation
+  - Make instructions customizable and protocol-dependent
+  - Support custom sliders
+  - Purge/reset whole tasks from dashboard
+  - Fix resetting individual users in single-stream/dynamic
+  - Fix notification stacking
+  - Add campaigns from dashboard
+- v0.3.3
+  - Rename `doc_id` to `item_id`
+  - Add Typst, LaTeX, and PDF export for model ranking tables. Hide them by default.
+  - Add dynamic assignment type with contrastive model comparison
+  - Add `instructions_goodbye` field with variable substitution
+  - Add visual anchors at 33% and 66% on sliders
+  - Add German→English ESA tutorial with attention checks
+  - Validate document model consistency before shuffle
+  - Fix UI block on any interaction
+- v0.3.2
+  - Revert seeding of user IDs
+  - Set ESA (Error Span Annotation) as default
+  - Update server IP address configuration
+  - Show approximate alignment by default
+  - Unify pointwise and listwise interfaces into `basic`
+  - Refactor protocol configuration (breaking change)
+- v0.2.11
+  - Add comment field in settings panel
+  - Add `score_gt` validation for listwise comparisons
+  - Add Content-Disposition headers for proper download filenames
+  - Add model results display to dashboard with rankings
+  - Add campaign file structure validation
+  - Purge command now unlinks assets
+- v0.2.6
+  - Add frozen annotation links feature for view-only mode
+  - Add word-level annotation mode toggle for error spans
+  - Add `[missing]` token support
+  - Improve frontend speed and cleanup toolboxes on item load
+  - Host assets via symlinks
+  - Add validation threshold for success/fail tokens
+  - Implement reset masking for annotations
+  - Allow pre-defined user IDs and tokens in campaign data
+- v0.1.1
+  - Set server defaults and add VM launch scripts
+  - Add warning dialog when navigating away with unsaved work
+  - Add tutorial validation support for pointwise and listwise
+  - Add ability to preview existing annotations via progress bar
+  - Add support for ESA<sup>AI</sup> pre-filled error_spans
+  - Rename pairwise to listwise and update layout
+  - Implement single-stream assignment type
+- v0.0.3
+  - Support multimodal inputs and outputs
+  - Add dashboard
+  - Implement ESA (Error Span Annotation) and MQM support
+
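The README changes above introduce the campaign item format: `tgt` is required (a dictionary from model names to strings), while `src`, `ref`, and `item_id` are optional. As an illustration only, a minimal campaign file consistent with that description could look like the sketch below; the model names, text values, and `info` settings are invented for the example and are not taken from the package.

```python
{
  "campaign_id": "example_campaign",
  "info": {"assignment": "task-based", "protocol": "ESA"},
  "data": [
    {
      "item_id": "doc1",
      "src": "Der Hund schläft auf dem Sofa.",
      "ref": "The dog is sleeping on the sofa.",
      "tgt": {
        "model_a": "The dog sleeps on the couch.",
        "model_b": "The dog is asleep on the sofa."
      }
    }
  ]
}
```

Per the README text in the diff, any additional keys on an item are simply stored in the logs.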

{pearmut-1.0.0 → pearmut-1.0.1}/pearmut.egg-info/PKG-INFO (identical to the PKG-INFO diff above)

{pearmut-1.0.0 → pearmut-1.0.1}/server/app.py

@@ -280,6 +280,79 @@ async def _reset_task(request: ResetTaskRequest):
     return response


+class PurgeCampaignRequest(BaseModel):
+    campaign_id: str
+    token: str
+
+
+@app.post("/purge-campaign")
+async def _purge_campaign(request: PurgeCampaignRequest):
+    global progress_data, tasks_data
+
+    campaign_id = request.campaign_id
+    token = request.token
+
+    if campaign_id not in progress_data:
+        return JSONResponse(content="Unknown campaign ID", status_code=400)
+    if token != tasks_data[campaign_id]["token"]:
+        return JSONResponse(content="Invalid token", status_code=400)
+
+    # Unlink assets if they exist
+    destination = tasks_data[campaign_id].get("info", {}).get("assets", {}).get("destination")
+    if destination:
+        symlink_path = f"{ROOT}/data/{destination}".rstrip("/")
+        if os.path.islink(symlink_path):
+            os.remove(symlink_path)
+
+    # Remove task file
+    task_file = f"{ROOT}/data/tasks/{campaign_id}.json"
+    if os.path.exists(task_file):
+        os.remove(task_file)
+
+    # Remove output file
+    output_file = f"{ROOT}/data/outputs/{campaign_id}.jsonl"
+    if os.path.exists(output_file):
+        os.remove(output_file)
+
+    # Remove from in-memory data structures
+    del tasks_data[campaign_id]
+    del progress_data[campaign_id]
+
+    # Save updated progress data
+    save_progress_data(progress_data)
+
+    return JSONResponse(content="ok", status_code=200)
+
+
+class AddCampaignRequest(BaseModel):
+    campaign_data: dict[str, Any]
+
+
+@app.post("/add-campaign")
+async def _add_campaign(request: AddCampaignRequest):
+    global progress_data, tasks_data
+
+    from .cli import _add_single_campaign
+
+    try:
+        server = f"{os.environ.get('PEARMUT_SERVER_URL', 'http://localhost:8001')}"
+        _add_single_campaign(request.campaign_data, overwrite=False, server=server)
+
+        campaign_id = request.campaign_data['campaign_id']
+        with open(f"{ROOT}/data/tasks/{campaign_id}.json", "r") as f:
+            tasks_data[campaign_id] = json.load(f)
+
+        progress_data = load_progress_data(warn=None)
+
+        return JSONResponse(content={
+            "status": "ok",
+            "campaign_id": campaign_id,
+            "token": tasks_data[campaign_id]["token"]
+        }, status_code=200)
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)}, status_code=400)
+
+
 @app.get("/download-annotations")
 async def _download_annotations(
     campaign_id: list[str] = Query(),
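The app.py changes above add two dashboard-facing endpoints: `POST /purge-campaign`, which removes a campaign's task file, output file, and asset symlink after checking its token, and `POST /add-campaign`, which registers a campaign from a JSON payload and returns its token. Below is a rough usage sketch with the third-party `requests` library; it assumes a local server on port 8001 (the default URL seen in the code above) and uses an invented example campaign, so the URL, payload, and token handling would need to be adapted to a real deployment.

```python
import requests

BASE = "http://localhost:8001"  # default server URL seen in the diff above; adjust if deployed elsewhere

# Register a new campaign; the response carries the campaign_id and its access token.
campaign = {
    "campaign_id": "example_campaign",
    "info": {"protocol": "ESA"},
    "data": [{"item_id": "doc1", "tgt": {"model_a": "The dog sleeps on the couch."}}],
}
resp = requests.post(f"{BASE}/add-campaign", json={"campaign_data": campaign})
resp.raise_for_status()
token = resp.json()["token"]  # response: {"status": "ok", "campaign_id": ..., "token": ...}

# Later, remove the campaign (task file, outputs, asset symlink), authenticating with that token.
resp = requests.post(
    f"{BASE}/purge-campaign",
    json={"campaign_id": "example_campaign", "token": token},
)
print(resp.status_code, resp.json())  # 200 and "ok" on success
```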