EuroEval: euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
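The listing shows two module renames that downstream code may need to account for: `euroeval/metrics.py` is replaced by the `euroeval/metrics/` package, and `euroeval/tokenization_utils.py` becomes `euroeval/tokenisation_utils.py`. A minimal compatibility sketch, assuming only the module paths changed between the two releases (the helper name below is illustrative and not part of either EuroEval API):

```python
# Compatibility sketch for the rename visible in the file listing above.
# Assumption: only the module path changed between 15.12.0 and 16.7.1.
import importlib
import importlib.util


def load_tokenisation_module():
    """Return whichever tokenisation utility module the installed EuroEval ships."""
    for name in ("euroeval.tokenisation_utils", "euroeval.tokenization_utils"):
        if importlib.util.find_spec(name) is not None:
            return importlib.import_module(name)
    raise ImportError("No EuroEval tokenisation utilities module found.")
```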
euroeval/human_evaluation.py
DELETED
@@ -1,737 +0,0 @@
"""Gradio app for conducting human evaluation of the tasks."""

import importlib.util
import json
import logging
import typing as t
from collections import defaultdict
from functools import partial
from pathlib import Path

import click
from datasets import Dataset

from .benchmark_config_factory import build_benchmark_config
from .data_loading import load_data
from .data_models import BenchmarkResult, GenerativeModelOutput
from .dataset_configs import SPEED_CONFIG, get_all_dataset_configs
from .enums import GenerativeType, TaskGroup
from .exceptions import NeedsExtraInstalled
from .scores import aggregate_scores
from .task_group_utils import (
    question_answering,
    sequence_classification,
    text_to_text,
    token_classification,
)
from .tasks import NER
from .utils import enforce_reproducibility

if importlib.util.find_spec("gradio") is not None:
    import gradio as gr
    from gradio.components import HTML, Button, Dropdown, Markdown, Textbox

if t.TYPE_CHECKING:
    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict

logger = logging.getLogger("euroeval")


class HumanEvaluator:
    """An app for evaluating human performance on the EuroEval benchmark."""

    def __init__(
        self,
        annotator_id: int,
        title: str,
        description: str,
        dummy_model_id: str = "mistralai/Mistral-7B-v0.1",
    ) -> None:
        """Initialise the HumanEvaluator.

        Args:
            annotator_id:
                The annotator ID for the evaluation.
            title:
                The title of the app.
            description:
                The description of the app.
            dummy_model_id:
                The model ID to use for generating prompts.
        """
        self.annotator_id = annotator_id
        self.title = title
        self.description = description
        self.dummy_model_id = dummy_model_id

        self.sample_idx: int
        self.active_dataset: Dataset

        self.dataset_configs = {
            name: cfg
            for name, cfg in get_all_dataset_configs().items()
            if not cfg.unofficial
        }
        self.tasks = sorted(
            {
                cfg.task.name.replace("-", " ").title()
                for cfg in self.dataset_configs.values()
                if cfg != SPEED_CONFIG
            }
        )
        self.languages = sorted(
            {
                language.name
                for cfg in self.dataset_configs.values()
                if cfg != SPEED_CONFIG
                for language in cfg.languages
                if language.name not in {"Norwegian Bokmål", "Norwegian Nynorsk"}
            }
        )

        self.extract_labels_from_generation: "ExtractLabelsFunction"
        self.compute_metrics: "ComputeMetricsFunction"

    def create_app(self) -> "gr.Blocks":
        """Create the Gradio app for human evaluation.

        Returns:
            The Gradio app for human evaluation.
        """
        with gr.Blocks(title=self.title, theme=gr.themes.Monochrome()) as app:
            HTML(f"<center><h1>{self.title}</h1></center>")
            Markdown(self.description)
            with gr.Row(variant="panel"):
                language_dropdown = Dropdown(label="Language", choices=self.languages)
                task_dropdown = Dropdown(label="Task", choices=self.tasks)
                dataset_dropdown = Dropdown(label="Dataset", choices=[""])
            with gr.Row(variant="panel"):
                with gr.Column():
                    task_examples = Markdown("Task Examples", visible=False)
                with gr.Column():
                    question = Markdown(label="Question", visible=False)
                    with gr.Row():
                        ner_tag_dropdown = Dropdown(
                            label="Entity type",
                            choices=[""],
                            interactive=True,
                            visible=False,
                            scale=0.5,  # type: ignore[arg-type]
                        )
                        ner_tag_answer = Textbox(
                            label="Entity", interactive=True, visible=False, scale=1
                        )
                        with gr.Column(scale=0.2):  # type: ignore[arg-type]
                            ner_tag_add_button = Button("Add entity", visible=False)
                            ner_tag_reset_button = Button(
                                "Reset entities", visible=False
                            )
                    answer = Textbox(label="Answer", visible=False)
                    submit_button = Button("Submit", visible=False)

            language_dropdown.change(
                fn=self.update_dataset_choices,
                inputs=[language_dropdown, task_dropdown],
                outputs=dataset_dropdown,
            )
            task_dropdown.change(
                fn=self.update_dataset_choices,
                inputs=[language_dropdown, task_dropdown],
                outputs=dataset_dropdown,
            )
            dataset_dropdown.change(
                fn=partial(self.update_dataset, iteration=self.annotator_id),
                inputs=dataset_dropdown,
                outputs=[
                    task_examples,
                    question,
                    ner_tag_dropdown,
                    ner_tag_answer,
                    ner_tag_add_button,
                    ner_tag_reset_button,
                    answer,
                    submit_button,
                ],
            )
            ner_tag_add_button.click(
                fn=self.add_entity_to_answer,
                inputs=[question, ner_tag_dropdown, ner_tag_answer, answer],
                outputs=[ner_tag_answer, answer],
            )
            ner_tag_answer.submit(
                fn=self.add_entity_to_answer,
                inputs=[question, ner_tag_dropdown, ner_tag_answer, answer],
                outputs=[ner_tag_answer, answer],
            )
            ner_tag_reset_button.click(fn=self.reset_entities, outputs=answer)
            submit_button.click(
                fn=partial(self.submit_answer, annotator_id=self.annotator_id),
                inputs=[dataset_dropdown, question, answer],
                outputs=[question, answer],
            )
            answer.submit(
                fn=partial(self.submit_answer, annotator_id=self.annotator_id),
                inputs=[dataset_dropdown, question, answer],
                outputs=[question, answer],
            )
        return app

    def update_dataset_choices(
        self, language: str | None, task: str | None
    ) -> "Dropdown":
        """Update the dataset choices based on the selected language and task.

        Args:
            language:
                The language selected by the user.
            task:
                The task selected by the user.

        Returns:
            A list of dataset names that match the selected language and task.
        """
        if language is None or task is None:
            return Dropdown(choices=[])

        dataset_configs = [
            cfg
            for cfg in get_all_dataset_configs().values()
            if language in {language.name for language in cfg.languages}
            and task.lower().replace(" ", "-") == cfg.task.name
            and not cfg.unofficial
        ]
        assert len(dataset_configs) > 0

        choices = sorted([cfg.name for cfg in dataset_configs])

        logger.info(
            f"User selected {language} and {task}, which resulted in the datasets "
            f"{choices}, with {choices[0]!r} being chosen by default."
        )

        return Dropdown(choices=choices, value=choices[0])

    def update_dataset(
        self, dataset_name: str, iteration: int
    ) -> (
        "tuple[Markdown, Markdown, Dropdown, Textbox, Button, Button, Textbox, Button]"
    ):
        """Update the dataset based on a selected dataset name.

        Args:
            dataset_name:
                The dataset name selected by the user.
            iteration:
                The iteration index of the datasets to evaluate.

        Returns:
            A tuple (task_examples, question, entity_type, entity, entity_add_button,
            entity_reset_button, answer, submit_button) for the selected dataset.
        """
        blank_answer = (
            Markdown("", visible=False),
            Markdown("", visible=False),
            Dropdown(visible=False),
            Textbox(visible=False),
            Button(visible=False),
            Button(visible=False),
            Textbox("", visible=False),
            Button(visible=False),
        )

        if not dataset_name:
            return blank_answer

        logger.info(f"User selected dataset {dataset_name} - loading dataset...")
        gr.Info(f"Loading dataset {dataset_name}...")

        benchmark_config = build_benchmark_config(
            progress_bar=False,
            save_results=True,
            task=None,
            dataset=None,
            language=[
                language.code
                for cfg in get_all_dataset_configs().values()
                for language in cfg.languages
                if not cfg.unofficial
            ],
            model_language=None,
            dataset_language=None,
            device=None,
            batch_size=1,
            raise_errors=False,
            cache_dir=".euroeval_cache",
            api_key=None,
            force=False,
            verbose=False,
            trust_remote_code=False,
            clear_model_cache=False,
            evaluate_test_split=False,
            few_shot=True,
            num_iterations=iteration + 1,
            api_base=None,
            api_version=None,
            gpu_memory_utilization=0.9,
            debug=False,
            run_with_cli=True,
            only_allow_safetensors=False,
        )
        self.dataset_config = get_all_dataset_configs()[dataset_name]

        # TODO: Is this needed?
        # model_id = f"human-{iteration}"
        # model_config = ModelConfig(
        #     model_id=model_id,
        #     revision="main",
        #     task="text-generation",
        #     languages=dataset_config.languages,
        #     model_type=ModelType.HUMAN,
        #     model_cache_dir=create_model_cache_dir(
        #         cache_dir=benchmark_config.cache_dir, model_id=model_id
        #     ),
        #     adapter_base_model_id=None,
        # )

        self.sample_idx = 0

        dataset_path = (
            Path(".euroeval_cache")
            / "human-evaluation"
            / dataset_name
            / f"human-{iteration}.csv"
        )
        if dataset_path.exists():
            active_dataset = Dataset.from_csv(str(dataset_path))
            assert isinstance(active_dataset, Dataset)
            self.active_dataset = active_dataset
            try:
                while self.active_dataset["answer"][self.sample_idx] is not None:
                    self.sample_idx += 1
            except IndexError:
                self.compute_and_log_scores()
                return blank_answer
        else:
            rng = enforce_reproducibility()
            datasets = load_data(
                rng=rng,
                dataset_config=self.dataset_config,
                benchmark_config=benchmark_config,
            )
            # TODO: Prepare data?
            self.active_dataset = (
                datasets[iteration]["test"]
                .remove_columns(
                    column_names=["input_ids", "attention_mask"],
                    new_fingerprint=datasets[iteration]["test"]._fingerprint,
                )
                .add_column(
                    name="answer",
                    column=[None] * len(datasets[iteration]["test"]),
                    new_fingerprint=datasets[iteration]["test"]._fingerprint,
                )
            )
        if self.dataset_config.task == NER:
            labels_in_train: set[str] = {
                tag
                for tag_list in self.active_dataset["labels"]
                for tag in tag_list
            }
            self.has_misc_tags = (
                "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
            )

        match self.dataset_config.task.task_group:
            case TaskGroup.SEQUENCE_CLASSIFICATION:
                self.compute_metrics = partial(
                    sequence_classification.compute_metrics,
                    dataset_config=self.dataset_config,
                )
                self.extract_labels_from_generation = partial(
                    sequence_classification.extract_labels_from_generation,
                    dataset_config=self.dataset_config,
                )
            case TaskGroup.TEXT_TO_TEXT:
                self.compute_metrics = partial(
                    text_to_text.compute_metrics,
                    dataset_config=self.dataset_config,
                    benchmark_config=benchmark_config,
                )
                self.extract_labels_from_generation = (
                    text_to_text.extract_labels_from_generation
                )
            case TaskGroup.TOKEN_CLASSIFICATION:
                self.compute_metrics = partial(
                    token_classification.compute_metrics,
                    has_misc_tags=self.has_misc_tags,
                    dataset_config=self.dataset_config,
                )
                self.extract_labels_from_generation = partial(
                    token_classification.extract_labels_from_generation,
                    dataset_config=self.dataset_config,
                )
            case TaskGroup.QUESTION_ANSWERING:
                self.compute_metrics = partial(
                    question_answering.compute_metrics,
                    dataset_config=self.dataset_config,
                )
                self.extract_labels_from_generation = (
                    question_answering.extract_labels_from_generation
                )
            case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                raise NotImplementedError
            case _:
                raise NotImplementedError(
                    f"Task group {self.dataset_config.task.task_group} is not "
                    "supported."
                )

        task_examples, question = self.example_to_markdown(
            example=self.active_dataset[self.sample_idx]
        )

        logger.info(
            f"Loaded dataset {dataset_name}, with the following task examples:\n\n"
            f"{task_examples}"
        )

        if self.dataset_config.task == NER:
            ner_tags = list()
            for ner_tag in self.dataset_config.prompt_label_mapping.values():
                if ner_tag not in ner_tags:
                    ner_tags.append(ner_tag)
            return (
                Markdown(task_examples, visible=True),
                Markdown(question, visible=True),
                Dropdown(
                    label="Entity type",
                    choices=ner_tags,
                    value=ner_tags[0],
                    visible=True,
                ),
                Textbox(label="Entity", interactive=True, visible=True),
                Button("Add entity", visible=True),
                Button("Reset entities", visible=True),
                Textbox(
                    json.dumps({ner_tag: [] for ner_tag in ner_tags}),
                    interactive=False,
                    visible=True,
                ),
                Button("Submit", visible=True),
            )
        else:
            return (
                Markdown(task_examples, visible=True),
                Markdown(question, visible=True),
                Dropdown(label="Entity type", choices=[], visible=False),
                Textbox(label="Entity", interactive=True, visible=False),
                Button("Add entity", visible=False),
                Button("Reset entities", visible=False),
                Textbox("", interactive=True, visible=True),
                Button("Submit", visible=True),
            )

    def add_entity_to_answer(
        self, question: str, entity_type: str, entity: str, answer: str
    ) -> "tuple[Textbox, Textbox]":
        """Add an entity to the answer.

        Args:
            question:
                The current question.
            entity_type:
                The entity type selected by the user.
            entity:
                The entity provided by the user.
            answer:
                The current answer.

        Returns:
            A tuple (entity, answer) with a (blank) entity and answer.
        """
        if not entity_type or not entity:
            return Textbox(""), Textbox("")

        if entity not in question:
            gr.Warning(
                f"The entity {entity!r} is not present in the question. Please "
                "write it *exactly* as it appears in the question."
            )
            return Textbox(entity), Textbox(answer)

        current_answer_obj = json.loads(answer)
        if entity not in current_answer_obj[entity_type]:
            current_answer_obj[entity_type].append(entity)

        answer = json.dumps(current_answer_obj)
        return Textbox(""), Textbox(answer)

    def reset_entities(self) -> "Textbox":
        """Reset the entities in the answer.

        Returns:
            A blank answer.
        """
        ner_tags = list()
        for ner_tag in self.dataset_config.prompt_label_mapping.values():
            if ner_tag not in ner_tags:
                ner_tags.append(ner_tag)
        return Textbox(json.dumps({ner_tag: [] for ner_tag in ner_tags}))

    def submit_answer(
        self, dataset_name: str, question: str, answer: str, annotator_id: int
    ) -> tuple[str, str]:
        """Submit an answer to the dataset.

        Args:
            dataset_name:
                The name of the dataset.
            question:
                The question for the dataset.
            answer:
                The answer to the question.
            annotator_id:
                The annotator ID for the evaluation.

        Returns:
            A tuple (question, answer), with `question` being the next question, and
            `answer` being an empty string.
        """
        if not answer:
            gr.Warning("Please provide an answer before submitting.")
            logger.info("User tried to submit without providing an answer.")
            return question, answer

        # Custom NER validation
        if self.dataset_config.task == NER:
            try:
                json.loads(answer)
            except json.JSONDecodeError:
                gr.Warning("Please provide a valid JSON object as an answer.")
                logger.info("User tried to submit an invalid JSON object as an answer.")
                return question, answer

            if not isinstance(json.loads(answer), dict):
                gr.Warning(
                    "Please provide a JSON object with a dictionary as an answer."
                )
                logger.info(
                    "User tried to submit a JSON object without a dictionary as an "
                    "answer."
                )
                return question, answer

            ner_tags = list(self.dataset_config.prompt_label_mapping.values())
            for ner_tag in ner_tags:
                if ner_tag not in json.loads(answer).keys():
                    gr.Warning(
                        f"Please provide a JSON object with the key {ner_tag!r}."
                    )
                    logger.info(
                        "User tried to submit a JSON object without the key "
                        f"{ner_tag!r}."
                    )
                    return question, answer

        samples_left = len(self.active_dataset) - self.sample_idx - 1
        if samples_left:
            gr.Info(f"Submitted - {samples_left} to go!")

        # Store the user's answer
        answers = self.active_dataset["answer"]
        answers[self.sample_idx] = answer
        self.active_dataset = self.active_dataset.remove_columns(
            column_names=["answer"], new_fingerprint=self.active_dataset._fingerprint
        ).add_column(
            name="answer",
            column=answers,
            new_fingerprint=self.active_dataset._fingerprint,
        )
        logger.info(
            f"User submitted the answer {answer!r} to the question {question!r}, with "
            f"sample index {self.sample_idx}."
        )

        dataset_path = (
            Path(".euroeval_cache")
            / "human-evaluation"
            / dataset_name
            / f"human-{annotator_id}.csv"
        )
        dataset_path.parent.mkdir(parents=True, exist_ok=True)
        self.active_dataset.to_csv(dataset_path)

        # Attempt to get the next question
        try:
            self.sample_idx += 1
            _, question = self.example_to_markdown(
                example=self.active_dataset[self.sample_idx]
            )

            if self.dataset_config.task == NER:
                ner_tags = list()
                for ner_tag in self.dataset_config.prompt_label_mapping.values():
                    if ner_tag not in ner_tags:
                        ner_tags.append(ner_tag)
                answer = json.dumps({ner_tag: [] for ner_tag in ner_tags})
            else:
                answer = ""

        # If we fail to get the next question it means that the user has finished
        # annotating the dataset, so we compute and log the scores
        except IndexError:
            self.compute_and_log_scores()
            question = ""
            answer = ""

        return question, answer

    def example_to_markdown(self, example: dict) -> tuple[str, str]:
        """Convert an example to a Markdown string.

        Args:
            example:
                The example to convert.

        Returns:
            A tuple (task_examples, question) for the example.
        """
        task_examples: str | list[str] = [
            sample.replace("\n", "\n\n")
            for sample in example["text"].split("\n\n")[:-1]
        ]
        task_examples = "\n\n**Example**\n\n".join(task_examples)

        question = "**Question**\n\n"
        question += "\n\n".join(example["text"].split("\n\n")[-1].split("\n")[:-1])
        question += "\n\n" + example["text"].split("\n\n")[-1].split("\n")[-1]

        return task_examples, question

    def compute_and_log_scores(self) -> None:
        """Computes and logs the scores for the dataset."""
        model_output = GenerativeModelOutput(sequences=self.active_dataset["answer"])

        active_dataset_dict = self.active_dataset.to_dict()
        assert isinstance(active_dataset_dict, dict)

        all_preds = self.extract_labels_from_generation(
            input_batch=active_dataset_dict, model_output=model_output
        )
        ground_truth = self.active_dataset["label"]
        itr_scores: dict[str, float] = self.compute_metrics(
            model_outputs_and_labels=(all_preds, ground_truth)
        )

        # We reverse the order, as the Info messages are printed in reverse order
        scores = list(itr_scores.items())
        scores.reverse()
        gr.Info(
            "If you want to evaluate another dataset then please select a new "
            "one from the menus."
        )
        for metric_name, score in scores:
            gr.Info(f"\n\n{metric_name}: {score:.2%}")
        gr.Info("You have completed this dataset! Here are your scores:")
        logger.info(
            f"User completed the dataset {self.dataset_config.name!r}"
            f", with the following scores: {itr_scores}"
        )

        # Load previous human results, if any. We do this since the human evaluation is
        # only a single iteration, so the results from the current annotation should be
        # added to the previous results.
        results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
        results: "ScoreDict" = defaultdict(list)
        if results_path.exists():
            all_results = [
                json.loads(line.strip())
                for line in results_path.read_text().strip().split("\n")
                if line.strip()
            ]
            human_result_candidates = [
                result
                for result in all_results
                if result["model"] == "human"
                and result["dataset"] == self.dataset_config.name
            ]
            if human_result_candidates:
                results = human_result_candidates[0]["results"]

        # Append to results
        results["raw"].append(  # type: ignore[union-attr]
            {f"test_{metric_name}": score for metric_name, score in itr_scores.items()}
        )

        # Aggregate scores
        total_dict: dict[str, float] = dict()
        for metric in self.dataset_config.task.metrics:
            test_score, test_se = aggregate_scores(
                scores=results["raw"],  # type: ignore[arg-type]
                metric=metric,
            )
            test_score, _ = metric.postprocessing_fn(test_score)
            test_se, _ = metric.postprocessing_fn(test_se)
            total_dict[f"test_{metric.name}"] = test_score
            total_dict[f"test_{metric.name}_se"] = test_se
        results["total"] = total_dict

        benchmark_result = BenchmarkResult(
            dataset=self.dataset_config.name,
            task=self.dataset_config.task.name,
            dataset_languages=[
                language.code for language in self.dataset_config.languages
            ],
            model="human",
            results=results,
            num_model_parameters=-1,
            max_sequence_length=-1,
            vocabulary_size=-1,
            merge=False,
            generative=True,
            generative_type=GenerativeType.INSTRUCTION_TUNED,
            few_shot=True,
            validation_split=True,
        )
        benchmark_result.append_to_results(results_path=results_path)


@click.command()
@click.option(
    "--annotator-id",
    "-id",
    type=int,
    required=True,
    help="""The annotator ID to use for the evaluation. Needs to be between 0 and 10,
    inclusive.""",
)
def main(annotator_id: int) -> None:
    """Start the Gradio app for human evaluation."""
    if importlib.util.find_spec("gradio") is None:
        raise NeedsExtraInstalled(extra="human_evaluation")

    evaluator = HumanEvaluator(
        annotator_id=annotator_id,
        title="EuroEval Human Evaluation",
        description="""
        In this app we will evaluate your performance on a variety of tasks, with the
        goal of comparing human performance to language model performance.

        When you select a language and a task then you will be given a brief
        description of the task, as well as examples of how to solve it. Please read
        through these examples before proceeding with the task.

        Please do not use any additional aids (such as search engines) when completing
        these tasks.

        Note that several examples appear more than once - this is intentional, as it
        allows us to compare your performance across multiple examples.

        Note that the Enter key will also submit your answer!
        """,
    )
    evaluator.create_app().queue().launch()


if __name__ == "__main__":
    main()