EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
--- euroeval/human_evaluation.py
+++ /dev/null
@@ -1,737 +0,0 @@
-"""Gradio app for conducting human evaluation of the tasks."""
-
-import importlib.util
-import json
-import logging
-import typing as t
-from collections import defaultdict
-from functools import partial
-from pathlib import Path
-
-import click
-from datasets import Dataset
-
-from .benchmark_config_factory import build_benchmark_config
-from .data_loading import load_data
-from .data_models import BenchmarkResult, GenerativeModelOutput
-from .dataset_configs import SPEED_CONFIG, get_all_dataset_configs
-from .enums import GenerativeType, TaskGroup
-from .exceptions import NeedsExtraInstalled
-from .scores import aggregate_scores
-from .task_group_utils import (
-    question_answering,
-    sequence_classification,
-    text_to_text,
-    token_classification,
-)
-from .tasks import NER
-from .utils import enforce_reproducibility
-
-if importlib.util.find_spec("gradio") is not None:
-    import gradio as gr
-    from gradio.components import HTML, Button, Dropdown, Markdown, Textbox
-
-if t.TYPE_CHECKING:
-    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
-
-logger = logging.getLogger("euroeval")
-
-
-class HumanEvaluator:
-    """An app for evaluating human performance on the EuroEval benchmark."""
-
-    def __init__(
-        self,
-        annotator_id: int,
-        title: str,
-        description: str,
-        dummy_model_id: str = "mistralai/Mistral-7B-v0.1",
-    ) -> None:
-        """Initialise the HumanEvaluator.
-
-        Args:
-            annotator_id:
-                The annotator ID for the evaluation.
-            title:
-                The title of the app.
-            description:
-                The description of the app.
-            dummy_model_id:
-                The model ID to use for generating prompts.
-        """
-        self.annotator_id = annotator_id
-        self.title = title
-        self.description = description
-        self.dummy_model_id = dummy_model_id
-
-        self.sample_idx: int
-        self.active_dataset: Dataset
-
-        self.dataset_configs = {
-            name: cfg
-            for name, cfg in get_all_dataset_configs().items()
-            if not cfg.unofficial
-        }
-        self.tasks = sorted(
-            {
-                cfg.task.name.replace("-", " ").title()
-                for cfg in self.dataset_configs.values()
-                if cfg != SPEED_CONFIG
-            }
-        )
-        self.languages = sorted(
-            {
-                language.name
-                for cfg in self.dataset_configs.values()
-                if cfg != SPEED_CONFIG
-                for language in cfg.languages
-                if language.name not in {"Norwegian Bokmål", "Norwegian Nynorsk"}
-            }
-        )
-
-        self.extract_labels_from_generation: "ExtractLabelsFunction"
-        self.compute_metrics: "ComputeMetricsFunction"
-
-    def create_app(self) -> "gr.Blocks":
-        """Create the Gradio app for human evaluation.
-
-        Returns:
-            The Gradio app for human evaluation.
-        """
-        with gr.Blocks(title=self.title, theme=gr.themes.Monochrome()) as app:
-            HTML(f"<center><h1>{self.title}</h1></center>")
-            Markdown(self.description)
-            with gr.Row(variant="panel"):
-                language_dropdown = Dropdown(label="Language", choices=self.languages)
-                task_dropdown = Dropdown(label="Task", choices=self.tasks)
-                dataset_dropdown = Dropdown(label="Dataset", choices=[""])
-            with gr.Row(variant="panel"):
-                with gr.Column():
-                    task_examples = Markdown("Task Examples", visible=False)
-                with gr.Column():
-                    question = Markdown(label="Question", visible=False)
-                    with gr.Row():
-                        ner_tag_dropdown = Dropdown(
-                            label="Entity type",
-                            choices=[""],
-                            interactive=True,
-                            visible=False,
-                            scale=0.5, # type: ignore[arg-type]
-                        )
-                        ner_tag_answer = Textbox(
-                            label="Entity", interactive=True, visible=False, scale=1
-                        )
-                        with gr.Column(scale=0.2): # type: ignore[arg-type]
-                            ner_tag_add_button = Button("Add entity", visible=False)
-                            ner_tag_reset_button = Button(
-                                "Reset entities", visible=False
-                            )
-                    answer = Textbox(label="Answer", visible=False)
-                    submit_button = Button("Submit", visible=False)
-
-            language_dropdown.change(
-                fn=self.update_dataset_choices,
-                inputs=[language_dropdown, task_dropdown],
-                outputs=dataset_dropdown,
-            )
-            task_dropdown.change(
-                fn=self.update_dataset_choices,
-                inputs=[language_dropdown, task_dropdown],
-                outputs=dataset_dropdown,
-            )
-            dataset_dropdown.change(
-                fn=partial(self.update_dataset, iteration=self.annotator_id),
-                inputs=dataset_dropdown,
-                outputs=[
-                    task_examples,
-                    question,
-                    ner_tag_dropdown,
-                    ner_tag_answer,
-                    ner_tag_add_button,
-                    ner_tag_reset_button,
-                    answer,
-                    submit_button,
-                ],
-            )
-            ner_tag_add_button.click(
-                fn=self.add_entity_to_answer,
-                inputs=[question, ner_tag_dropdown, ner_tag_answer, answer],
-                outputs=[ner_tag_answer, answer],
-            )
-            ner_tag_answer.submit(
-                fn=self.add_entity_to_answer,
-                inputs=[question, ner_tag_dropdown, ner_tag_answer, answer],
-                outputs=[ner_tag_answer, answer],
-            )
-            ner_tag_reset_button.click(fn=self.reset_entities, outputs=answer)
-            submit_button.click(
-                fn=partial(self.submit_answer, annotator_id=self.annotator_id),
-                inputs=[dataset_dropdown, question, answer],
-                outputs=[question, answer],
-            )
-            answer.submit(
-                fn=partial(self.submit_answer, annotator_id=self.annotator_id),
-                inputs=[dataset_dropdown, question, answer],
-                outputs=[question, answer],
-            )
-        return app
-
-    def update_dataset_choices(
-        self, language: str | None, task: str | None
-    ) -> "Dropdown":
-        """Update the dataset choices based on the selected language and task.
-
-        Args:
-            language:
-                The language selected by the user.
-            task:
-                The task selected by the user.
-
-        Returns:
-            A list of dataset names that match the selected language and task.
-        """
-        if language is None or task is None:
-            return Dropdown(choices=[])
-
-        dataset_configs = [
-            cfg
-            for cfg in get_all_dataset_configs().values()
-            if language in {language.name for language in cfg.languages}
-            and task.lower().replace(" ", "-") == cfg.task.name
-            and not cfg.unofficial
-        ]
-        assert len(dataset_configs) > 0
-
-        choices = sorted([cfg.name for cfg in dataset_configs])
-
-        logger.info(
-            f"User selected {language} and {task}, which resulted in the datasets "
-            f"{choices}, with {choices[0]!r} being chosen by default."
-        )
-
-        return Dropdown(choices=choices, value=choices[0])
-
-    def update_dataset(
-        self, dataset_name: str, iteration: int
-    ) -> (
-        "tuple[Markdown, Markdown, Dropdown, Textbox, Button, Button, Textbox, Button]"
-    ):
-        """Update the dataset based on a selected dataset name.
-
-        Args:
-            dataset_name:
-                The dataset name selected by the user.
-            iteration:
-                The iteration index of the datasets to evaluate.
-
-        Returns:
-            A tuple (task_examples, question, entity_type, entity, entity_add_button,
-            entity_reset_button, answer, submit_button) for the selected dataset.
-        """
-        blank_answer = (
-            Markdown("", visible=False),
-            Markdown("", visible=False),
-            Dropdown(visible=False),
-            Textbox(visible=False),
-            Button(visible=False),
-            Button(visible=False),
-            Textbox("", visible=False),
-            Button(visible=False),
-        )
-
-        if not dataset_name:
-            return blank_answer
-
-        logger.info(f"User selected dataset {dataset_name} - loading dataset...")
-        gr.Info(f"Loading dataset {dataset_name}...")
-
-        benchmark_config = build_benchmark_config(
-            progress_bar=False,
-            save_results=True,
-            task=None,
-            dataset=None,
-            language=[
-                language.code
-                for cfg in get_all_dataset_configs().values()
-                for language in cfg.languages
-                if not cfg.unofficial
-            ],
-            model_language=None,
-            dataset_language=None,
-            device=None,
-            batch_size=1,
-            raise_errors=False,
-            cache_dir=".euroeval_cache",
-            api_key=None,
-            force=False,
-            verbose=False,
-            trust_remote_code=False,
-            clear_model_cache=False,
-            evaluate_test_split=False,
-            few_shot=True,
-            num_iterations=iteration + 1,
-            api_base=None,
-            api_version=None,
-            gpu_memory_utilization=0.9,
-            debug=False,
-            run_with_cli=True,
-            only_allow_safetensors=False,
-        )
-        self.dataset_config = get_all_dataset_configs()[dataset_name]
-
-        # TODO: Is this needed?
-        # model_id = f"human-{iteration}"
-        # model_config = ModelConfig(
-        # model_id=model_id,
-        # revision="main",
-        # task="text-generation",
-        # languages=dataset_config.languages,
-        # model_type=ModelType.HUMAN,
-        # model_cache_dir=create_model_cache_dir(
-        # cache_dir=benchmark_config.cache_dir, model_id=model_id
-        # ),
-        # adapter_base_model_id=None,
-        # )
-
-        self.sample_idx = 0
-
-        dataset_path = (
-            Path(".euroeval_cache")
-            / "human-evaluation"
-            / dataset_name
-            / f"human-{iteration}.csv"
-        )
-        if dataset_path.exists():
-            active_dataset = Dataset.from_csv(str(dataset_path))
-            assert isinstance(active_dataset, Dataset)
-            self.active_dataset = active_dataset
-            try:
-                while self.active_dataset["answer"][self.sample_idx] is not None:
-                    self.sample_idx += 1
-            except IndexError:
-                self.compute_and_log_scores()
-                return blank_answer
-        else:
-            rng = enforce_reproducibility()
-            datasets = load_data(
-                rng=rng,
-                dataset_config=self.dataset_config,
-                benchmark_config=benchmark_config,
-            )
-            # TODO: Prepare data?
-            self.active_dataset = (
-                datasets[iteration]["test"]
-                .remove_columns(
-                    column_names=["input_ids", "attention_mask"],
-                    new_fingerprint=datasets[iteration]["test"]._fingerprint,
-                )
-                .add_column(
-                    name="answer",
-                    column=[None] * len(datasets[iteration]["test"]),
-                    new_fingerprint=datasets[iteration]["test"]._fingerprint,
-                )
-            )
-        if self.dataset_config.task == NER:
-            labels_in_train: set[str] = {
-                tag
-                for tag_list in self.active_dataset["labels"]
-                for tag in tag_list
-            }
-            self.has_misc_tags = (
-                "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
-            )
-
-        match self.dataset_config.task.task_group:
-            case TaskGroup.SEQUENCE_CLASSIFICATION:
-                self.compute_metrics = partial(
-                    sequence_classification.compute_metrics,
-                    dataset_config=self.dataset_config,
-                )
-                self.extract_labels_from_generation = partial(
-                    sequence_classification.extract_labels_from_generation,
-                    dataset_config=self.dataset_config,
-                )
-            case TaskGroup.TEXT_TO_TEXT:
-                self.compute_metrics = partial(
-                    text_to_text.compute_metrics,
-                    dataset_config=self.dataset_config,
-                    benchmark_config=benchmark_config,
-                )
-                self.extract_labels_from_generation = (
-                    text_to_text.extract_labels_from_generation
-                )
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                self.compute_metrics = partial(
-                    token_classification.compute_metrics,
-                    has_misc_tags=self.has_misc_tags,
-                    dataset_config=self.dataset_config,
-                )
-                self.extract_labels_from_generation = partial(
-                    token_classification.extract_labels_from_generation,
-                    dataset_config=self.dataset_config,
-                )
-            case TaskGroup.QUESTION_ANSWERING:
-                self.compute_metrics = partial(
-                    question_answering.compute_metrics,
-                    dataset_config=self.dataset_config,
-                )
-                self.extract_labels_from_generation = (
-                    question_answering.extract_labels_from_generation
-                )
-            case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
-                raise NotImplementedError
-            case _:
-                raise NotImplementedError(
-                    f"Task group {self.dataset_config.task.task_group} is not "
-                    "supported."
-                )
-
-        task_examples, question = self.example_to_markdown(
-            example=self.active_dataset[self.sample_idx]
-        )
-
-        logger.info(
-            f"Loaded dataset {dataset_name}, with the following task examples:\n\n"
-            f"{task_examples}"
-        )
-
-        if self.dataset_config.task == NER:
-            ner_tags = list()
-            for ner_tag in self.dataset_config.prompt_label_mapping.values():
-                if ner_tag not in ner_tags:
-                    ner_tags.append(ner_tag)
-            return (
-                Markdown(task_examples, visible=True),
-                Markdown(question, visible=True),
-                Dropdown(
-                    label="Entity type",
-                    choices=ner_tags,
-                    value=ner_tags[0],
-                    visible=True,
-                ),
-                Textbox(label="Entity", interactive=True, visible=True),
-                Button("Add entity", visible=True),
-                Button("Reset entities", visible=True),
-                Textbox(
-                    json.dumps({ner_tag: [] for ner_tag in ner_tags}),
-                    interactive=False,
-                    visible=True,
-                ),
-                Button("Submit", visible=True),
-            )
-        else:
-            return (
-                Markdown(task_examples, visible=True),
-                Markdown(question, visible=True),
-                Dropdown(label="Entity type", choices=[], visible=False),
-                Textbox(label="Entity", interactive=True, visible=False),
-                Button("Add entity", visible=False),
-                Button("Reset entities", visible=False),
-                Textbox("", interactive=True, visible=True),
-                Button("Submit", visible=True),
-            )
-
-    def add_entity_to_answer(
-        self, question: str, entity_type: str, entity: str, answer: str
-    ) -> "tuple[Textbox, Textbox]":
-        """Add an entity to the answer.
-
-        Args:
-            question:
-                The current question.
-            entity_type:
-                The entity type selected by the user.
-            entity:
-                The entity provided by the user.
-            answer:
-                The current answer.
-
-        Returns:
-            A tuple (entity, answer) with a (blank) entity and answer.
-        """
-        if not entity_type or not entity:
-            return Textbox(""), Textbox("")
-
-        if entity not in question:
-            gr.Warning(
-                f"The entity {entity!r} is not present in the question. Please "
-                "write it *exactly* as it appears in the question."
-            )
-            return Textbox(entity), Textbox(answer)
-
-        current_answer_obj = json.loads(answer)
-        if entity not in current_answer_obj[entity_type]:
-            current_answer_obj[entity_type].append(entity)
-
-        answer = json.dumps(current_answer_obj)
-        return Textbox(""), Textbox(answer)
-
-    def reset_entities(self) -> "Textbox":
-        """Reset the entities in the answer.
-
-        Returns:
-            A blank answer.
-        """
-        ner_tags = list()
-        for ner_tag in self.dataset_config.prompt_label_mapping.values():
-            if ner_tag not in ner_tags:
-                ner_tags.append(ner_tag)
-        return Textbox(json.dumps({ner_tag: [] for ner_tag in ner_tags}))
-
-    def submit_answer(
-        self, dataset_name: str, question: str, answer: str, annotator_id: int
-    ) -> tuple[str, str]:
-        """Submit an answer to the dataset.
-
-        Args:
-            dataset_name:
-                The name of the dataset.
-            question:
-                The question for the dataset.
-            answer:
-                The answer to the question.
-            annotator_id:
-                The annotator ID for the evaluation.
-
-        Returns:
-            A tuple (question, answer), with `question` being the next question, and
-            `answer` being an empty string.
-        """
-        if not answer:
-            gr.Warning("Please provide an answer before submitting.")
-            logger.info("User tried to submit without providing an answer.")
-            return question, answer
-
-        # Custom NER validation
-        if self.dataset_config.task == NER:
-            try:
-                json.loads(answer)
-            except json.JSONDecodeError:
-                gr.Warning("Please provide a valid JSON object as an answer.")
-                logger.info("User tried to submit an invalid JSON object as an answer.")
-                return question, answer
-
-            if not isinstance(json.loads(answer), dict):
-                gr.Warning(
-                    "Please provide a JSON object with a dictionary as an answer."
-                )
-                logger.info(
-                    "User tried to submit a JSON object without a dictionary as an "
-                    "answer."
-                )
-                return question, answer
-
-            ner_tags = list(self.dataset_config.prompt_label_mapping.values())
-            for ner_tag in ner_tags:
-                if ner_tag not in json.loads(answer).keys():
-                    gr.Warning(
-                        f"Please provide a JSON object with the key {ner_tag!r}."
-                    )
-                    logger.info(
-                        "User tried to submit a JSON object without the key "
-                        f"{ner_tag!r}."
-                    )
-                    return question, answer
-
-        samples_left = len(self.active_dataset) - self.sample_idx - 1
-        if samples_left:
-            gr.Info(f"Submitted - {samples_left} to go!")
-
-        # Store the user's answer
-        answers = self.active_dataset["answer"]
-        answers[self.sample_idx] = answer
-        self.active_dataset = self.active_dataset.remove_columns(
-            column_names=["answer"], new_fingerprint=self.active_dataset._fingerprint
-        ).add_column(
-            name="answer",
-            column=answers,
-            new_fingerprint=self.active_dataset._fingerprint,
-        )
-        logger.info(
-            f"User submitted the answer {answer!r} to the question {question!r}, with "
-            f"sample index {self.sample_idx}."
-        )
-
-        dataset_path = (
-            Path(".euroeval_cache")
-            / "human-evaluation"
-            / dataset_name
-            / f"human-{annotator_id}.csv"
-        )
-        dataset_path.parent.mkdir(parents=True, exist_ok=True)
-        self.active_dataset.to_csv(dataset_path)
-
-        # Attempt to get the next question
-        try:
-            self.sample_idx += 1
-            _, question = self.example_to_markdown(
-                example=self.active_dataset[self.sample_idx]
-            )
-
-            if self.dataset_config.task == NER:
-                ner_tags = list()
-                for ner_tag in self.dataset_config.prompt_label_mapping.values():
-                    if ner_tag not in ner_tags:
-                        ner_tags.append(ner_tag)
-                answer = json.dumps({ner_tag: [] for ner_tag in ner_tags})
-            else:
-                answer = ""
-
-        # If we fail to get the next question it means that the user has finished
-        # annotating the dataset, so we compute and log the scores
-        except IndexError:
-            self.compute_and_log_scores()
-            question = ""
-            answer = ""
-
-        return question, answer
-
-    def example_to_markdown(self, example: dict) -> tuple[str, str]:
-        """Convert an example to a Markdown string.
-
-        Args:
-            example:
-                The example to convert.
-
-        Returns:
-            A tuple (task_examples, question) for the example.
-        """
-        task_examples: str | list[str] = [
-            sample.replace("\n", "\n\n")
-            for sample in example["text"].split("\n\n")[:-1]
-        ]
-        task_examples = "\n\n**Example**\n\n".join(task_examples)
-
-        question = "**Question**\n\n"
-        question += "\n\n".join(example["text"].split("\n\n")[-1].split("\n")[:-1])
-        question += "\n\n" + example["text"].split("\n\n")[-1].split("\n")[-1]
-
-        return task_examples, question
-
-    def compute_and_log_scores(self) -> None:
-        """Computes and logs the scores for the dataset."""
-        model_output = GenerativeModelOutput(sequences=self.active_dataset["answer"])
-
-        active_dataset_dict = self.active_dataset.to_dict()
-        assert isinstance(active_dataset_dict, dict)
-
-        all_preds = self.extract_labels_from_generation(
-            input_batch=active_dataset_dict, model_output=model_output
-        )
-        ground_truth = self.active_dataset["label"]
-        itr_scores: dict[str, float] = self.compute_metrics(
-            model_outputs_and_labels=(all_preds, ground_truth)
-        )
-
-        # We reverse the order, as the Info messages are printed in reverse order
-        scores = list(itr_scores.items())
-        scores.reverse()
-        gr.Info(
-            "If you want to evaluate another dataset then please select a new "
-            "one from the menus."
-        )
-        for metric_name, score in scores:
-            gr.Info(f"\n\n{metric_name}: {score:.2%}")
-        gr.Info("You have completed this dataset! Here are your scores:")
-        logger.info(
-            f"User completed the dataset {self.dataset_config.name!r}"
-            f", with the following scores: {itr_scores}"
-        )
-
-        # Load previous human results, if any. We do this since the human evaluation is
-        # only a single iteration, so the results from the current annotation should be
-        # added to the previous results.
-        results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-        results: "ScoreDict" = defaultdict(list)
-        if results_path.exists():
-            all_results = [
-                json.loads(line.strip())
-                for line in results_path.read_text().strip().split("\n")
-                if line.strip()
-            ]
-            human_result_candidates = [
-                result
-                for result in all_results
-                if result["model"] == "human"
-                and result["dataset"] == self.dataset_config.name
-            ]
-            if human_result_candidates:
-                results = human_result_candidates[0]["results"]
-
-        # Append to results
-        results["raw"].append( # type: ignore[union-attr]
-            {f"test_{metric_name}": score for metric_name, score in itr_scores.items()}
-        )
-
-        # Aggregate scores
-        total_dict: dict[str, float] = dict()
-        for metric in self.dataset_config.task.metrics:
-            test_score, test_se = aggregate_scores(
-                scores=results["raw"], # type: ignore[arg-type]
-                metric=metric,
-            )
-            test_score, _ = metric.postprocessing_fn(test_score)
-            test_se, _ = metric.postprocessing_fn(test_se)
-            total_dict[f"test_{metric.name}"] = test_score
-            total_dict[f"test_{metric.name}_se"] = test_se
-        results["total"] = total_dict
-
-        benchmark_result = BenchmarkResult(
-            dataset=self.dataset_config.name,
-            task=self.dataset_config.task.name,
-            dataset_languages=[
-                language.code for language in self.dataset_config.languages
-            ],
-            model="human",
-            results=results,
-            num_model_parameters=-1,
-            max_sequence_length=-1,
-            vocabulary_size=-1,
-            merge=False,
-            generative=True,
-            generative_type=GenerativeType.INSTRUCTION_TUNED,
-            few_shot=True,
-            validation_split=True,
-        )
-        benchmark_result.append_to_results(results_path=results_path)
-
-
-@click.command()
-@click.option(
-    "--annotator-id",
-    "-id",
-    type=int,
-    required=True,
-    help="""The annotator ID to use for the evaluation. Needs to be between 0 and 10,
-    inclusive.""",
-)
-def main(annotator_id: int) -> None:
-    """Start the Gradio app for human evaluation."""
-    if importlib.util.find_spec("gradio") is None:
-        raise NeedsExtraInstalled(extra="human_evaluation")
-
-    evaluator = HumanEvaluator(
-        annotator_id=annotator_id,
-        title="EuroEval Human Evaluation",
-        description="""
-        In this app we will evaluate your performance on a variety of tasks, with the
-        goal of comparing human performance to language model performance.
-
-        When you select a language and a task then you will be given a brief
-        description of the task, as well as examples of how to solve it. Please read
-        through these examples before proceeding with the task.
-
-        Please do not use any additional aids (such as search engines) when completing
-        these tasks.
-
-        Note that several examples appear more than once - this is intentional, as it
-        allows us to compare your performance across multiple examples.
-
-        Note that the Enter key will also submit your answer!
-        """,
-    )
-    evaluator.create_app().queue().launch()
-
-
-if __name__ == "__main__":
-    main()