EuroEval 16.0.0__tar.gz → 16.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic according to the registry's automated checks.
- {euroeval-16.0.0 → euroeval-16.0.1}/CHANGELOG.md +20 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/PKG-INFO +3 -1
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/README.md +5 -15
- {euroeval-16.0.0 → euroeval-16.0.1}/makefile +3 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/pyproject.toml +3 -1
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/__init__.py +5 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/vllm.py +41 -28
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/constants.py +6 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/data_models.py +20 -16
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/danish.py +0 -3
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/generation_utils.py +44 -6
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/pipeline.py +50 -8
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_cache.py +13 -1
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/sequence_classification.py +66 -53
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/token_classification.py +14 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/tasks.py +9 -7
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/tokenization_utils.py +1 -2
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/utils.py +32 -1
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_european_values.py +33 -27
- {euroeval-16.0.0 → euroeval-16.0.1}/uv.lock +58 -1
- {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.github/workflows/ci.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.gitignore +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/.pre-commit-config.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/CITATION.cff +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/CONTRIBUTING.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/Dockerfile.cuda +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/LICENSE +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/README.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/CNAME +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/README.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/README.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/danish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/dutch.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/english.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/estonian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/faroese.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/finnish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/french.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/german.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/icelandic.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/italian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/latvian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/portuguese.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/spanish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/swedish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/faq.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/gfx/favicon.png +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/methodology.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/python-package.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/README.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/speed.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/summarization.md +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/gfx/euroeval.png +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/gfx/euroeval.xcf +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/gfx/scandeval.png +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/mkdocs.yaml +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/hf.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/litellm.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmarker.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/cli.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/data_loading.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/dutch.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/estonian.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/latvian.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/portuguese.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/enums.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/finetuning.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/generation.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/languages.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/base.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/huggingface.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/speed.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_config.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/scores.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/types.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/constants.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_allocine.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_arc.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_arc_is.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_belebele.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_en.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_es.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_copa_lv.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dane.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dansk.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dbrd.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_eltec.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_err_news.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_estner.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_estonian_valence.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_exam_et.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fone.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_foqa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fosent.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fquad.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fullstack_ner.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_germanquad.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_germeval.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_grammar_et.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_harem.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icesum.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_jentoft.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mmlu.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mmlu_lv.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_multi_wiki_qa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_no_cola.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norec.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norne.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norquad.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nqii.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_publico.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_rrn.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sb10k.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scala.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_schibsted.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_it.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sst5.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_suc3.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_swedn.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_swerec.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikiann_lv.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_winogrande_et.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/versioning.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/conftest.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmarker.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_callbacks.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_cli.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_constants.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_data_loading.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_data_models.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_enums.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_exceptions.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_finetuning.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_generation.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_languages.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_cache.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_config.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_loading.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_scores.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_tasks.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_tokenization_utils.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_types.py +0 -0
- {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_utils.py +0 -0
{euroeval-16.0.0 → euroeval-16.0.1}/CHANGELOG.md

@@ -10,6 +10,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

+## [v16.0.1] - 2025-09-07
+### Fixed
+- Fixed a bug causing encoders to fail when evaluating on the Exam-et dataset.
+- Previously we would abort an evaluation completely if the model outputted a single
+  invalid output on a classification task. As individual samples rarely have a great
+  influence on the overall score, we now just assign the closest label to the sample and
+  continue the evaluation. This will be logged to the user, so that they are aware of
+  this. Some tasks are more sensitive to individual samples, such as European values,
+  where we still abort the evaluation if a single sample is invalid.
+- Fixed a bug where logprobs were not used for classification tasks when evaluating
+  generative models, due to the fact that we raised the number of generated tokens to 10
+  for such tasks. This did not affect the results, but it meant that some evaluations
+  failed.
+- Now includes FlashInfer as a dependency, as it is required by vLLM.
+- Changed the choices in European values to use letters, like the other multiple
+  choice tasks, rather than numbers. Aside from ensuring consistency, we also avoid the
+  issue where '10' and '1' often both have the same first token ('1'), causing us not to
+  be able to use logprobs to determine the answer.
+
+
 ## [v16.0.0] - 2025-09-05
 ### Added
 - Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
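The "closest label" fallback described in the changelog can be pictured with a small standalone sketch. This is illustrative only and not EuroEval's actual implementation (which lives in the task utilities shown further down): an invalid classification output is mapped to the most similar allowed label instead of aborting the run.

```python
import difflib

def map_to_closest_label(output: str, labels: list[str]) -> str:
    """Map a raw model output onto the most similar allowed label (sketch only)."""
    cleaned = output.strip().lower()
    if cleaned in labels:
        return cleaned
    # Fall back to the allowed label with the highest string similarity to the output
    return difflib.get_close_matches(cleaned, labels, n=1, cutoff=0.0)[0]

print(map_to_closest_label("positivee", ["positive", "negative", "neutral"]))  # positive
```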
{euroeval-16.0.0 → euroeval-16.0.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.0.0
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

{euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/README.md

@@ -14,9 +14,8 @@ Each language has two leaderboards:
 - **Generative Leaderboard**: This leaderboard shows the performance of models that can
   generate text. These models have been evaluated on _all_ [tasks](/tasks), both NLU and
   NLG.
-- **NLU Leaderboard**: This leaderboard shows the performance of models that can
-  understand text,
-  the NLU tasks only.
+- **NLU Leaderboard**: This leaderboard shows the performance of models that can
+  understand text, which includes both generative and non-generative models.


 ## 📊 How to Read the Leaderboards
@@ -26,15 +25,14 @@ model across all the tasks in the leaderboard. The lower the rank, the better th

 The columns that follow the rank columns are metadata about the model:

-- `Parameters`: The total number of parameters in the model, in millions.
-- `Vocabulary`: The size of the model's vocabulary, in thousands.
-- `Context`: The maximum number of tokens that the model can process at a time.
-- `Speed`: The inference time of the model - see more [here](/tasks/speed).
 - `Type`: The type of model:
   - 🔍 indicates that it is an encoder model (e.g., BERT)
   - 🧠 indicates that it is a base generative model (e.g., GPT-2)
   - 📝 indicates that it is an instruction-tuned model (e.g., ChatGPT)
   - 🤔 indicates that it is a reasoning model (e.g., o1)
+- `Parameters`: The total number of parameters in the model, in millions.
+- `Vocabulary`: The size of the model's vocabulary, in thousands.
+- `Context`: The maximum number of tokens that the model can process at a time.
 - `Commercial`: Whether the model can be used for commercial purposes. See [here](/faq)
   for more information.
 - `Merge`: Whether the model is a merge of other models.
@@ -47,11 +45,3 @@ the given model on each of the datasets.
 To read more about the individual datasets, see the [datasets](/datasets) page. If
 you're interested in the methodology behind the benchmark, see the
 [methodology](/methodology) page.
-
-/// tab | Generative Scatter Plot
-
-///
-
-/// tab | NLU Scatter Plot
-
-///
{euroeval-16.0.0 → euroeval-16.0.1}/makefile

@@ -144,3 +144,6 @@ publish-major: install check bump-major publish ## Publish a major version
 publish-minor: install check bump-minor publish ## Publish a minor version

 publish-patch: install check bump-patch publish ## Publish a patch version
+
+loc: ## Count the number of lines of code in the project
+	@git ls-files | grep '\.py' | xargs wc -l | tail -n 1
{euroeval-16.0.0 → euroeval-16.0.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "16.0.0"
+version = "16.0.1"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -46,11 +46,13 @@ dependencies = [
 generative = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
     "vllm>=0.10.1; platform_system == 'Linux'",
+    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 all = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
     "vllm>=0.10.1; platform_system == 'Linux'",
+    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]

{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/__init__.py

@@ -13,6 +13,7 @@ from termcolor import colored

 # Block specific warnings before importing anything else, as they can be noisy
 warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
@@ -101,6 +102,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"


+# Use the FlashInfer flash-attention backend for vLLM
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
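Forcing `VLLM_ATTENTION_BACKEND=FLASHINFER` at import time assumes the new `flashinfer-python` dependency is installed. A downstream script on a machine without FlashInfer could guard the override itself; the snippet below is a hypothetical user-side sketch, not something the package does.

```python
import importlib.util
import os

# Only force the FlashInfer attention backend if the package is importable;
# otherwise let vLLM pick its default backend. (Hypothetical guard.)
if importlib.util.find_spec("flashinfer") is not None:
    os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLASHINFER")
```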
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/vllm.py

@@ -337,31 +337,6 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)

-        structured_generation_schema = None
-        if self.dataset_config.task.uses_structured_output:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +357,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 "error was. Skipping this evaluation."
             )

-
-        if
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
             guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
             guided_decoding = GuidedDecodingParams(
@@ -392,8 +388,17 @@ class VLLMModel(HuggingFaceEncoderModel):
                     for label in self.dataset_config.labels
                 ]
             )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
         else:
             guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )

         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -439,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -466,12 +472,19 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
+
+                # If we have already tried truncating the prompts a few times, then
+                # we truncate a bit more aggressively
+                extra_truncation = 50 * truncation_attempts
+                truncation_attempts += 1
+
                 tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
                         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-                        - max_tokens
+                        - max_tokens
+                        - extra_truncation,
                         0,
                     ),
                 )
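The refactor above builds `GuidedDecodingParams(choice=...)` for classification datasets so that generation is constrained to the allowed labels while logprobs remain usable. Below is a minimal standalone sketch of that idea with vLLM; the model id and labels are placeholders, and the import paths are as in recent vLLM releases and may differ in other versions.

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

labels = ["positiv", "negativ", "neutral"]  # hypothetical prompt-language labels
sampling_params = SamplingParams(
    max_tokens=10,
    logprobs=10,  # keep logprobs so the first generated token can identify the label
    guided_decoding=GuidedDecodingParams(choice=labels),
)
llm = LLM(model="your-org/your-model")  # placeholder model id
outputs = llm.generate(["Anmeldelse: 'Fantastisk film!'\nSentiment:"], sampling_params)
print(outputs[0].outputs[0].text)  # one of the three labels
```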
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/constants.py

@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"

 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/data_models.py

@@ -125,6 +125,12 @@ class Task:
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
+        allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str
@@ -148,6 +154,7 @@ class Task:
             GenerativeType.REASONING,
         ]
     )
+    allow_invalid_model_outputs: bool = True

     def __post_init__(self) -> None:
         """Post-initialisation checks."""
@@ -430,7 +437,6 @@ class DatasetConfig:
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property
@@ -443,7 +449,6 @@ class DatasetConfig:
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property
@@ -456,9 +461,6 @@ class DatasetConfig:
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -519,15 +521,16 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.
@@ -539,16 +542,17 @@ class DatasetConfig:
         else:
             sep_word = main_language.or_separator

-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
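The new `get_labels_str` builds a natural-language enumeration of the (optionally restricted) prompt labels. A standalone sketch of the same logic with made-up Danish labels follows; it is not the actual `DatasetConfig` class, just the core transformation.

```python
labels = ["correct", "incorrect"]                     # hypothetical English-side labels
prompt_label_mapping = {"correct": "ja", "incorrect": "nej"}
or_separator = "eller"                                # hypothetical Danish 'or'

# Map the English labels to their prompt-language equivalents, dropping duplicates
mapped: list[str] = []
for english_label in labels:
    label = prompt_label_mapping.get(english_label)
    if label is not None and label not in mapped:
        mapped.append(label)

quoted = [f"'{label}'" for label in mapped]
labels_str = (
    f"{', '.join(quoted[:-1])} {or_separator} {quoted[-1]}" if len(quoted) > 1 else quoted[0]
)
print(labels_str)  # 'ja' eller 'nej'
```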
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/danish.py

@@ -84,7 +84,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )


@@ -159,7 +158,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )

@@ -172,6 +170,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/generation_utils.py

@@ -9,7 +9,7 @@ import typing as t
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
 from .tokenization_utils import apply_chat_template
-from .utils import log_once
+from .utils import extract_multiple_choice_labels, log_once

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -230,18 +230,49 @@ def apply_prompt(
         return dataset_config.prompt_template.format(**kwargs), ""

     match dataset_config.task.task_group:
-        case
-
-
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]

@@ -259,6 +290,7 @@ def apply_prompt(
             ]

         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()

             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -280,12 +312,15 @@ def apply_prompt(
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]
@@ -375,4 +410,7 @@ def apply_prompt(
         for new_prompt, _ in new_sections
     ]

+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
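`extract_multiple_choice_labels` is imported from `utils.py`, whose diff is not shown in this section, so its exact behaviour is not visible here. Judging from how it is called, it presumably returns only the option letters that actually occur in a given prompt; the function below is a guess at that behaviour, for illustration only.

```python
import re

def extract_multiple_choice_labels(prompt: str, candidate_labels: list[str]) -> list[str]:
    """Keep only the candidate option letters that appear as 'x. ...' lines (guess)."""
    present = {
        match.group(1)
        for match in re.finditer(r"^([a-z])\.\s", prompt, flags=re.MULTILINE)
    }
    return [label for label in candidate_labels if label in present]

prompt = "Question?\nChoices:\na. first option\nb. second option\n"
print(extract_multiple_choice_labels(prompt, ["a", "b", "c", "d"]))  # ['a', 'b']
```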
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/pipeline.py

@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)


+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
 class PipelineMetric(Metric):
     """Load a scikit-learn pipeline and use it to get scores from the predictions."""

@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
         pipeline_repo: str,
         pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
         pipeline_file_name: str = "pipeline.pkl",
-        preprocessing_fn:
+        preprocessing_fn: PreprocessingFunction | None = None,
         postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
     ) -> None:
         """Initialise the pipeline transform metric.
@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
         """
         if self.pipeline is None:
             self.pipeline = self._download_pipeline()
-
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
         return self.pipeline_scoring_function(self.pipeline, predictions)

     def _download_pipeline(self) -> "Pipeline":
@@ -133,13 +157,18 @@ class PipelineMetric(Metric):
 ### European Values Metric ###


-def european_values_preprocessing_fn(
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
     """Preprocess the model predictions for the European Values metric.

     Args:
         predictions:
             The model predictions, a sequence of integers representing the predicted
             choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.

     Returns:
         The preprocessed model predictions, a sequence of integers representing the
@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     num_questions = 53
     num_phrasings_per_question = 5

+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
     assert len(predictions) % num_questions == 0, (
         f"The number of predictions ({len(predictions)}) is not a multiple of "
         f"{num_questions}, which is required for the European Values metric."
@@ -171,7 +211,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     # Shape: (num_questions, num_phrasings_per_question)
     arr = np.array(
         [
-
+            integer_predictions[i : i + num_phrasings_per_question]
             for i in range(0, len(predictions), num_phrasings_per_question)
         ]
     )
@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)

     # Convert the array to a list
-
+    integer_predictions = arr.tolist()

     # Some of the questions are categorical and we're only interested in whether the
     # model chooses a specific choice or not. This mapping takes the question index
@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     }

     # Map the predictions to the choices we're interested in
-
+    integer_predictions = list(integer_predictions)
     for question_idx, choice in question_choices.items():
-
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )

-    return
+    return integer_predictions


 def european_values_scoring_function(

{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_cache.py

@@ -10,7 +10,9 @@ from dataclasses import asdict

 from tqdm.auto import tqdm

+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+from .utils import log_once

 if t.TYPE_CHECKING:
     from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
                 # the indices of the top scores, to save space. Further, we only store
                 # the scores if the generated sequence is shorter than the maximum
                 # length
-                if
+                if (
+                    model_output.scores is not None
+                    and self.max_generated_tokens
+                    <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+                ):
                     assert model_output.scores is not None
                     scores = model_output.scores[sample_idx]
                 else:
+                    if model_output.scores is not None:
+                        log_once(
+                            "The generated sequence is longer than the maximum "
+                            "length for classification. Not caching the scores.",
+                            level=logging.DEBUG,
+                        )
                     scores = None
                 self[model_input] = SingleGenerativeModelOutput(
                     sequence=model_output.sequences[sample_idx], scores=scores
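The reshaping and majority-vote step in `european_values_preprocessing_fn` is easier to see with toy numbers (two questions with three phrasings each, instead of the real 53 and 5):

```python
import numpy as np

integer_predictions = [1, 1, 2, 3, 3, 3]  # toy data: 2 questions x 3 phrasings
num_phrasings_per_question = 3

# Shape: (num_questions, num_phrasings_per_question)
arr = np.array(
    [
        integer_predictions[i : i + num_phrasings_per_question]
        for i in range(0, len(integer_predictions), num_phrasings_per_question)
    ]
)

# Majority vote over the phrasings of each question
majority = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
print(majority.tolist())  # [1, 3]
```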
{euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/multiple_choice_classification.py
RENAMED

@@ -126,7 +126,7 @@ def prepare_examples(
     ):
         choice_idxs.append(idx)

-    choices = [sections[idx] for idx in choice_idxs]
+    choices = [sections[idx] for idx in reversed(choice_idxs)]

     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
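The zip over the full lowercase alphabet pairs each extracted choice with its option letter and marks the gold one; a toy run with made-up choices shows the resulting label vector:

```python
choices = ["a. Oslo", "b. Bergen", "c. Stavanger"]  # made-up extracted choice sections
gold_letter = "b"

labels = [
    int(choice.startswith(f"{letter}. ") and letter == gold_letter)
    for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
]
print(labels)  # [0, 1, 0]
```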