EuroEval 15.16.0.tar.gz → 16.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- {euroeval-15.16.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +2 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/.github/workflows/ci.yaml +22 -14
- {euroeval-15.16.0 → euroeval-16.0.0}/.pre-commit-config.yaml +4 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/CHANGELOG.md +91 -36
- {euroeval-15.16.0 → euroeval-16.0.0}/PKG-INFO +11 -14
- {euroeval-15.16.0 → euroeval-16.0.0}/README.md +2 -1
- euroeval-16.0.0/docs/datasets/estonian.md +544 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/icelandic.md +11 -11
- euroeval-16.0.0/docs/datasets/latvian.md +536 -0
- euroeval-16.0.0/docs/leaderboards/Monolingual/portuguese.md +23 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/README.md +1 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/makefile +2 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/pyproject.toml +11 -17
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/__init__.py +3 -7
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_config_factory.py +3 -7
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/base.py +35 -19
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/fresh.py +24 -19
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/hf.py +136 -154
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/litellm.py +190 -110
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/vllm.py +161 -114
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmarker.py +49 -22
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/cli.py +3 -3
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/constants.py +13 -15
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/data_loading.py +33 -28
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/data_models.py +53 -7
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/__init__.py +2 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/danish.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/dutch.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/english.py +38 -1
- euroeval-16.0.0/src/euroeval/dataset_configs/estonian.py +95 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/faroese.py +38 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/finnish.py +39 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/french.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/german.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/icelandic.py +39 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/italian.py +38 -1
- euroeval-16.0.0/src/euroeval/dataset_configs/latvian.py +81 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/norwegian.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/portuguese.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/spanish.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/swedish.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/enums.py +0 -6
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/finetuning.py +6 -6
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/generation.py +25 -14
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/generation_utils.py +46 -14
- euroeval-16.0.0/src/euroeval/languages.py +966 -0
- euroeval-16.0.0/src/euroeval/metrics/__init__.py +6 -0
- euroeval-16.0.0/src/euroeval/metrics/base.py +76 -0
- euroeval-16.0.0/src/euroeval/metrics/huggingface.py +192 -0
- euroeval-16.0.0/src/euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval-16.0.0/src/euroeval/metrics/pipeline.py +234 -0
- euroeval-16.0.0/src/euroeval/metrics/speed.py +51 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/multiple_choice.py +23 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/named_entity_recognition.py +65 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/reading_comprehension.py +42 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/sentiment_classification.py +46 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/summarization.py +24 -4
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/scores.py +7 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/speed_benchmark.py +6 -6
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/question_answering.py +35 -28
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/sequence_classification.py +96 -23
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/text_to_text.py +7 -3
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/token_classification.py +47 -75
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/tasks.py +31 -6
- euroeval-16.0.0/src/euroeval/tokenization_utils.py +586 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/utils.py +118 -34
- euroeval-16.0.0/src/scripts/create_copa_lv.py +143 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_danish_citizen_tests.py +3 -2
- euroeval-16.0.0/src/scripts/create_err_news.py +83 -0
- euroeval-16.0.0/src/scripts/create_estner.py +115 -0
- euroeval-16.0.0/src/scripts/create_estonian_valence.py +86 -0
- euroeval-16.0.0/src/scripts/create_european_values.py +283 -0
- euroeval-16.0.0/src/scripts/create_exam_et.py +136 -0
- euroeval-16.0.0/src/scripts/create_fullstack_ner.py +248 -0
- euroeval-16.0.0/src/scripts/create_grammar_et.py +74 -0
- euroeval-16.0.0/src/scripts/create_latvian_lsm_summary.py +92 -0
- euroeval-16.0.0/src/scripts/create_latvian_twitter_sentiment.py +109 -0
- euroeval-16.0.0/src/scripts/create_mmlu_lv.py +263 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_multi_wiki_qa.py +1 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_scala.py +4 -0
- euroeval-16.0.0/src/scripts/create_wikiann_lv.py +116 -0
- euroeval-16.0.0/src/scripts/create_winogrande_et.py +90 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/load_ud_pos.py +36 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/conftest.py +2 -19
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_hf.py +10 -13
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmarker.py +0 -44
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_cli.py +2 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_data_loading.py +15 -8
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_data_models.py +2 -2
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_scores.py +1 -1
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_tokenization_utils.py +7 -7
- {euroeval-15.16.0 → euroeval-16.0.0}/uv.lock +1335 -2204
- euroeval-15.16.0/src/euroeval/human_evaluation.py +0 -738
- euroeval-15.16.0/src/euroeval/languages.py +0 -206
- euroeval-15.16.0/src/euroeval/metrics.py +0 -470
- euroeval-15.16.0/src/euroeval/tokenization_utils.py +0 -498
- euroeval-15.16.0/tests/test_human_evaluation.py +0 -8
- {euroeval-15.16.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/.gitignore +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/CITATION.cff +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/CONTRIBUTING.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/Dockerfile.cuda +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/LICENSE +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/CNAME +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/danish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/dutch.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/english.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/faroese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/finnish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/french.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/german.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/italian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/portuguese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/spanish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/datasets/swedish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/faq.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/gfx/favicon.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/methodology.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/python-package.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/speed.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/docs/tasks/summarization.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/gfx/euroeval.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/gfx/euroeval.xcf +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/gfx/scandeval.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/mkdocs.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/model_config.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/euroeval/types.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/constants.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_arc.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_dane.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_fone.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_harem.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_norec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_norne.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_publico.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_squad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/src/scripts/versioning.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_callbacks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_constants.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_enums.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_exceptions.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_finetuning.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_generation.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_languages.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_model_cache.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_model_config.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_model_loading.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_tasks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_types.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.0}/tests/test_utils.py +0 -0
.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml

@@ -25,12 +25,14 @@ body:
         - label: Danish
         - label: Dutch
         - label: English
+        - label: Estonian
         - label: Faroese
         - label: Finnish
         - label: French
         - label: German
         - label: Icelandic
         - label: Italian
+        - label: Latvian
         - label: Norwegian (Bokmål or Nynorsk)
         - label: Portuguese
         - label: Spanish

.github/ISSUE_TEMPLATE/model_evaluation_request.yaml

@@ -21,7 +21,8 @@ body:
         - label: Romance languages (French, Italian, Portuguese, Spanish)
         - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
         - label: West Germanic languages (Dutch, English, German)
-        - label: Finnish
+        - label: Finnic languages (Estonian, Finnish)
+        - label: Latvian
     validations:
       required: true
   - type: dropdown

.github/workflows/ci.yaml

@@ -22,16 +22,19 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
         with:
           persist-credentials: false
-
+          ref: main
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: "3.11"
-
-
-
-        shell: bash
+
+      - name: Run pre-commit hooks
+        uses: pre-commit/action@v3.0.1

   pytest-linux:
     if: github.event.pull_request.draft == false

@@ -40,21 +43,22 @@ jobs:
       pull-requests: write
     strategy:
       matrix:
-        python-version: ["3.
+        python-version: ["3.11", "3.12", "3.13"]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
         with:
           persist-credentials: false
+          ref: main

       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
         with:
           enable-cache: false
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

@@ -79,21 +83,25 @@ jobs:
       pull-requests: write
     runs-on: macos-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
+        with:
+          persist-credentials: false
+          ref: main

       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
-        run: uv run pytest
+        run: uv run pytest -vvv
         env:
           HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}

.pre-commit-config.yaml

@@ -10,18 +10,21 @@ repos:
       - id: trailing-whitespace
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.
+    rev: v0.12.12
    hooks:
       - id: ruff
         args:
           - --fix
           - --unsafe-fixes
           - --exit-non-zero-on-fix
+          - --no-cache
         types_or:
           - python
           - pyi
           - jupyter
       - id: ruff-format
+        args:
+          - --no-cache
         types_or:
           - python
           - pyi

CHANGELOG.md

@@ -10,6 +10,62 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v16.0.0] - 2025-09-05
+### Added
+- Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
+  Latvian Twitter Sentiment, the linguistic acceptability dataset ScaLA-lv, the named
+  entity recognition datasets FullStack-NER-lv and WikiANN-lv, the reading comprehension
+  dataset MultiWikiQA, the knowledge dataset MMLU-lv, the common-sense reasoning
+  dataset COPA-lv, and the summarisation dataset LSM.
+- Added support for Estonian 🇪🇪! It includes the sentiment classification dataset
+  Estonian Valence, the linguistic acceptability datasets Grammar-et and ScaLA-et, the
+  named entity recognition dataset EstNER, the reading comprehension dataset
+  MultiWikiQA-et, the summarisation dataset ERRNews, the knowledge dataset Exam-et,
+  and the common-sense reasoning dataset Winogrande-et. This was contributed by
+  @slowwavesleep ✨
+- It is now possible to evaluate how well a model adheres to European values! 🇪🇺 This
+  probes 53 questions from the European values survey, which have been chosen based on
+  an optimisation procedure that maximises agreement across the EU. We then measure how
+  well the model's answers align with the distribution of answers across the EU, using a
+  tree-based kernel density estimation. This can only be used zero-shot, and only with
+  instruction-based decoder models (including reasoning models).
+
+### Changed
+- When evaluating classification tasks, we now force the model to output one of the
+  labels. This is done directly with open models, and via a JSON schema for API
+  models. This won't change the results for existing tasks, as logprobs are used, but
+  it was required to measure the European values.
+- Updated the `vllm` dependency to `>=0.10.1`, which includes GPT-OSS support.
+- Updated the `numpy` dependency to `>=2.0.0`, as the previous dependency clash no
+  longer applies.
+- Updated the `transformers` dependency to `>=4.56.0`, which includes support for more
+  models.
+- Now requires Python >=3.11, as Python 3.10 does not support structured generation
+  with a dynamic set of choices (`Literal[*list_of_choices]` is not supported).
+
+### Fixed
+- Enabled support for evaluating Mistral models with their custom `mistral-common`
+  tokeniser, which includes all recent Mistral models. Note that we currently assume
+  that all of these models are instruction-tuned decoder models (which _is_ true
+  currently), which can lead to errors in case they publish different types of models
+  in the future.
+- Now disables the `seed` parameter if the API inference model does not support it,
+  which previously prevented evaluating some models.
+- Now correctly detects an API inference model as non-existing, even if LiteLLM *does*
+  see it as existing. We now have an additional check during evaluation to ensure this.
+- Catch an `ImportError` that sometimes happens during shutdown, when finishing the
+  evaluation of a vLLM model.
+- Now uses `litellm>=1.75.6`, which fixes an issue related to evaluating GPT-5 models
+  using Ollama.
+- Now always uses the `multiprocessing` backend when evaluating vLLM models, rather
+  than reverting to `ray` when using multiple GPUs, as `ray` led to the evaluations of
+  several models freezing.
+- Now does not require the user to be logged in to Hugging Face to benchmark models on
+  the Hugging Face Hub, if the models are public.
+
+### Removed
+- Removed support for human evaluation, as it was not actively maintained and not used.
+
+
 ## [v15.16.0] - 2025-08-12
 ### Added
 - Added metadata for GPT-5 models.

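The structured-generation change above is also why the Python floor moved to 3.11. As a minimal sketch of the idea (not EuroEval's actual implementation; the label set here is hypothetical), a JSON schema that restricts an API model's output to a fixed label set can be built dynamically with `Literal[*labels]`, which is only valid syntax from Python 3.11 onwards:

```python
from typing import Literal

from pydantic import create_model

# Hypothetical label set for a sentiment classification task.
labels = ["positive", "neutral", "negative"]

# `Literal[*labels]` star-unpacks the choices inside a subscript, a
# Python 3.11+ feature, hence the raised Python requirement.
Answer = create_model("Answer", label=(Literal[*labels], ...))

# The resulting JSON schema constrains the field to the allowed labels
# and can be passed to any API that supports structured outputs.
print(Answer.model_json_schema()["properties"]["label"])
# roughly: {'enum': ['positive', 'neutral', 'negative'], 'title': 'Label', 'type': 'string'}
```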
@@ -32,7 +88,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the common-sense reasoning dataset GoldenSwag for the following
   languages: Danish, German, Spanish, Finnish, French, Italian, Dutch, Swedish.
   The datasets are unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now allows metadata to be included in metrics, allowing more flexibility when

@@ -88,7 +144,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   acceptability dataset ScaLA-pt. The machine translated ones include the sentiment
   classification dataset SST-2, the multiple choice reading comprehension dataset BoolQ,
   the knowledge dataset MMLU, and the common-sense reasoning dataset GoldenSwag. This
-  was contributed by
+  was contributed by @duarteocarmo ✨
 - Added `--gpu-memory-utilization` argument (`gpu_memory_utilization` in the
   `Benchmarker` API), which can be lowered in case the user is experiencing OOM errors
   when evaluating models. The default is 0.9 (same as previously), which means that vLLM

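For illustration, lowering the utilisation from the Python API might look like the following sketch. The constructor argument is the one named in the entry above; the `benchmark` call and the model ID are placeholders whose exact form may differ between versions:

```python
from euroeval import Benchmarker

# Let vLLM claim at most 80% of GPU memory instead of the default 0.9,
# which can help when evaluations hit out-of-memory errors.
benchmarker = Benchmarker(gpu_memory_utilization=0.8)
benchmarker.benchmark(model="<model-id>")  # placeholder model ID
```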
@@ -108,11 +164,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the English knowledge dataset Life in the UK, which has been added as an
   official dataset, replacing the existing English knowledge dataset MMLU, which in turn
   has been marked as unofficial now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Norwegian knowledge dataset Idioms-no, which is a multiple-choice question
   dataset where the alternative answers have been generated using GPT-4o. This has been
   added as an official dataset, and was contributed by
-
+  @oliverkinch ✨
 - Added new `LLMAsAJudgeMetric`, which allows evaluating the performance of a model with
   another judge model. This is useful for evaluating models in a reference-free manner,
   or if the metric is sufficiently complex. It is currently not used in any task, but

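The judge pattern behind `LLMAsAJudgeMetric` can be illustrated with a small sketch. Everything below is hypothetical (the helper names, the grading prompt, and the `judge` client are illustrative, not EuroEval's API): a judge model grades each candidate output, and the metric is the mean grade.

```python
# Hypothetical illustration of the LLM-as-a-judge pattern, not EuroEval's API.

def judge_score(judge, prediction: str, instruction: str) -> float:
    """Ask a judge model to grade a prediction, normalised to [0, 1]."""
    prompt = (
        "On a scale from 0 to 10, how well does the following answer "
        f"satisfy the instruction?\nInstruction: {instruction}\n"
        f"Answer: {prediction}\nReply with a single number."
    )
    reply = judge.generate(prompt)  # `judge` is any text-generation client
    return min(max(float(reply.strip()) / 10.0, 0.0), 1.0)

def llm_as_a_judge_metric(judge, predictions, instructions) -> float:
    """Reference-free metric: the mean judge grade over all predictions."""
    scores = [
        judge_score(judge, pred, inst)
        for pred, inst in zip(predictions, instructions)
    ]
    return sum(scores) / len(scores)
```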
@@ -216,11 +272,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
   unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now uses asyncronous requests when dealing with API models, speeding up the generation
-  immensely. This was contributed by
+  immensely. This was contributed by @mathiasesn ✨

 ### Fixed
 - Add HellaSwag-fi back in, as the issue with the labels in the test split has been

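The speed-up from asynchronous requests comes from overlapping the network round-trips, roughly as in this sketch (the `fake_api_call` coroutine stands in for a real HTTP request; none of these names are EuroEval's):

```python
import asyncio

async def fake_api_call(prompt: str) -> str:
    """Stand-in for one HTTP request to a hosted model API."""
    await asyncio.sleep(0.5)  # network + generation latency
    return f"completion for {prompt!r}"

async def main() -> None:
    prompts = [f"prompt {i}" for i in range(16)]
    # All 16 requests are in flight at once, so the batch takes ~0.5s
    # instead of ~8s when the requests are sent one after the other.
    completions = await asyncio.gather(*(fake_api_call(p) for p in prompts))
    print(len(completions), "completions")

if __name__ == "__main__":
    asyncio.run(main())
```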
@@ -272,7 +328,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [XL-Sum-fi](https://huggingface.co/datasets/TurkuNLP/xlsum-fi), and the
   common-sense reasoning dataset
   [HellaSwag-fi](https://huggingface.co/datasets/Finnish-NLP/hellaswag-fi-google-translate).
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added metadata for GPT-4.1 and Grok-3 models.
 - Marked Gemini-2.5-flash and Grok-3-mini as reasoning models, giving them more tokens
   to think.

@@ -315,7 +371,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v15.6.1] - 2025-04-14
 ### Changed
 - Added more info about SQuAD-nl in the documentation. This was contributed by
-
+  @Rijgersberg ✨

 ### Fixed
 - The "E" option for the Norwegian NorCommonSenseQA dataset was not included in the

@@ -343,7 +399,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Uniformised the prompt templates used for each task, so that they are more
   consistent across tasks. Evaluation tests across different model types and sizes show
   no significant performance difference between the new and old templates. This was
-  contributed by
+  contributed by @viggo-gascou ✨

 ### Fixed
 - Avoid duplicate error messages when a rate limit occurs.

@@ -372,7 +428,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
   has been resolved.
 - Added overall progress indicator during evaluation. This was contributed by
-
+  @mathiasesn ✨

 ### Changed
 - Now does not use logprobs in text classification tasks with Google VertexAI models, as

@@ -411,9 +467,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
-
+  @Alkarex ✨
 - Evaluating a specific model revision did not work for adapter models, as there was a
   confusion between the revision of the adapter and the revision of the base model. We
   now use the revision for the adapter and use the latest revision for the base model.

@@ -439,7 +495,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `HuggingFaceHubDown` exception.
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
   the splits were made by considering the original train/validation/test splits.

@@ -464,7 +520,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
   dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
   named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
-  was contributed by
+  was contributed by @oliverkinch ✨
 - Now extracts number of parameters and context length for Ollama models, using the
   `ollama` package. Vocabulary size is currently not available in the `ollama`
   package, so this is not extracted for Ollama models. For this reason, the `ollama`

@@ -517,7 +573,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
   recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
   unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
-  contributed by
+  contributed by @viggo-gascou ✨
 - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
   Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
   into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces

@@ -578,7 +634,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
   Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
   security on the system running the evaluations, if this is necessary. This was
-  contributed by
+  contributed by @Mikeriess ✨


 ### Fixed

@@ -607,19 +663,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [personal-sum](https://github.com/SmartmediaAI/PersonalSum). It has been split into
   121 / 64 / 256 samples for train / validation / test, respectively, and is set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Jentoft dataset - a linguistic acceptability dataset which was published in
   [this Master's thesis](https://www.duo.uio.no/handle/10852/103885) by Matias Jentoft.
   The original dataset consists of 85,771 / 10,827 / 10487 samples for training,
   validation and test, respectively. We use a split of 1,024 / 256 / 2,048 samples for
   training, validation and test, respectively. In each split, the distribution of
   `correct` and `incorrect` is 50/50. This dataset has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the dataset icelandic-knowledge, which is derived from the IcelandicQA dataset,
   reformatted as a knowledge dataset with GPT-4o generated candidate answers. The split
   is given by 845 / 128 / 1024 for train, val, and test, respectively. It is marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Changed the instruction prompts to all text classification tasks by specifying

@@ -657,8 +713,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [OrangeSum](https://hf.co/datasets/EdinburghNLP/orange_sum).
 - Added support for evaluating local models again, which supports models stored in the
   Hugging Face format with a Hugging Face model configuration file (`config.json`) in
-  the model directory. This was contributed by
-
+  the model directory. This was contributed by @rlrs and
+  @peter-sk ✨

 ### Changed
 - Changed the Belebele splits, as there were too few training splits for evaluation on

@@ -878,7 +934,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset NO-Multi-QA-Sum (norglm-multi-qa). This dataset is part of the NLEBench
   Norwegian benchmarks. The answers from the original dataset have been rephrased with
   gpt-4o to contain the answer from the context. It has been marked as `unofficial` for
-  now. This was contributed by
+  now. This was contributed by @viggo-gascou ✨
 - Added the sentiment classification part of the Icelandic dataset Hotter and Colder,
   being a gold standard dataset. As no Icelandic sentiment classification dataset was
   included in the benchmark previously, this is now the official Icelandic sentiment

@@ -897,18 +953,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the summarisation part of the Norwegian NorGLM multi-task human annotated
   dataset NO-Multi-QA-Sum (`norglm-multi-sum`). This dataset is part of the NLEBench
   Norwegian benchmarks. It has been marked as `unofficial` for now. This was contributed
-  by
+  by @viggo-gascou ✨
 - Added `ice-linguistic` a linguistic acceptability dataset which is a subset of the
   Icelandic Linguistic Benchmarks dataset. It is a small dataset with 94 train
   samples, 32 validation samples, and 256 test samples, and has been marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added `icelandic-qa`, an Icelandic question answering dataset about Icelandic culture
   and history. The original dataset has 2000 samples, but only 375 of the samples have
   answers that are found in the context (exact match). An LLM has therefore been used to
   rephrase the answers and we now have 1683 samples where the answers are found in the
   context (531 train, 128 val, 1024 test). It has been set to `unofficial` for now. This
-  was contributed by
+  was contributed by @oliverkinch ✨

 ### Fixed
 - Small typo in prefix prompt used for few-shot evaluation of the English sentiment

@@ -920,21 +976,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v13.1.0] - 2024-10-31
 - Added `ice-ec` (a subset of the dataset) and `ice-ec-full` (the full dataset), an
   Icelandic linguistic acceptability dataset. It has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the Schibsted summarisation dataset, which contains summaries of published
   articles from Schibsted Media's Norwegian and Swedish newsrooms. The dataset has been
   split into two separate small datasets, `schibsted-sv` for Swedish and `schibsted-no`
   for Norwegian. Note that both of these datasets are really small (89 and 374 test
   samples in `schibsted-sv` and `schibsted-no`, respectively), and have been set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Icelandic summarisation dataset IceSum. IceSum is a collection of 1,000
   Icelandic news articles from mbl.is, which have been manually annotated with
   summaries. The dataset has been marked as unofficial, meaning that it will not be
   automatically included when benchmarking models, but can be included by specifying the
   dataset explicitly using the --dataset argument (or dataset argument if using the
   Benchmarker API). This was contributed by
-
+  @viggo-gascou ✨
 - Added the new Faroese reading comprehension dataset FoQA. This is now the default
   Faroese reading comprehension benchmark, as there was none previously.
 - Now supports evaluation of models with adapters. This requires that the model

@@ -1236,7 +1292,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ### Fixed
 - Move tensor to the correct device when benchmarking seq-to-seq models (#363). Thanks
-  to
+  to @ThomasKluiters for this contribution! :tada:
 - Deals with the case where an instruction tuned model does not use any special token
   at the end of the chat, such as `<|im_end|>`. This holds for, e.g., Qwen models.
 - Better auto-detection of pipeline tag for models on the Hugging Face Hub, in case the

@@ -1250,7 +1306,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` need to have been set, or
   alternatively through the `--azure-openai-api-key`, `--azure-openai-endpoint` and
   `--azure-openai-api-version` arguments. Thanks to
-
+  @BramVanroy for all the help regarding the
   implementation of this :tada:
 - We now use the new JSON mode for newer OpenAI models for the NER task, to ensure
   better JSON generation.

@@ -1761,7 +1817,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - A `--use-flash-attention` flag has been added, which enables Flash Attention 2.0,
   which is required by some models, such as Mistral-based ones. If `flash-attn` has not
   been installed then an informative error message will be raised. Thanks to
-
+  @peter-sk for this contribution! :tada:

 ### Changed
 - Now uses 8-bit AdamW whenever CUDA is available, as opposed to regular AdamW.

@@ -1781,7 +1837,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   OpenAI models. This currently happens automatically when specifying a generative
   model from the Hugging Face Hub, and with all OpenAI models.
 - Now stores model caches in separate directories, enabling parallel evaluations.
-  Thanks to
+  Thanks to @KennethEnevoldsen for this
   contribution! :tada:
 - Added `--device` argument to the CLI, which can be used to overwrite the automatic
   detection of device (CPU, CUDA GPU, MPS GPU, TPU) to use.

@@ -1850,7 +1906,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Now added support for benchmarking local models in the Hugging Face format (i.e.,
   saved with the `save_pretrained` method). This automatically detects the framework
   based on the file extension, but can also be set using the new `--model-framework`
-  argument. Thanks to
+  argument. Thanks to @peter-sk for implementing this!
   :tada:

 ### Fixed

@@ -2149,7 +2205,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Specific branches/commits/tags can now be benchmarked, using the `@`
   delimiter. For instance, `scandeval -m model_id@commit_hash` will benchmark
   the model with model ID `model_id`, stored at commit with hash `commit_hash`.
-  Thanks to
+  Thanks to @versae for contributing! :tada:


 ## [v2.2.0] - 2022-01-18

@@ -2159,8 +2215,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ## [v2.1.0] - 2022-01-17
 ### Added
-- Added support for `flax` models. Thanks to
-  [@versae](https://github.com/versae) for contributing! :tada:
+- Added support for `flax` models. Thanks to @versae for contributing! :tada:


 ## [v2.0.0] - 2022-01-07

PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version:
+Version: 16.0.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues

@@ -28,18 +28,19 @@ License: MIT License
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
 License-File: LICENSE
-Requires-Python: <4.0,>=3.
+Requires-Python: <4.0,>=3.11
 Requires-Dist: accelerate>=1.9.0
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
+Requires-Dist: cloudpickle>=3.1.1
 Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.75.6
 Requires-Dist: more-itertools>=10.5.0
-Requires-Dist: numpy
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0

@@ -49,27 +50,22 @@ Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
 Requires-Dist: sacremoses>=0.1.1
-Requires-Dist: scikit-learn
+Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: vllm>=0.10.
-Provides-Extra: human-evaluation
-Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
-Provides-Extra: test
-Requires-Dist: gradio>=4.26.0; extra == 'test'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

 <div align='center'>

@@ -223,17 +219,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval

README.md

@@ -149,17 +149,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval