EuroEval 15.16.0.tar.gz → 16.0.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +2 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/.github/workflows/ci.yaml +22 -14
- {euroeval-15.16.0 → euroeval-16.0.1}/.pre-commit-config.yaml +4 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/CHANGELOG.md +111 -36
- {euroeval-15.16.0 → euroeval-16.0.1}/PKG-INFO +13 -14
- {euroeval-15.16.0 → euroeval-16.0.1}/README.md +2 -1
- euroeval-16.0.1/docs/datasets/estonian.md +544 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/icelandic.md +11 -11
- euroeval-16.0.1/docs/datasets/latvian.md +536 -0
- euroeval-16.0.1/docs/leaderboards/Monolingual/portuguese.md +23 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/README.md +6 -16
- {euroeval-15.16.0 → euroeval-16.0.1}/makefile +5 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/pyproject.toml +13 -17
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/__init__.py +8 -7
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_config_factory.py +3 -7
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/base.py +35 -19
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/fresh.py +24 -19
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/hf.py +136 -154
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/litellm.py +190 -110
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/vllm.py +199 -139
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmarker.py +49 -22
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/cli.py +3 -3
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/constants.py +19 -15
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/data_loading.py +33 -28
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/data_models.py +73 -23
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/__init__.py +2 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/danish.py +35 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/dutch.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/english.py +38 -1
- euroeval-16.0.1/src/euroeval/dataset_configs/estonian.py +95 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/faroese.py +38 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/finnish.py +39 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/french.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/german.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/icelandic.py +39 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/italian.py +38 -1
- euroeval-16.0.1/src/euroeval/dataset_configs/latvian.py +81 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/norwegian.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/portuguese.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/spanish.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/swedish.py +38 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/enums.py +0 -6
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/finetuning.py +6 -6
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/generation.py +25 -14
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/generation_utils.py +90 -20
- euroeval-16.0.1/src/euroeval/languages.py +966 -0
- euroeval-16.0.1/src/euroeval/metrics/__init__.py +6 -0
- euroeval-16.0.1/src/euroeval/metrics/base.py +76 -0
- euroeval-16.0.1/src/euroeval/metrics/huggingface.py +192 -0
- euroeval-16.0.1/src/euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval-16.0.1/src/euroeval/metrics/pipeline.py +276 -0
- euroeval-16.0.1/src/euroeval/metrics/speed.py +51 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_cache.py +13 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/multiple_choice.py +23 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/named_entity_recognition.py +65 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/reading_comprehension.py +42 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/sentiment_classification.py +46 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/summarization.py +24 -4
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/scores.py +7 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/speed_benchmark.py +6 -6
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/question_answering.py +35 -28
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/sequence_classification.py +128 -42
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/text_to_text.py +7 -3
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/token_classification.py +59 -73
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/tasks.py +33 -6
- euroeval-16.0.1/src/euroeval/tokenization_utils.py +585 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/utils.py +150 -35
- euroeval-16.0.1/src/scripts/create_copa_lv.py +143 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danish_citizen_tests.py +3 -2
- euroeval-16.0.1/src/scripts/create_err_news.py +83 -0
- euroeval-16.0.1/src/scripts/create_estner.py +115 -0
- euroeval-16.0.1/src/scripts/create_estonian_valence.py +86 -0
- euroeval-16.0.1/src/scripts/create_european_values.py +289 -0
- euroeval-16.0.1/src/scripts/create_exam_et.py +136 -0
- euroeval-16.0.1/src/scripts/create_fullstack_ner.py +248 -0
- euroeval-16.0.1/src/scripts/create_grammar_et.py +74 -0
- euroeval-16.0.1/src/scripts/create_latvian_lsm_summary.py +92 -0
- euroeval-16.0.1/src/scripts/create_latvian_twitter_sentiment.py +109 -0
- euroeval-16.0.1/src/scripts/create_mmlu_lv.py +263 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_multi_wiki_qa.py +1 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scala.py +4 -0
- euroeval-16.0.1/src/scripts/create_wikiann_lv.py +116 -0
- euroeval-16.0.1/src/scripts/create_winogrande_et.py +90 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/load_ud_pos.py +36 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/conftest.py +2 -19
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_hf.py +10 -13
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmarker.py +0 -44
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_cli.py +2 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_data_loading.py +15 -8
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_data_models.py +2 -2
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_scores.py +1 -1
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_tokenization_utils.py +7 -7
- {euroeval-15.16.0 → euroeval-16.0.1}/uv.lock +1389 -2201
- euroeval-15.16.0/src/euroeval/human_evaluation.py +0 -738
- euroeval-15.16.0/src/euroeval/languages.py +0 -206
- euroeval-15.16.0/src/euroeval/metrics.py +0 -470
- euroeval-15.16.0/src/euroeval/tokenization_utils.py +0 -498
- euroeval-15.16.0/tests/test_human_evaluation.py +0 -8
- {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/.gitignore +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/CITATION.cff +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/CONTRIBUTING.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/Dockerfile.cuda +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/LICENSE +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/CNAME +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/danish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/dutch.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/english.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/faroese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/finnish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/french.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/german.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/italian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/portuguese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/spanish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/swedish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/faq.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/gfx/favicon.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/methodology.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/python-package.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/README.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/speed.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/summarization.md +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/gfx/euroeval.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/gfx/euroeval.xcf +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/gfx/scandeval.png +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/mkdocs.yaml +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_config.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/types.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/constants.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_arc.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dane.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fone.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_harem.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norne.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_publico.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/versioning.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_callbacks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_constants.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_enums.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_exceptions.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_finetuning.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_generation.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_languages.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_cache.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_config.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_loading.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_tasks.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_types.py +0 -0
- {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_utils.py +0 -0
.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml

@@ -25,12 +25,14 @@ body:
         - label: Danish
         - label: Dutch
         - label: English
+        - label: Estonian
         - label: Faroese
         - label: Finnish
         - label: French
         - label: German
         - label: Icelandic
         - label: Italian
+        - label: Latvian
         - label: Norwegian (Bokmål or Nynorsk)
         - label: Portuguese
         - label: Spanish
.github/ISSUE_TEMPLATE/model_evaluation_request.yaml

@@ -21,7 +21,8 @@ body:
         - label: Romance languages (French, Italian, Portuguese, Spanish)
         - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
         - label: West Germanic languages (Dutch, English, German)
-        - label: Finnish
+        - label: Finnic languages (Estonian, Finnish)
+        - label: Latvian
     validations:
       required: true
   - type: dropdown
.github/workflows/ci.yaml

@@ -22,16 +22,19 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
         with:
           persist-credentials: false
-
+          ref: main
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: "3.11"
-
-
-
-        shell: bash
+
+      - name: Run pre-commit hooks
+        uses: pre-commit/action@v3.0.1

   pytest-linux:
     if: github.event.pull_request.draft == false
@@ -40,21 +43,22 @@ jobs:
       pull-requests: write
     strategy:
       matrix:
-        python-version: ["3.
+        python-version: ["3.11", "3.12", "3.13"]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
         with:
           persist-credentials: false
+          ref: main

       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
         with:
           enable-cache: false
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &
@@ -79,21 +83,25 @@ jobs:
       pull-requests: write
     runs-on: macos-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
+        with:
+          persist-credentials: false
+          ref: main

       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
-        run: uv run pytest
+        run: uv run pytest -vvv
         env:
           HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
.pre-commit-config.yaml

@@ -10,18 +10,21 @@ repos:
       - id: trailing-whitespace
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.
+    rev: v0.12.12
     hooks:
       - id: ruff
         args:
           - --fix
           - --unsafe-fixes
           - --exit-non-zero-on-fix
+          - --no-cache
         types_or:
           - python
           - pyi
           - jupyter
       - id: ruff-format
+        args:
+          - --no-cache
         types_or:
           - python
           - pyi
CHANGELOG.md

@@ -10,6 +10,82 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).



+## [v16.0.1] - 2025-09-07
+### Fixed
+- Fixed a bug causing encoders to fail when evaluating on the Exam-et dataset.
+- Previously we would abort an evaluation completely if the model outputted a single
+  invalid output on a classification task. As individual samples rarely have a great
+  influence on the overall score, we now just assign the closest label to the sample and
+  continue the evaluation. This will be logged to the user, so that they are aware of
+  this. Some tasks are more sensitive to individual samples, such as European values,
+  where we still abort the evaluation if a single sample is invalid.
+- Fixed a bug where logprobs were not used for classification tasks when evaluating
+  generative models, due to the fact that we raised the number of generated tokens to 10
+  for such tasks. This did not affect the results, but it meant that some evaluations
+  failed.
+- Now includes FlashInfer as a dependency, as it is required by vLLM.
+- Changed the choices in European values to use letters, like the other multiple
+  choice tasks, rather than numbers. Aside from ensuring consistency, we also avoid the
+  issue where '10' and '1' often both have the same first token ('1'), causing us not to
+  be able to use logprobs to determine the answer.
+
+
+## [v16.0.0] - 2025-09-05
+### Added
+- Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
+  Latvian Twitter Sentiment, the linguistic acceptability dataset ScaLA-lv, the named
+  entity recognition datasets FullStack-NER-lv and WikiANN-lv, the reading comprehension
+  dataset MultiWikiQA, the knowledge dataset MMLU-lv, the common-sense reasoning
+  dataset COPA-lv, and the summarisation dataset LSM.
+- Added support for Estonian 🇪🇪! It includes the sentiment classification dataset
+  Estonian Valence, the linguistic acceptability datasets Grammar-et and ScaLA-et, the
+  named entity recognition dataset EstNER, the reading comprehension dataset
+  MultiWikiQA-et, the summarisation dataset ERRNews, the knowledge dataset Exam-et,
+  and the common-sense reasoning dataset Winogrande-et. This was contributed by
+  @slowwavesleep ✨
+- It is now possible to evaluate how much a model adheres to European values! 🇪🇺 This
+  probes 53 questions from the European values survey, which have been chosen based on
+  an optimisation procedure that maximises agreement across the EU. We then measure how
+  well the model's answers align with the distribution of answers across the EU, using a
+  tree-based kernel density estimation. This can only be used zero-shot, and only with
+  instruction-based decoder models (including reasoning models).
+
+### Changed
+- When evaluating classification tasks, we now force the model to output one of the
+  labels. This is done directly with open models, and done via a JSON schema for API
+  models. This won't change the results for existing tasks, as logprobs are used, but
+  this was required to measure the European values.
+- Updated `vllm` dependency to `>=0.10.1`, which includes GPT-OSS support.
+- Updated `numpy` dependency to `>=2.0.0`, as the previous clash is not applicable anymore.
+- Updated `transformers` dependency to `>=4.56.0`, which includes support for more
+  models.
+- Now requires Python >=3.11, as Python 3.10 does not support structured generation with
+  a dynamic set of choices (`Literal[*list_of_choices]` is not supported).
+
+### Fixed
+- Enabled support for evaluating Mistral models with their custom `mistral-common`
+  tokeniser, which includes all recent Mistral models. Note that we currently assume
+  that all of these models are instruction-tuned decoder models (which _is_ true
+  currently), which can lead to errors in case they publish different types of models in
+  the future.
+- Now disables the `seed` parameter if the API inference model does not support it,
+  which previously prevented evaluating some models.
+- Now correctly detects an API inference model as non-existing, even if LiteLLM *does*
+  see it as existing. We have an additional check during evaluation to ensure this now.
+- Catch an `ImportError` that sometimes happens when finishing the evaluation of a
+  vLLM model, during shutdown.
+- Now uses `litellm>=1.75.6`, which fixes an issue related to evaluation of GPT-5 models
+  using Ollama.
+- Now always uses the `multiprocessing` backend when evaluating vLLM models, rather than
+  reverting to `ray` when using multiple GPUs, as `ray` led to evaluations of several
+  models freezing.
+- Now does not require the user to be logged in to Hugging Face to benchmark models on
+  the Hugging Face Hub, if the models are public.
+
+### Removed
+- Removed support for human evaluation, as it was not actively maintained and not used.
+
+
 ## [v15.16.0] - 2025-08-12
 ### Added
 - Added metadata for GPT-5 models.
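A note on the closest-label fallback in the v16.0.1 entry above: the package declares `levenshtein` as a dependency (see the PKG-INFO diff further down), so a plausible reading is that the nearest valid label is picked by edit distance. The following is a minimal illustrative sketch under that assumption, not EuroEval's actual implementation:

import Levenshtein  # the `levenshtein` package listed in PKG-INFO


def closest_label(output: str, labels: list[str]) -> str:
    # Hypothetical fallback: instead of aborting the evaluation on an invalid
    # classification output, map it to the valid label with the smallest
    # edit distance.
    return min(labels, key=lambda lbl: Levenshtein.distance(output.lower(), lbl.lower()))


print(closest_label("positiv", ["positive", "negative", "neutral"]))  # -> positive

Under this sketch, a single malformed sample such as "positiv" is scored as "positive" and the run continues, matching the changelog's description that one invalid output no longer aborts the evaluation.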
@@ -32,7 +108,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Added the common-sense reasoning dataset GoldenSwag for the following
   languages: Danish, German, Spanish, Finnish, French, Italian, Dutch, Swedish.
   The datasets are unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now allows metadata to be included in metrics, allowing more flexibility when
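Stepping back to the European-values metric in the v16.0.0 entry above: the score is described as comparing a model's answers against the EU-wide answer distribution via tree-based kernel density estimation. A minimal sketch of that idea, assuming scikit-learn's `KernelDensity` with a k-d-tree backend stands in for the actual estimator; the survey data and bandwidth below are invented for illustration:

import numpy as np
from sklearn.neighbors import KernelDensity

# Toy stand-in for the EU-wide distribution of answers to one survey question.
survey_answers = np.array([[1.0], [2.0], [2.0], [2.0], [3.0], [3.0], [4.0]])

# Fit a Gaussian KDE backed by a k-d tree (scikit-learn's tree-based variant).
kde = KernelDensity(kernel="gaussian", bandwidth=0.5, algorithm="kd_tree")
kde.fit(survey_answers)

# score_samples returns the log-density; a higher density means the model's
# answer is more typical of the survey population.
model_answer = np.array([[2.0]])
alignment = float(np.exp(kde.score_samples(model_answer))[0])
print(f"Density at the model's answer: {alignment:.3f}")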
@@ -88,7 +164,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   acceptability dataset ScaLA-pt. The machine translated ones include the sentiment
   classification dataset SST-2, the multiple choice reading comprehension dataset BoolQ,
   the knowledge dataset MMLU, and the common-sense reasoning dataset GoldenSwag. This
-  was contributed by
+  was contributed by @duarteocarmo ✨
 - Added `--gpu-memory-utilization` argument (`gpu_memory_utilization` in the
   `Benchmarker` API), which can be lowered in case the user is experiencing OOM errors
   when evaluating models. The default is 0.9 (same as previously), which means that vLLM

@@ -108,11 +184,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Added the English knowledge dataset Life in the UK, which has been added as an
   official dataset, replacing the existing English knowledge dataset MMLU, which in turn
   has been marked as unofficial now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Norwegian knowledge dataset Idioms-no, which is a multiple-choice question
   dataset where the alternative answers have been generated using GPT-4o. This has been
   added as an official dataset, and was contributed by
-
+  @oliverkinch ✨
 - Added new `LLMAsAJudgeMetric`, which allows evaluating the performance of a model with
   another judge model. This is useful for evaluating models in a reference-free manner,
   or if the metric is sufficiently complex. It is currently not used in any task, but

@@ -216,11 +292,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ### Added
 - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
   unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now uses asynchronous requests when dealing with API models, speeding up the generation
-  immensely. This was contributed by
+  immensely. This was contributed by @mathiasesn ✨

 ### Fixed
 - Add HellaSwag-fi back in, as the issue with the labels in the test split has been

@@ -272,7 +348,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   dataset [XL-Sum-fi](https://huggingface.co/datasets/TurkuNLP/xlsum-fi), and the
   common-sense reasoning dataset
   [HellaSwag-fi](https://huggingface.co/datasets/Finnish-NLP/hellaswag-fi-google-translate).
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added metadata for GPT-4.1 and Grok-3 models.
 - Marked Gemini-2.5-flash and Grok-3-mini as reasoning models, giving them more tokens
   to think.

@@ -315,7 +391,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ## [v15.6.1] - 2025-04-14
 ### Changed
 - Added more info about SQuAD-nl in the documentation. This was contributed by
-
+  @Rijgersberg ✨

 ### Fixed
 - The "E" option for the Norwegian NorCommonSenseQA dataset was not included in the

@@ -343,7 +419,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Uniformised the prompt templates used for each task, so that they are more
   consistent across tasks. Evaluation tests across different model types and sizes show
   no significant performance difference between the new and old templates. This was
-  contributed by
+  contributed by @viggo-gascou ✨

 ### Fixed
 - Avoid duplicate error messages when a rate limit occurs.

@@ -372,7 +448,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
   has been resolved.
 - Added overall progress indicator during evaluation. This was contributed by
-
+  @mathiasesn ✨

 ### Changed
 - Now does not use logprobs in text classification tasks with Google VertexAI models, as

@@ -411,9 +487,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ### Fixed
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
-
+  @Alkarex ✨
 - Evaluating a specific model revision did not work for adapter models, as there was a
   confusion between the revision of the adapter and the revision of the base model. We
   now use the revision for the adapter and use the latest revision for the base model.

@@ -439,7 +515,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   `HuggingFaceHubDown` exception.
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
   the splits were made by considering the original train/validation/test splits.

@@ -464,7 +540,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   [MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
   dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
   named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
-  was contributed by
+  was contributed by @oliverkinch ✨
 - Now extracts number of parameters and context length for Ollama models, using the
   `ollama` package. Vocabulary size is currently not available in the `ollama`
   package, so this is not extracted for Ollama models. For this reason, the `ollama`

@@ -517,7 +593,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
   recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
   unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
-  contributed by
+  contributed by @viggo-gascou ✨
 - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
   Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
   into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces

@@ -578,7 +654,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
   Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
   security on the system running the evaluations, if this is necessary. This was
-  contributed by
+  contributed by @Mikeriess ✨


 ### Fixed

@@ -607,19 +683,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   [personal-sum](https://github.com/SmartmediaAI/PersonalSum). It has been split into
   121 / 64 / 256 samples for train / validation / test, respectively, and is set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Jentoft dataset - a linguistic acceptability dataset which was published in
   [this Master's thesis](https://www.duo.uio.no/handle/10852/103885) by Matias Jentoft.
   The original dataset consists of 85,771 / 10,827 / 10,487 samples for training,
   validation and test, respectively. We use a split of 1,024 / 256 / 2,048 samples for
   training, validation and test, respectively. In each split, the distribution of
   `correct` and `incorrect` is 50/50. This dataset has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the dataset icelandic-knowledge, which is derived from the IcelandicQA dataset,
   reformatted as a knowledge dataset with GPT-4o generated candidate answers. The split
   is given by 845 / 128 / 1024 for train, val, and test, respectively. It is marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Changed the instruction prompts to all text classification tasks by specifying

@@ -657,8 +733,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   dataset [OrangeSum](https://hf.co/datasets/EdinburghNLP/orange_sum).
 - Added support for evaluating local models again, which supports models stored in the
   Hugging Face format with a Hugging Face model configuration file (`config.json`) in
-  the model directory. This was contributed by
-
+  the model directory. This was contributed by @rlrs and
+  @peter-sk ✨

 ### Changed
 - Changed the Belebele splits, as there were too few training splits for evaluation on

@@ -878,7 +954,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   dataset NO-Multi-QA-Sum (norglm-multi-qa). This dataset is part of the NLEBench
   Norwegian benchmarks. The answers from the original dataset have been rephrased with
   gpt-4o to contain the answer from the context. It has been marked as `unofficial` for
-  now. This was contributed by
+  now. This was contributed by @viggo-gascou ✨
 - Added the sentiment classification part of the Icelandic dataset Hotter and Colder,
   being a gold standard dataset. As no Icelandic sentiment classification dataset was
   included in the benchmark previously, this is now the official Icelandic sentiment

@@ -897,18 +973,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Added the summarisation part of the Norwegian NorGLM multi-task human annotated
   dataset NO-Multi-QA-Sum (`norglm-multi-sum`). This dataset is part of the NLEBench
   Norwegian benchmarks. It has been marked as `unofficial` for now. This was contributed
-  by
+  by @viggo-gascou ✨
 - Added `ice-linguistic`, a linguistic acceptability dataset which is a subset of the
   Icelandic Linguistic Benchmarks dataset. It is a small dataset with 94 train
   samples, 32 validation samples, and 256 test samples, and has been marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added `icelandic-qa`, an Icelandic question answering dataset about Icelandic culture
   and history. The original dataset has 2000 samples, but only 375 of the samples have
   answers that are found in the context (exact match). An LLM has therefore been used to
   rephrase the answers and we now have 1683 samples where the answers are found in the
   context (531 train, 128 val, 1024 test). It has been set to `unofficial` for now. This
-  was contributed by
+  was contributed by @oliverkinch ✨

 ### Fixed
 - Small typo in prefix prompt used for few-shot evaluation of the English sentiment

@@ -920,21 +996,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 ## [v13.1.0] - 2024-10-31
 - Added `ice-ec` (a subset of the dataset) and `ice-ec-full` (the full dataset), an
   Icelandic linguistic acceptability dataset. It has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the Schibsted summarisation dataset, which contains summaries of published
   articles from Schibsted Media's Norwegian and Swedish newsrooms. The dataset has been
   split into two separate small datasets, `schibsted-sv` for Swedish and `schibsted-no`
   for Norwegian. Note that both of these datasets are really small (89 and 374 test
   samples in `schibsted-sv` and `schibsted-no`, respectively), and have been set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Icelandic summarisation dataset IceSum. IceSum is a collection of 1,000
   Icelandic news articles from mbl.is, which have been manually annotated with
   summaries. The dataset has been marked as unofficial, meaning that it will not be
   automatically included when benchmarking models, but can be included by specifying the
   dataset explicitly using the --dataset argument (or dataset argument if using the
   Benchmarker API). This was contributed by
-
+  @viggo-gascou ✨
 - Added the new Faroese reading comprehension dataset FoQA. This is now the default
   Faroese reading comprehension benchmark, as there was none previously.
 - Now supports evaluation of models with adapters. This requires that the model

@@ -1236,7 +1312,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

 ### Fixed
 - Move tensor to the correct device when benchmarking seq-to-seq models (#363). Thanks
-  to
+  to @ThomasKluiters for this contribution! :tada:
 - Deals with the case where an instruction tuned model does not use any special token
   at the end of the chat, such as `<|im_end|>`. This holds for, e.g., Qwen models.
 - Better auto-detection of pipeline tag for models on the Hugging Face Hub, in case the

@@ -1250,7 +1326,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` need to have been set, or
   alternatively through the `--azure-openai-api-key`, `--azure-openai-endpoint` and
   `--azure-openai-api-version` arguments. Thanks to
-
+  @BramVanroy for all the help regarding the
   implementation of this :tada:
 - We now use the new JSON mode for newer OpenAI models for the NER task, to ensure
   better JSON generation.

@@ -1761,7 +1837,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - A `--use-flash-attention` flag has been added, which enables Flash Attention 2.0,
   which is required by some models, such as Mistral-based ones. If `flash-attn` has not
   been installed then an informative error message will be raised. Thanks to
-
+  @peter-sk for this contribution! :tada:

 ### Changed
 - Now uses 8-bit AdamW whenever CUDA is available, as opposed to regular AdamW.

@@ -1781,7 +1857,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
   OpenAI models. This currently happens automatically when specifying a generative
   model from the Hugging Face Hub, and with all OpenAI models.
 - Now stores model caches in separate directories, enabling parallel evaluations.
-  Thanks to
+  Thanks to @KennethEnevoldsen for this
   contribution! :tada:
 - Added `--device` argument to the CLI, which can be used to overwrite the automatic
   detection of device (CPU, CUDA GPU, MPS GPU, TPU) to use.

@@ -1850,7 +1926,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Now added support for benchmarking local models in the Hugging Face format (i.e.,
   saved with the `save_pretrained` method). This automatically detects the framework
   based on the file extension, but can also be set using the new `--model-framework`
-  argument. Thanks to
+  argument. Thanks to @peter-sk for implementing this!
   :tada:

 ### Fixed

@@ -2149,7 +2225,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 - Specific branches/commits/tags can now be benchmarked, using the `@`
   delimiter. For instance, `scandeval -m model_id@commit_hash` will benchmark
   the model with model ID `model_id`, stored at commit with hash `commit_hash`.
-  Thanks to
+  Thanks to @versae for contributing! :tada:


 ## [v2.2.0] - 2022-01-18

@@ -2159,8 +2235,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

 ## [v2.1.0] - 2022-01-17
 ### Added
-- Added support for `flax` models. Thanks to
-  [@versae](https://github.com/versae) for contributing! :tada:
+- Added support for `flax` models. Thanks to @versae for contributing! :tada:


 ## [v2.0.0] - 2022-01-07
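The Python >=3.11 floor in the v16.0.0 entry above has a concrete cause: star-unpacking inside a subscription only became valid syntax in Python 3.11 (PEP 646), and building a `Literal` type from a dynamic list of labels relies on it. A small demonstration, with an illustrative label set:

from typing import Literal

labels = ("positive", "negative", "neutral")

# Valid on Python >=3.11; a SyntaxError on Python 3.10 and earlier.
LabelType = Literal[*labels]

print(LabelType)  # typing.Literal['positive', 'negative', 'neutral']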
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.16.0
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues

@@ -28,18 +28,19 @@ License: MIT License
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 License-File: LICENSE
-Requires-Python: <4.0,>=3.10
+Requires-Python: <4.0,>=3.11
 Requires-Dist: accelerate>=1.9.0
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
+Requires-Dist: cloudpickle>=3.1.1
 Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.75.6
 Requires-Dist: more-itertools>=10.5.0
-Requires-Dist: numpy
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0

@@ -49,27 +50,24 @@ Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
 Requires-Dist: sacremoses>=0.1.1
-Requires-Dist: scikit-learn
+Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-
-Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
-Provides-Extra: test
-Requires-Dist: gradio>=4.26.0; extra == 'test'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

 <div align='center'>

@@ -223,17 +221,18 @@ A huge thank you to all the contributors who have helped make this project a success:
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval
README.md

@@ -149,17 +149,18 @@ A huge thank you to all the contributors who have helped make this project a success:
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval