EuroEval 15.15.0.tar.gz → 16.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/bug.yaml +1 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +2 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/.github/workflows/ci.yaml +26 -16
- {euroeval-15.15.0 → euroeval-16.0.0}/.pre-commit-config.yaml +5 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/CHANGELOG.md +108 -36
- {euroeval-15.15.0 → euroeval-16.0.0}/PKG-INFO +12 -14
- {euroeval-15.15.0 → euroeval-16.0.0}/README.md +3 -1
- euroeval-16.0.0/docs/datasets/estonian.md +544 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/icelandic.md +11 -11
- euroeval-16.0.0/docs/datasets/latvian.md +536 -0
- euroeval-16.0.0/docs/leaderboards/Monolingual/portuguese.md +23 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/README.md +1 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/makefile +2 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/pyproject.toml +11 -17
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/__init__.py +3 -7
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_config_factory.py +3 -7
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/base.py +35 -19
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/fresh.py +24 -19
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/hf.py +136 -154
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/litellm.py +323 -193
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/vllm.py +166 -112
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmarker.py +59 -33
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/cli.py +3 -3
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/constants.py +13 -15
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/data_loading.py +33 -28
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/data_models.py +53 -7
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/__init__.py +2 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/danish.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/dutch.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/english.py +38 -1
- euroeval-16.0.0/src/euroeval/dataset_configs/estonian.py +95 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/faroese.py +38 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/finnish.py +39 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/french.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/german.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/icelandic.py +39 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/italian.py +38 -1
- euroeval-16.0.0/src/euroeval/dataset_configs/latvian.py +81 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/norwegian.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/portuguese.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/spanish.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/swedish.py +38 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/enums.py +0 -6
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/finetuning.py +8 -7
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/generation.py +25 -14
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/generation_utils.py +46 -14
- euroeval-16.0.0/src/euroeval/languages.py +966 -0
- euroeval-16.0.0/src/euroeval/metrics/__init__.py +6 -0
- euroeval-16.0.0/src/euroeval/metrics/base.py +76 -0
- euroeval-16.0.0/src/euroeval/metrics/huggingface.py +192 -0
- euroeval-16.0.0/src/euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval-16.0.0/src/euroeval/metrics/pipeline.py +234 -0
- euroeval-16.0.0/src/euroeval/metrics/speed.py +51 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/multiple_choice.py +23 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/named_entity_recognition.py +65 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/reading_comprehension.py +42 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/sentiment_classification.py +46 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/summarization.py +24 -4
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/scores.py +7 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/speed_benchmark.py +6 -6
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/question_answering.py +35 -28
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/sequence_classification.py +96 -23
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/text_to_text.py +7 -3
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/token_classification.py +47 -75
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/tasks.py +31 -6
- euroeval-16.0.0/src/euroeval/tokenization_utils.py +586 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/utils.py +118 -34
- euroeval-16.0.0/src/scripts/create_copa_lv.py +143 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danish_citizen_tests.py +3 -2
- euroeval-16.0.0/src/scripts/create_err_news.py +83 -0
- euroeval-16.0.0/src/scripts/create_estner.py +115 -0
- euroeval-16.0.0/src/scripts/create_estonian_valence.py +86 -0
- euroeval-16.0.0/src/scripts/create_european_values.py +283 -0
- euroeval-16.0.0/src/scripts/create_exam_et.py +136 -0
- euroeval-16.0.0/src/scripts/create_fullstack_ner.py +248 -0
- euroeval-16.0.0/src/scripts/create_grammar_et.py +74 -0
- euroeval-16.0.0/src/scripts/create_latvian_lsm_summary.py +92 -0
- euroeval-16.0.0/src/scripts/create_latvian_twitter_sentiment.py +109 -0
- euroeval-16.0.0/src/scripts/create_mmlu_lv.py +263 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_multi_wiki_qa.py +1 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scala.py +4 -0
- euroeval-16.0.0/src/scripts/create_wikiann_lv.py +116 -0
- euroeval-16.0.0/src/scripts/create_winogrande_et.py +90 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/load_ud_pos.py +36 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/conftest.py +2 -19
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_hf.py +10 -13
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmarker.py +0 -44
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_cli.py +2 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_data_loading.py +15 -8
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_data_models.py +2 -2
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_scores.py +1 -1
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_tokenization_utils.py +7 -7
- {euroeval-15.15.0 → euroeval-16.0.0}/uv.lock +1335 -2204
- euroeval-15.15.0/src/euroeval/human_evaluation.py +0 -738
- euroeval-15.15.0/src/euroeval/languages.py +0 -206
- euroeval-15.15.0/src/euroeval/metrics.py +0 -468
- euroeval-15.15.0/src/euroeval/tokenization_utils.py +0 -498
- euroeval-15.15.0/tests/test_human_evaluation.py +0 -8
- {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/.gitignore +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/CITATION.cff +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/CONTRIBUTING.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/Dockerfile.cuda +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/LICENSE +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/CNAME +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/README.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/README.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/danish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/dutch.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/english.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/faroese.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/finnish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/french.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/german.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/italian.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/portuguese.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/spanish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/swedish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/faq.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/gfx/favicon.png +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/methodology.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/python-package.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/README.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/speed.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/summarization.md +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/gfx/euroeval.png +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/gfx/euroeval.xcf +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/gfx/scandeval.png +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/mkdocs.yaml +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_config.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/types.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/constants.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_arc.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dane.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fone.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_harem.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norec.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norne.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_publico.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/versioning.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_callbacks.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_constants.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_enums.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_exceptions.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_finetuning.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_generation.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_languages.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_cache.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_config.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_tasks.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_types.py +0 -0
- {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_utils.py +0 -0
@@ -25,12 +25,14 @@ body:
         - label: Danish
         - label: Dutch
         - label: English
+        - label: Estonian
         - label: Faroese
         - label: Finnish
         - label: French
         - label: German
         - label: Icelandic
         - label: Italian
+        - label: Latvian
         - label: Norwegian (Bokmål or Nynorsk)
         - label: Portuguese
         - label: Spanish
@@ -55,7 +55,7 @@ body:
     attributes:
       label: EuroEval version
       description: What version of EuroEval are you using?
-      placeholder: Output of `pip list | grep
+      placeholder: Output of `pip list | grep euroeval`
     validations:
       required: true
   - type: input
@@ -21,7 +21,8 @@ body:
         - label: Romance languages (French, Italian, Portuguese, Spanish)
         - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
         - label: West Germanic languages (Dutch, English, German)
-        - label: Finnish
+        - label: Finnic languages (Estonian, Finnish)
+        - label: Latvian
     validations:
       required: true
   - type: dropdown
@@ -22,16 +22,19 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
        with:
          persist-credentials: false
-
+          ref: main
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v6
        with:
+          enable-cache: false
          python-version: "3.11"
-
-
-
-        shell: bash
+
+      - name: Run pre-commit hooks
+        uses: pre-commit/action@v3.0.1

   pytest-linux:
     if: github.event.pull_request.draft == false
@@ -40,24 +43,25 @@ jobs:
       pull-requests: write
     strategy:
       matrix:
-        python-version: ["3.
+        python-version: ["3.11", "3.12", "3.13"]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
        with:
          persist-credentials: false
+          ref: main

      - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: false
          python-version: ${{ matrix.python-version }}

      - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

      - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

      - name: Test with pytest
        run: uv run pytest
@@ -66,6 +70,8 @@ jobs:
        HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+        GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+        XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

      - name: Delete EuroEval cache
        run: rm -rf .euroeval_cache
@@ -77,21 +83,25 @@ jobs:
       pull-requests: write
     runs-on: macos-latest
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v5
+        with:
+          persist-credentials: false
+          ref: main

      - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@
+        uses: astral-sh/setup-uv@v6
        with:
+          enable-cache: false
          python-version: ${{ matrix.python-version }}

      - name: Install Dependencies
-        run: uv sync --no-dev
+        run: uv sync --no-dev

      - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

      - name: Test with pytest
-        run: uv run pytest
+        run: uv run pytest -vvv
      env:
        HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
        HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
@@ -4,24 +4,27 @@ repos:
     hooks:
       - id: python-use-type-annotations
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev:
+    rev: v6.0.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.
+    rev: v0.12.12
    hooks:
      - id: ruff
        args:
          - --fix
          - --unsafe-fixes
          - --exit-non-zero-on-fix
+          - --no-cache
        types_or:
          - python
          - pyi
          - jupyter
      - id: ruff-format
+        args:
+          - --no-cache
        types_or:
          - python
          - pyi
@@ -10,12 +10,85 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v16.0.0] - 2025-09-05
+### Added
+- Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
+  Latvian Twitter Sentiment, the linguistic acceptability dataset ScaLA-lv, the named
+  entity recognition datasets FullStack-NER-lv and WikiANN-lv, the reading comprehension
+  dataset MultiWikiQA, the knowledge dataset MMLU-lv, the common-sense reasoning
+  dataset COPA-lv, and the summarisation dataset LSM.
+- Added support for Estonian 🇪🇪! It includes the sentiment classification dataset
+  Estonian Valence, the linguistic acceptability datasets Grammar-et and ScaLA-et, the
+  named entity recognition dataset EstNER, the reading comprehension dataset
+  MultiWikiQA-et, the summarisation dataset ERRNews, the knowledge dataset Exam-et,
+  and the common-sense reasoning dataset Winogrande-et. This was contributed by
+  @slowwavesleep ✨
+- It is now possible to evaluate how much a model adheres to European values! 🇪🇺 This
+  probes 53 questions from the European values survey, which have been chosen based on
+  an optimisation procedure that maximises agreement across the EU. We then measure how
+  well the model's answers align with the distribution of answers across the EU, using a
+  tree-based kernel density estimation. This can only be used zero-shot, and only with
+  instruction-based decoder models (including reasoning models).
+
+### Changed
+- When evaluating classification tasks, we now force the model to output one of the
+  labels. This is done directly with open models, and done via a JSON schema for API
+  models. This won't change the results for existing tasks, as logprobs are used, but
+  this was required to measure the European values.
+- Updated `vllm` dependency to `>=0.10.1`, which includes GPT-OSS support.
+- Updated `numpy` dependency to `>=2.0.0`, as the previous clash is no longer applicable.
+- Updated `transformers` dependency to `>=4.56.0`, which includes support for more
+  models.
+- Now requires Python >=3.11, as Python 3.10 does not support structured generation with
+  a dynamic set of choices (`Literal[*list_of_choices]` is not supported).
+
+### Fixed
+- Enabled support for evaluating Mistral models with their custom `mistral-common`
+  tokeniser, which includes all recent Mistral models. Note that we currently assume
+  that all of these models are instruction-tuned decoder models (which _is_ true
+  currently), which can lead to errors in case they publish different types of models in
+  the future.
+- Now disables the `seed` parameter if the API inference model does not support it,
+  which prevented evaluating some models.
+- Now correctly detects an API inference model as non-existing, even if LiteLLM *does*
+  see it as existing. We have an additional check during evaluation to ensure this now.
+- Catch an `ImportError` that sometimes happens when finishing the evaluation of a
+  vLLM model, during shutdown.
+- Now uses `litellm>=1.75.6`, which fixes an issue related to evaluation of GPT-5 models
+  using Ollama.
+- Now always uses the `multiprocessing` backend when evaluating vLLM models, rather than
+  reverting to `ray` when using multiple GPUs, as `ray` led to evaluations of several
+  models freezing.
+- Now does not require the user to be logged in to Hugging Face to benchmark models on
+  the Hugging Face Hub, if the models are public.
+
+### Removed
+- Removed support for human evaluation, as it was not actively maintained and not used.
+
+
+## [v15.16.0] - 2025-08-12
+### Added
+- Added metadata for GPT-5 models.
+
+### Changed
+- Updated `transformers` dependency to `>=4.55.0`.
+
+### Fixed
+- If the model uses 'mxfp4' quantisation then we allow the dtype to be bfloat16, rather
+  than forcing float16. This caused issues with the new GPT-OSS models.
+- Prevent multiple `Model <model-id> does not exist` logs when evaluating a model
+  that does not exist - now only logs this once.
+- Cleaner error message when attempting to benchmark a generative model without having a
+  GPU available.
+- Now raises an error if an inference API is used with a parameter that is not supported.
+
+
 ## [v15.15.0] - 2025-08-06
 ### Added
 - Added the common-sense reasoning dataset GoldenSwag for the following
   languages: Danish, German, Spanish, Finnish, French, Italian, Dutch, Swedish.
   The datasets are unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now allows metadata to be included in metrics, allowing more flexibility when
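The two `Changed` entries above are connected: forcing a model to choose among a dynamic set of labels is what drives the new Python 3.11 floor. A minimal illustration (a sketch, not EuroEval's actual implementation; the label set here is hypothetical) of why `Literal[*list_of_choices]` needs 3.11, and of how the same constraint can be expressed as a JSON schema for API models:

    from typing import Literal, get_args

    # Hypothetical label set for a sentiment classification task.
    labels = ("positive", "neutral", "negative")

    # Unpacking a dynamic tuple inside a subscript is a SyntaxError on
    # Python 3.10 but valid on 3.11+, hence the new `>=3.11` requirement.
    LabelType = Literal[*labels]

    # The same constraint as a JSON schema, in the spirit of the API-model
    # path described above: the model must answer with exactly one label.
    json_schema = {"type": "string", "enum": list(get_args(LabelType))}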
@@ -71,7 +144,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   acceptability dataset ScaLA-pt. The machine translated ones include the sentiment
   classification dataset SST-2, the multiple choice reading comprehension dataset BoolQ,
   the knowledge dataset MMLU, and the common-sense reasoning dataset GoldenSwag. This
-  was contributed by
+  was contributed by @duarteocarmo ✨
 - Added `--gpu-memory-utilization` argument (`gpu_memory_utilization` in the
   `Benchmarker` API), which can be lowered in case the user is experiencing OOM errors
   when evaluating models. The default is 0.9 (same as previously), which means that vLLM
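A usage sketch of the argument described in the entry above, assuming the `Benchmarker` API accepts the keyword exactly as named there:

    from euroeval import Benchmarker

    # Lowering GPU memory utilisation below the 0.9 default can help avoid
    # OOM errors when evaluating models on memory-constrained GPUs.
    benchmarker = Benchmarker(gpu_memory_utilization=0.8)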
@@ -91,11 +164,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the English knowledge dataset Life in the UK, which has been added as an
   official dataset, replacing the existing English knowledge dataset MMLU, which in turn
   has been marked as unofficial now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Norwegian knowledge dataset Idioms-no, which is a multiple-choice question
   dataset where the alternative answers have been generated using GPT-4o. This has been
   added as an official dataset, and was contributed by
-
+  @oliverkinch ✨
 - Added new `LLMAsAJudgeMetric`, which allows evaluating the performance of a model with
   another judge model. This is useful for evaluating models in a reference-free manner,
   or if the metric is sufficiently complex. It is currently not used in any task, but
@@ -199,11 +272,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
   unofficial for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Now uses asynchronous requests when dealing with API models, speeding up the generation
-  immensely. This was contributed by
+  immensely. This was contributed by @mathiasesn ✨

 ### Fixed
 - Add HellaSwag-fi back in, as the issue with the labels in the test split has been
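A minimal sketch of the asynchronous request pattern that entry refers to, with a hypothetical `query_api` coroutine standing in for a real API call:

    import asyncio

    async def query_api(prompt: str) -> str:
        # Hypothetical stand-in for a single request to an API model.
        await asyncio.sleep(0.1)
        return f"response to {prompt!r}"

    async def main() -> None:
        prompts = [f"prompt {i}" for i in range(10)]
        # Issuing all requests concurrently rather than sequentially is
        # what speeds up generation for API-hosted models.
        responses = await asyncio.gather(*(query_api(p) for p in prompts))
        print(len(responses))

    asyncio.run(main())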
@@ -255,7 +328,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [XL-Sum-fi](https://huggingface.co/datasets/TurkuNLP/xlsum-fi), and the
   common-sense reasoning dataset
   [HellaSwag-fi](https://huggingface.co/datasets/Finnish-NLP/hellaswag-fi-google-translate).
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added metadata for GPT-4.1 and Grok-3 models.
 - Marked Gemini-2.5-flash and Grok-3-mini as reasoning models, giving them more tokens
   to think.
@@ -298,7 +371,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v15.6.1] - 2025-04-14
 ### Changed
 - Added more info about SQuAD-nl in the documentation. This was contributed by
-
+  @Rijgersberg ✨

 ### Fixed
 - The "E" option for the Norwegian NorCommonSenseQA dataset was not included in the
@@ -326,7 +399,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Uniformised the prompt templates used for each task, so that they are more
   consistent across tasks. Evaluation tests across different model types and sizes show
   no significant performance difference between the new and old templates. This was
-  contributed by
+  contributed by @viggo-gascou ✨

 ### Fixed
 - Avoid duplicate error messages when a rate limit occurs.
@@ -355,7 +428,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
   has been resolved.
 - Added overall progress indicator during evaluation. This was contributed by
-
+  @mathiasesn ✨

 ### Changed
 - Now does not use logprobs in text classification tasks with Google VertexAI models, as
@@ -394,9 +467,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
-
+  @Alkarex ✨
 - Evaluating a specific model revision did not work for adapter models, as there was a
   confusion between the revision of the adapter and the revision of the base model. We
   now use the revision for the adapter and use the latest revision for the base model.
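The `fp16`/`bf16` switch above comes down to hardware support: bfloat16 requires CUDA compute capability >= 8.0 (Ampere and newer). A minimal sketch of that dtype selection in plain PyTorch (illustrative, not the project's exact code):

    import torch

    # bfloat16 is only supported on GPUs with compute capability >= 8.0
    # (Ampere and newer); older cards fall back to float16.
    if torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        dtype = torch.bfloat16 if major >= 8 else torch.float16
    else:
        dtype = torch.float32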
@@ -422,7 +495,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `HuggingFaceHubDown` exception.
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-
+  @marksverdhei ✨
 - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
   the splits were made by considering the original train/validation/test splits.

@@ -447,7 +520,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
   dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
   named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
-  was contributed by
+  was contributed by @oliverkinch ✨
 - Now extracts number of parameters and context length for Ollama models, using the
   `ollama` package. Vocabulary size is currently not available in the `ollama`
   package, so this is not extracted for Ollama models. For this reason, the `ollama`
@@ -500,7 +573,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
   recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
   unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
-  contributed by
+  contributed by @viggo-gascou ✨
 - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
   Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
   into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces
@@ -561,7 +634,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
   Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
   security on the system running the evaluations, if this is necessary. This was
-  contributed by
+  contributed by @Mikeriess ✨


 ### Fixed
@@ -590,19 +663,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [personal-sum](https://github.com/SmartmediaAI/PersonalSum). It has been split into
   121 / 64 / 256 samples for train / validation / test, respectively, and is set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Jentoft dataset - a linguistic acceptability dataset which was published in
   [this Master's thesis](https://www.duo.uio.no/handle/10852/103885) by Matias Jentoft.
   The original dataset consists of 85,771 / 10,827 / 10,487 samples for training,
   validation and test, respectively. We use a split of 1,024 / 256 / 2,048 samples for
   training, validation and test, respectively. In each split, the distribution of
   `correct` and `incorrect` is 50/50. This dataset has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the dataset icelandic-knowledge, which is derived from the IcelandicQA dataset,
   reformatted as a knowledge dataset with GPT-4o generated candidate answers. The split
   is given by 845 / 128 / 1024 for train, val, and test, respectively. It is marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨

 ### Changed
 - Changed the instruction prompts to all text classification tasks by specifying
@@ -640,8 +713,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [OrangeSum](https://hf.co/datasets/EdinburghNLP/orange_sum).
 - Added support for evaluating local models again, which supports models stored in the
   Hugging Face format with a Hugging Face model configuration file (`config.json`) in
-  the model directory. This was contributed by
-
+  the model directory. This was contributed by @rlrs and
+  @peter-sk ✨

 ### Changed
 - Changed the Belebele splits, as there were too few training splits for evaluation on
@@ -861,7 +934,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset NO-Multi-QA-Sum (norglm-multi-qa). This dataset is part of the NLEBench
   Norwegian benchmarks. The answers from the original dataset have been rephrased with
   gpt-4o to contain the answer from the context. It has been marked as `unofficial` for
-  now. This was contributed by
+  now. This was contributed by @viggo-gascou ✨
 - Added the sentiment classification part of the Icelandic dataset Hotter and Colder,
   being a gold standard dataset. As no Icelandic sentiment classification dataset was
   included in the benchmark previously, this is now the official Icelandic sentiment
@@ -880,18 +953,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the summarisation part of the Norwegian NorGLM multi-task human annotated
   dataset NO-Multi-QA-Sum (`norglm-multi-sum`). This dataset is part of the NLEBench
   Norwegian benchmarks. It has been marked as `unofficial` for now. This was contributed
-  by
+  by @viggo-gascou ✨
 - Added `ice-linguistic`, a linguistic acceptability dataset which is a subset of the
   Icelandic Linguistic Benchmarks dataset. It is a small dataset with 94 train
   samples, 32 validation samples, and 256 test samples, and has been marked as
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added `icelandic-qa`, an Icelandic question answering dataset about Icelandic culture
   and history. The original dataset has 2000 samples, but only 375 of the samples have
   answers that are found in the context (exact match). An LLM has therefore been used to
   rephrase the answers and we now have 1683 samples where the answers are found in the
   context (531 train, 128 val, 1024 test). It has been set to `unofficial` for now. This
-  was contributed by
+  was contributed by @oliverkinch ✨

 ### Fixed
 - Small typo in prefix prompt used for few-shot evaluation of the English sentiment
@@ -903,21 +976,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v13.1.0] - 2024-10-31
 - Added `ice-ec` (a subset of the dataset) and `ice-ec-full` (the full dataset), an
   Icelandic linguistic acceptability dataset. It has been set to `unofficial` for now.
-  This was contributed by
+  This was contributed by @oliverkinch ✨
 - Added the Schibsted summarisation dataset, which contains summaries of published
   articles from Schibsted Media's Norwegian and Swedish newsrooms. The dataset has been
   split into two separate small datasets, `schibsted-sv` for Swedish and `schibsted-no`
   for Norwegian. Note that both of these datasets are really small (89 and 374 test
   samples in `schibsted-sv` and `schibsted-no`, respectively), and have been set to
   `unofficial` for now. This was contributed by
-
+  @oliverkinch ✨
 - Added the Icelandic summarisation dataset IceSum. IceSum is a collection of 1,000
   Icelandic news articles from mbl.is, which have been manually annotated with
   summaries. The dataset has been marked as unofficial, meaning that it will not be
   automatically included when benchmarking models, but can be included by specifying the
   dataset explicitly using the --dataset argument (or dataset argument if using the
   Benchmarker API). This was contributed by
-
+  @viggo-gascou ✨
 - Added the new Faroese reading comprehension dataset FoQA. This is now the default
   Faroese reading comprehension benchmark, as there was none previously.
 - Now supports evaluation of models with adapters. This requires that the model
@@ -1219,7 +1292,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ### Fixed
 - Move tensor to the correct device when benchmarking seq-to-seq models (#363). Thanks
-  to
+  to @ThomasKluiters for this contribution! :tada:
 - Deals with the case where an instruction tuned model does not use any special token
   at the end of the chat, such as `<|im_end|>`. This holds for, e.g., Qwen models.
 - Better auto-detection of pipeline tag for models on the Hugging Face Hub, in case the
@@ -1233,7 +1306,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` need to have been set, or
   alternatively through the `--azure-openai-api-key`, `--azure-openai-endpoint` and
   `--azure-openai-api-version` arguments. Thanks to
-
+  @BramVanroy for all the help regarding the
   implementation of this :tada:
 - We now use the new JSON mode for newer OpenAI models for the NER task, to ensure
   better JSON generation.
@@ -1744,7 +1817,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - A `--use-flash-attention` flag has been added, which enables Flash Attention 2.0,
   which is required by some models, such as Mistral-based ones. If `flash-attn` has not
   been installed then an informative error message will be raised. Thanks to
-
+  @peter-sk for this contribution! :tada:

 ### Changed
 - Now uses 8-bit AdamW whenever CUDA is available, as opposed to regular AdamW.
@@ -1764,7 +1837,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   OpenAI models. This currently happens automatically when specifying a generative
   model from the Hugging Face Hub, and with all OpenAI models.
 - Now stores model caches in separate directories, enabling parallel evaluations.
-  Thanks to
+  Thanks to @KennethEnevoldsen for this
   contribution! :tada:
 - Added `--device` argument to the CLI, which can be used to overwrite the automatic
   detection of device (CPU, CUDA GPU, MPS GPU, TPU) to use.
@@ -1833,7 +1906,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Now added support for benchmarking local models in the Hugging Face format (i.e.,
   saved with the `save_pretrained` method). This automatically detects the framework
   based on the file extension, but can also be set using the new `--model-framework`
-  argument. Thanks to
+  argument. Thanks to @peter-sk for implementing this!
   :tada:

 ### Fixed
@@ -2132,7 +2205,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Specific branches/commits/tags can now be benchmarked, using the `@`
   delimiter. For instance, `scandeval -m model_id@commit_hash` will benchmark
   the model with model ID `model_id`, stored at commit with hash `commit_hash`.
-  Thanks to
+  Thanks to @versae for contributing! :tada:


 ## [v2.2.0] - 2022-01-18
@@ -2142,8 +2215,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ## [v2.1.0] - 2022-01-17
 ### Added
-- Added support for `flax` models. Thanks to
-  [@versae](https://github.com/versae) for contributing! :tada:
+- Added support for `flax` models. Thanks to @versae for contributing! :tada:


 ## [v2.0.0] - 2022-01-07
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.15.0
+Version: 16.0.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -28,18 +28,19 @@ License: MIT License
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 License-File: LICENSE
-Requires-Python: <4.0,>=3.10
+Requires-Python: <4.0,>=3.11
 Requires-Dist: accelerate>=1.9.0
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
+Requires-Dist: cloudpickle>=3.1.1
 Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.75.6
 Requires-Dist: more-itertools>=10.5.0
-Requires-Dist: numpy
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0
@@ -49,27 +50,22 @@ Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
 Requires-Dist: sacremoses>=0.1.1
-Requires-Dist: scikit-learn
+Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: vllm>=0.10.
-Provides-Extra: human-evaluation
-Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
-Provides-Extra: test
-Requires-Dist: gradio>=4.26.0; extra == 'test'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

 <div align='center'>
@@ -223,16 +219,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval
@@ -149,16 +149,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


 ### Contribute to EuroEval