EuroEval 16.1.1.tar.gz → 16.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +1 -0
- euroeval-16.2.1/.github/ISSUE_TEMPLATE/language_request.yaml +49 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +1 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/CHANGELOG.md +34 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/PKG-INFO +31 -7
- {euroeval-16.1.1 → euroeval-16.2.1}/README.md +26 -2
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/icelandic.md +10 -10
- {euroeval-16.1.1 → euroeval-16.2.1}/pyproject.toml +10 -7
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/__init__.py +7 -6
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_config_factory.py +41 -125
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/hf.py +31 -16
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/litellm.py +2 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/vllm.py +24 -9
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmarker.py +138 -16
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/cli.py +8 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/data_models.py +5 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/generation.py +3 -1
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/base.py +12 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/huggingface.py +23 -2
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +6 -5
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/named_entity_recognition.py +3 -3
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/sentiment_classification.py +5 -5
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/tasks.py +3 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/tokenisation_utils.py +0 -6
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/types.py +2 -2
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/utils.py +77 -5
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/conftest.py +1 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmarker.py +56 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_cli.py +2 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_data_loading.py +9 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/uv.lock +668 -522
- {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/.github/workflows/ci.yaml +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/.gitignore +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/.pre-commit-config.yaml +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/CITATION.cff +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/CONTRIBUTING.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/Dockerfile.cuda +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/LICENSE +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/CNAME +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/README.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/README.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/danish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/dutch.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/english.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/estonian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/faroese.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/finnish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/french.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/german.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/italian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/latvian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/polish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/portuguese.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/spanish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/swedish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/faq.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/gfx/favicon.png +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/estonian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/finnic.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/README.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/methodology.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/python-package.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/README.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/speed.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/summarization.md +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/gfx/euroeval.png +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/gfx/euroeval.xcf +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/gfx/scandeval.png +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/makefile +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/mkdocs.yaml +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/constants.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/data_loading.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/danish.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/dutch.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/estonian.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/latvian.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/polish.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/portuguese.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/enums.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/finetuning.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/generation_utils.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/languages.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/pipeline.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/speed.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_cache.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_config.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/scores.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/constants.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_allocine.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_arc.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_arc_is.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_belebele.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_en.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_es.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_copa_lv.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dane.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dansk.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dbrd.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_eltec.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_err_news.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_estner.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_estonian_valence.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_european_values.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_exam_et.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fone.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_foqa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fosent.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fquad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fullstack_ner.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_germanquad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_germeval.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_grammar_et.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_harem.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icesum.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_jentoft.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_kpwr_ner.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_llmzszl.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mmlu.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mmlu_lv.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_multi_wiki_qa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_no_cola.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norec.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norne.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norquad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nqii.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_polemo2.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_poquad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_psc.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_publico.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_rrn.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sb10k.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scala.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_schibsted.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_it.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sst5.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_suc3.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swedish_skolprov.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swedn.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swerec.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_trivia_et.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wikiann_lv.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande_et.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_xquad.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/versioning.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_callbacks.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_constants.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_data_models.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_enums.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_exceptions.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_finetuning.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_languages.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_model_config.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_model_loading.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_scores.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_tasks.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_tokenisation_utils.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_types.py +0 -0
- {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_utils.py +0 -0
euroeval-16.2.1/.github/ISSUE_TEMPLATE/language_request.yaml (new file)
@@ -0,0 +1,49 @@
+name: 🌍 Language Request
+description: Is there a European language missing in EuroEval?
+title: "[LANGUAGE REQUEST] <language-name>"
+labels: "new language"
+type: task
+
+body:
+  - type: input
+    attributes:
+      label: Language name and code
+      description: What is the name and ISO 639 code of the language?
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        Here are some existing evaluation datasets in the language, that could be used:
+  - type: textarea
+    attributes:
+      label: Sentiment classification dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Linguistic acceptability dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Named entity recognition dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Reading comprehension dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Summarisation dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Knowledge dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Common-sense reasoning dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!
{euroeval-16.1.1 → euroeval-16.2.1}/CHANGELOG.md
@@ -10,6 +10,40 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v16.2.1] - 2025-09-15
+### Fixed
+- Some of the `download_only` arguments were missing in the code, and have now been
+  added.
+
+
+## [v16.2.0] - 2025-09-15
+### Added
+- Now supports evaluating models in an offline environment. This is done by first
+  downloading all necessary models, datasets, metrics and other artifacts while online,
+  using the new `--download-only` flag (or `download_only=True` in the `Benchmarker`
+  API). Then you can safely disable internet access and run the evaluation as normal,
+  and it will use the cached models, datasets and metrics. This was contributed by
+  @viggo-gascou ✨
+- Added the `timm` package to the set of `generative` extra dependencies, as it is
+  required to load some multimodal models, such as Gemma-3n.
+
+### Changed
+- Now does not benchmark encoder models on multiple-choice classification tasks, as they
+  get near-random performance and these scores are not used in the leaderboards. We can
+  change this in the future if we find a way to make encoder models work better on these
+  tasks.
+- For generative vLLM models that can swap between reasoning and non-reasoning modes,
+  we previously defaulted to reasoning. We now default to what the model uses by
+  default, which is non-reasoning for most models.
+
+### Fixed
+- Fixed an issue where old evaluation records could not be loaded, as the format had
+  changed. We are now able to load old records again.
+- Fixed some grammatical errors in the Icelandic prompts.
+- Now stores model IDs with parameters (e.g., `o3#low`) correctly in the benchmark
+  results, rather than just the base model ID (e.g., `o3`).
+
+
 ## [v16.1.1] - 2025-09-12
 ### Fixed
 - Fixed an issue from v16.1.0, where reasoning models were not using the tokeniser's
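Putting the offline-evaluation entry above into practice, here is a minimal sketch that mirrors the snippets added to the README in this release; the `<model-id>` placeholder and the Danish sentiment-classification example come straight from that README section, so replace them with your own values:

```python
from euroeval import Benchmarker

benchmark = Benchmarker()

# Step 1 (while online): cache the model, datasets and metrics locally.
benchmark(
    model="<model-id>",  # placeholder, as in the README
    task="sentiment-classification",
    language="da",
    download_only=True,
)

# Step 2 (offline): run the same evaluation; the cached artifacts are reused.
benchmark(
    model="<model-id>",
    task="sentiment-classification",
    language="da",
)
```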
{euroeval-16.1.1 → euroeval-16.2.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.1.1
+Version: 16.2.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: timm>=1.0.19; extra == 'all'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: timm>=1.0.19; extra == 'generative'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

 <div align='center'>
@@ -152,13 +152,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```

 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```

 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```

+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.1 → euroeval-16.2.1}/README.md
@@ -80,13 +80,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```

 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```

 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -96,6 +96,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```

+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/icelandic.md
@@ -44,11 +44,11 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
 ```
-
+Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
 ```
 - Base prompt template:
 ```
-
+Textabrot: {text}
 Lyndi: {label}
 ```
 - Instruction-tuned prompt template:
@@ -117,13 +117,13 @@ When evaluating generative models, we use the following setup (see the
 - Base prompt template:
 ```
 Setning: {text}
-
+Nafneiningar: {label}
 ```
 - Instruction-tuned prompt template:
 ```
 Setning: {text}

-
+Greindu nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
 ```
 - Label mapping:
     - `B-PER` ➡️ `einstaklingur`
@@ -186,7 +186,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
 ```
-
+Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
 ```
 - Base prompt template:
 ```
@@ -197,7 +197,7 @@ When evaluating generative models, we use the following setup (see the
 ```
 Setning: {text}

-
+Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
 ```
 - Label mapping:
     - `correct` ➡️ `já`
@@ -249,7 +249,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
 ```
-
+Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
 ```
 - Base prompt template:
 ```
@@ -260,7 +260,7 @@ When evaluating generative models, we use the following setup (see the
 ```
 Setning: {text}

-
+Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
 ```
 - Label mapping:
     - `correct` ➡️ `já`
@@ -310,7 +310,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
 ```
-
+Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
 ```
 - Base prompt template:
 ```
@@ -321,7 +321,7 @@ When evaluating generative models, we use the following setup (see the
 ```
 Setning: {text}

-
+Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
 ```
 - Label mapping:
     - `correct` ➡️ `já`
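The instruction-tuned NER prompt above asks the model to answer with a JSON dictionary keyed by the four entity types from the label mapping. An illustrative sketch of that expected shape, written as a Python dict with made-up entity values:

```python
# Keys come from the prompt and label mapping above; the values are invented
# purely to illustrate the format.
expected_ner_answer = {
    "einstaklingur": ["Jón Jónsson"],  # person entities, verbatim from the sentence
    "staðsetning": ["Reykjavík"],      # location entities
    "stofnun": [],                     # organisation entities
    "ýmislegt": [],                    # miscellaneous entities
}
```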
{euroeval-16.1.1 → euroeval-16.2.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "16.1.1"
+version = "16.2.1"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -33,7 +33,7 @@ dependencies = [
     "rouge-score>=0.1.2",
     "bert-score>=0.3.13",
     "levenshtein>=0.24.0",
-    "scikit-learn==1.6.1",
+    "scikit-learn==1.6.1",  # Required for loading European values pipeline
     "setuptools>=75.8.2",
     "demjson3>=3.0.6",
     "ollama>=0.5.1",
@@ -45,15 +45,15 @@ dependencies = [
 [project.optional-dependencies]
 generative = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.10.1; platform_system == 'Linux'",
-    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+    "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+    "timm>=1.0.19",
 ]
 all = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.10.1; platform_system == 'Linux'",
-    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+    "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+    "timm>=1.0.19",
 ]

 [project.urls]
@@ -100,6 +100,8 @@ dev-dependencies = [
     "types-ujson>=5.10.0.20240515",
     "types-simplejson>=3.2.0.2025032",
     "debugpy>=1.8.13",
+    "pytest-socket>=0.7.0",
+    "pytest-dependency>=0.6.0",
 ]

 [tool.ruff]
@@ -170,6 +172,7 @@ addopts = [
     "--cov=src/euroeval",
     "--color=yes",
     "-vvv",
+    "--allow-unix-socket"
 ]
 xfail_strict = true
 filterwarnings = [
@@ -181,7 +184,7 @@ filterwarnings = [
     "ignore::ResourceWarning",
     "ignore::FutureWarning",
 ]
-log_cli_level = "
+log_cli_level = "INFO"
 testpaths = [
     "tests",
     "src/euroeval",
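The new `pytest-socket` dev dependency and the `--allow-unix-socket` entry added to pytest's `addopts` suggest that parts of the test suite now run with network access blocked, plausibly to cover the new offline mode. As an assumption about what such a test could look like (not a copy of the actual tests):

```python
import pytest


@pytest.mark.disable_socket
def test_runs_without_network() -> None:
    """With pytest-socket, any attempt to open a network socket here fails the test."""
    assert sum([1, 2, 3]) == 6
```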
{euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/__init__.py
@@ -12,12 +12,13 @@ import warnings
 from termcolor import colored

 # Block specific warnings before importing anything else, as they can be noisy
-
-warnings.filterwarnings("ignore", category=
-
-logging.getLogger("
-logging.getLogger("
-
+if os.getenv("FULL_LOG") != "1":
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
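The `__init__.py` change above gates the warning filters and third-party logger silencing behind a `FULL_LOG` environment variable. A small sketch of opting back into the full output, assuming the variable only needs to be set before `euroeval` is first imported (the gate runs at import time):

```python
import os

# Assumption: FULL_LOG must be "1" in the environment before euroeval is imported.
os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402  # warnings and httpx/datasets/vllm logs stay visible
```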
{euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_config_factory.py
@@ -6,9 +6,9 @@ import typing as t

 import torch

-from .data_models import BenchmarkConfig
+from .data_models import BenchmarkConfig, BenchmarkConfigParams
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
@@ -21,150 +21,66 @@ logger = logging.getLogger("euroeval")


 def build_benchmark_config(
-
-    save_results: bool,
-    task: str | list[str] | None,
-    dataset: str | list[str] | None,
-    language: str | list[str],
-    model_language: str | list[str] | None,
-    dataset_language: str | list[str] | None,
-    device: Device | None,
-    batch_size: int,
-    raise_errors: bool,
-    cache_dir: str,
-    api_key: str | None,
-    force: bool,
-    verbose: bool,
-    trust_remote_code: bool,
-    clear_model_cache: bool,
-    evaluate_test_split: bool,
-    few_shot: bool,
-    num_iterations: int,
-    api_base: str | None,
-    api_version: str | None,
-    gpu_memory_utilization: float,
-    generative_type: GenerativeType | None,
-    debug: bool,
-    run_with_cli: bool,
-    requires_safetensors: bool,
+    benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.

     Args:
-
-
-        save_results:
-            Whether to save the benchmark results to a file.
-        task:
-            The tasks to include for dataset. If None then datasets will not be
-            filtered based on their task.
-        dataset:
-            The datasets to include for task. If None then all datasets will be
-            included, limited by the `task` parameter.
-        language:
-            The language codes of the languages to include, both for models and
-            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
-            to 'all' if all languages should be considered.
-        model_language:
-            The language codes of the languages to include for models. If None then
-            the `language` parameter will be used.
-        dataset_language:
-            The language codes of the languages to include for datasets. If None then
-            the `language` parameter will be used.
-        device:
-            The device to use for running the models. If None then the device will be
-            set automatically.
-        batch_size:
-            The batch size to use for running the models.
-        raise_errors:
-            Whether to raise errors when running the benchmark.
-        cache_dir:
-            The directory to use for caching the models.
-        api_key:
-            The API key to use for a given inference server.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output when running the benchmark. This is
-            automatically set if `debug` is True.
-        trust_remote_code:
-            Whether to trust remote code when running the benchmark.
-        clear_model_cache:
-            Whether to clear the model cache before running the benchmark.
-        evaluate_test_split:
-            Whether to use the test split for the datasets.
-        few_shot:
-            Whether to use few-shot learning for the models.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use for a given inference API.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        generative_type:
-            The type of generative model. Only relevant if the model is generative. If
-            not specified, the type will be inferred automatically.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-        requires_safetensors:
-            Whether to only allow evaluations of models stored as safetensors.
+        benchmark_config_params:
+            The parameters for creating the benchmark configuration.

     Returns:
         The benchmark configuration.
     """
-    language_codes = get_correct_language_codes(
+    language_codes = get_correct_language_codes(
+        language_codes=benchmark_config_params.language
+    )
     model_languages = prepare_languages(
-        language_codes=model_language,
+        language_codes=benchmark_config_params.model_language,
+        default_language_codes=language_codes,
     )
     dataset_languages = prepare_languages(
-        language_codes=dataset_language,
+        language_codes=benchmark_config_params.dataset_language,
+        default_language_codes=language_codes,
     )

     tasks, datasets = prepare_tasks_and_datasets(
-        task=task,
+        task=benchmark_config_params.task,
+        dataset=benchmark_config_params.dataset,
+        dataset_languages=dataset_languages,
     )

-    torch_device = prepare_device(device=device)
-
-    # Set variable with number of iterations
-    if hasattr(sys, "_called_from_test"):
-        num_iterations = 1
-
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
         tasks=tasks,
         datasets=datasets,
-        batch_size=batch_size,
-        raise_errors=raise_errors,
-        cache_dir=cache_dir,
-        api_key=api_key,
-        force=force,
-        progress_bar=progress_bar,
-        save_results=save_results,
-        verbose=verbose or debug,
-        device=
-        trust_remote_code=trust_remote_code,
-        clear_model_cache=clear_model_cache,
-        evaluate_test_split=evaluate_test_split,
-        few_shot=few_shot,
-        num_iterations=
-
-
-
-
-
-
-
+        batch_size=benchmark_config_params.batch_size,
+        raise_errors=benchmark_config_params.raise_errors,
+        cache_dir=benchmark_config_params.cache_dir,
+        api_key=benchmark_config_params.api_key,
+        force=benchmark_config_params.force,
+        progress_bar=benchmark_config_params.progress_bar,
+        save_results=benchmark_config_params.save_results,
+        verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+        device=prepare_device(device=benchmark_config_params.device),
+        trust_remote_code=benchmark_config_params.trust_remote_code,
+        clear_model_cache=benchmark_config_params.clear_model_cache,
+        evaluate_test_split=benchmark_config_params.evaluate_test_split,
+        few_shot=benchmark_config_params.few_shot,
+        num_iterations=(
+            1
+            if hasattr(sys, "_called_from_test")
+            else benchmark_config_params.num_iterations
+        ),
+        api_base=benchmark_config_params.api_base,
+        api_version=benchmark_config_params.api_version,
+        gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        generative_type=benchmark_config_params.generative_type,
+        debug=benchmark_config_params.debug,
+        run_with_cli=benchmark_config_params.run_with_cli,
+        requires_safetensors=benchmark_config_params.requires_safetensors,
+        download_only=benchmark_config_params.download_only,
     )

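The `build_benchmark_config` refactor above replaces roughly twenty-five keyword arguments with a single `BenchmarkConfigParams` object that is read off attribute by attribute. The pattern in miniature, using generic names rather than the actual EuroEval classes (the real `BenchmarkConfigParams` lives in `src/euroeval/data_models.py` and has many more fields):

```python
from dataclasses import dataclass


@dataclass
class ConfigParams:
    """Generic stand-in for a parameter object such as BenchmarkConfigParams."""

    language: list[str]
    batch_size: int = 32
    download_only: bool = False


def build_config(params: ConfigParams) -> dict:
    """Consume the parameter object instead of a long keyword-argument list."""
    return {
        "language": params.language,
        "batch_size": params.batch_size,
        "download_only": params.download_only,
    }


print(build_config(ConfigParams(language=["da"], download_only=True)))
```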