EuroEval 15.9.2__tar.gz → 15.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- {euroeval-15.9.2 → euroeval-15.10.1}/.pre-commit-config.yaml +2 -2
- {euroeval-15.9.2 → euroeval-15.10.1}/CHANGELOG.md +32 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/PKG-INFO +7 -8
- {euroeval-15.9.2 → euroeval-15.10.1}/README.md +1 -1
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/README.md +1 -1
- euroeval-15.10.1/docs/leaderboards/Monolingual/finnish.md +15 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/pyproject.toml +7 -7
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/hf.py +3 -3
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/litellm.py +158 -122
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/vllm.py +47 -143
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/data_loading.py +8 -2
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/finetuning.py +22 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +11 -1
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/question_answering.py +14 -4
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/tokenization_utils.py +103 -9
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/utils.py +13 -8
- {euroeval-15.9.2 → euroeval-15.10.1}/uv.lock +1754 -1758
- {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/.github/workflows/ci.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/.gitignore +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/CITATION.cff +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/CONTRIBUTING.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/Dockerfile.cuda +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/LICENSE +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/CNAME +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/README.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/danish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/dutch.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/english.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/faroese.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/finnish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/french.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/german.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/icelandic.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/italian.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/spanish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/swedish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/faq.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/gfx/favicon.png +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/README.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/methodology.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/python-package.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/README.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/speed.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/summarization.md +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/gfx/euroeval.png +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/gfx/euroeval.xcf +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/gfx/scandeval.png +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/makefile +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/mkdocs.yaml +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmarker.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/cli.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/constants.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/data_models.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/danish.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/dutch.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/enums.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/generation.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/generation_utils.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/human_evaluation.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/languages.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_config.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/scores.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/tasks.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/types.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/constants.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_arc.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dane.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fone.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norec.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norne.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scala.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/versioning.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/conftest.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmarker.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_callbacks.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_cli.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_constants.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_data_loading.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_data_models.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_enums.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_exceptions.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_finetuning.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_generation.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_languages.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_cache.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_config.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_loading.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_scores.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_tasks.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_tokenization_utils.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_types.py +0 -0
- {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_utils.py +0 -0
|
@@ -10,7 +10,7 @@ repos:
|
|
|
10
10
|
- id: trailing-whitespace
|
|
11
11
|
- id: debug-statements
|
|
12
12
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
-
rev: v0.
|
|
13
|
+
rev: v0.12.0
|
|
14
14
|
hooks:
|
|
15
15
|
- id: ruff
|
|
16
16
|
args:
|
|
@@ -31,7 +31,7 @@ repos:
|
|
|
31
31
|
hooks:
|
|
32
32
|
- id: nbstripout
|
|
33
33
|
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
34
|
-
rev: v1.16.
|
|
34
|
+
rev: v1.16.1
|
|
35
35
|
hooks:
|
|
36
36
|
- id: mypy
|
|
37
37
|
args:
|
|
@@ -10,6 +10,38 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
## [v15.10.1] - 2025-06-20
|
|
14
|
+
### Fixed
|
|
15
|
+
- Fixed an issue when benchmarking encoder models on reading comprehension tasks, where
|
|
16
|
+
we sometimes would truncate the model outputs when they should not have been.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
## [v15.10.0] - 2025-06-17
|
|
20
|
+
### Changed
|
|
21
|
+
- Updated `vllm` to `>=0.9.1`.
|
|
22
|
+
- Updated `litellm` to `>=1.72.2`.
|
|
23
|
+
- Updated `ollama` to `>=0.5.1`.
|
|
24
|
+
Better detection of instruction-tuned models.
|
|
25
|
+
|
|
26
|
+
### Fixed
|
|
27
|
+
- Fixed an issue where the EOS token would be included in the vLLM generation output,
|
|
28
|
+
leading to incorrect evaluation results. We now manually remove all stop tokens from
|
|
29
|
+
the generation output, which fixes this issue.
|
|
30
|
+
- Now correctly detects reasoning models for Ollama models and enables their new "think"
|
|
31
|
+
parameter whenever a reasoning model is detected.
|
|
32
|
+
- Added a cap on the number of concurrent connections when evaluating API models, to
|
|
33
|
+
avoid running into errors related to too many open file descriptors. In case this
|
|
34
|
+
error _still_ occurs, we now give the user an informative error message on how to
|
|
35
|
+
increase the maximum number of open file descriptors on their system.
|
|
36
|
+
- Catch requests.ConnectionError when loading datasets.
|
|
37
|
+
- When benchmarking encoder models on reading comprehension tasks, we allow the model
|
|
38
|
+
outputs to have more than two elements (start and end position logits), where we
|
|
39
|
+
instead just use the first two elements and ignore the rest.
|
|
40
|
+
- When an encoder model outputs additional tensors aside from the logits, we now remove
|
|
41
|
+
these tensors from the output dictionary via the `preprocess_logits_for_metrics`
|
|
42
|
+
argument to `Trainer`.
|
|
43
|
+
|
|
44
|
+
|
|
13
45
|
## [v15.9.2] - 2025-06-04
|
|
14
46
|
### Fixed
|
|
15
47
|
- Allow a model to not have any BOS and EOS tokens.
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: EuroEval
|
|
3
|
-
Version: 15.
|
|
3
|
+
Version: 15.10.1
|
|
4
4
|
Summary: The robust European language model benchmark.
|
|
5
5
|
Project-URL: Repository, https://github.com/EuroEval/EuroEval
|
|
6
6
|
Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
|
|
7
7
|
Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
|
|
8
|
-
Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk
|
|
8
|
+
Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
|
|
9
9
|
License: MIT License
|
|
10
10
|
|
|
11
11
|
Copyright (c) 2022-2024 Dan Saattrup Nielsen
|
|
@@ -37,13 +37,12 @@ Requires-Dist: demjson3>=3.0.6
|
|
|
37
37
|
Requires-Dist: evaluate>=0.4.1
|
|
38
38
|
Requires-Dist: huggingface-hub>=0.30.1
|
|
39
39
|
Requires-Dist: levenshtein>=0.24.0
|
|
40
|
-
Requires-Dist: litellm>=1.
|
|
40
|
+
Requires-Dist: litellm>=1.72.2
|
|
41
41
|
Requires-Dist: more-itertools>=10.5.0
|
|
42
42
|
Requires-Dist: numpy<2.0.0,>=1.23.0
|
|
43
|
-
Requires-Dist: ollama>=0.
|
|
43
|
+
Requires-Dist: ollama>=0.5.1
|
|
44
44
|
Requires-Dist: pandas>=2.2.0
|
|
45
45
|
Requires-Dist: peft>=0.15.0
|
|
46
|
-
Requires-Dist: protobuf~=3.20.0
|
|
47
46
|
Requires-Dist: pydantic>=2.6.0
|
|
48
47
|
Requires-Dist: pyinfer>=0.0.3
|
|
49
48
|
Requires-Dist: python-dotenv>=1.0.1
|
|
@@ -62,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
|
|
|
62
61
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
|
|
63
62
|
Requires-Dist: gradio>=4.26.0; extra == 'all'
|
|
64
63
|
Requires-Dist: outlines>=0.1.11; extra == 'all'
|
|
65
|
-
Requires-Dist: vllm>=0.9.
|
|
64
|
+
Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
|
|
66
65
|
Provides-Extra: generative
|
|
67
66
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
|
|
68
67
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
|
|
69
68
|
Requires-Dist: outlines>=0.1.11; extra == 'generative'
|
|
70
|
-
Requires-Dist: vllm>=0.9.
|
|
69
|
+
Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
|
|
71
70
|
Provides-Extra: human-evaluation
|
|
72
71
|
Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
|
|
73
72
|
Provides-Extra: test
|
|
@@ -93,7 +92,7 @@ ______________________________________________________________________
|
|
|
93
92
|
[](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
|
|
94
93
|
|
|
95
94
|
|
|
96
|
-
##
|
|
95
|
+
## Maintainer
|
|
97
96
|
|
|
98
97
|
- Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
|
|
99
98
|
dan.nielsen@alexandra.dk)
|
|
@@ -17,7 +17,7 @@ ______________________________________________________________________
|
|
|
17
17
|
[](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
##
|
|
20
|
+
## Maintainer
|
|
21
21
|
|
|
22
22
|
- Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
|
|
23
23
|
dan.nielsen@alexandra.dk)
|
|
@@ -29,7 +29,7 @@ or [LM Studio](https://lmstudio.ai/).
|
|
|
29
29
|
The idea of EuroEval grew out of the development of Danish language model RøBÆRTa in
|
|
30
30
|
2021, when we realised that there was no standard way to evaluate Danish language
|
|
31
31
|
models. It started as a hobby project including Danish, Swedish and Norwegian, but has
|
|
32
|
-
since grown to include
|
|
32
|
+
since grown to include 12+ European languages.
|
|
33
33
|
|
|
34
34
|
EuroEval is maintained by [Dan Saattrup Nielsen](https://www.saattrupdan.com/) from the
|
|
35
35
|
[Alexandra Institute](https://alexandra.dk), and is funded by the EU project
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
---
|
|
2
|
+
hide:
|
|
3
|
+
- toc
|
|
4
|
+
---
|
|
5
|
+
# 🇫🇮 Finnish
|
|
6
|
+
|
|
7
|
+
See the [leaderboard page](/leaderboards) for more information about all the columns.
|
|
8
|
+
|
|
9
|
+
/// tab | Generative Leaderboard
|
|
10
|
+
<iframe title="" aria-label="Table" id="datawrapper-chart-ubHSy" src="https://datawrapper.dwcdn.net/ubHSy" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="847" data-external="1"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}}))}();</script>
|
|
11
|
+
///
|
|
12
|
+
|
|
13
|
+
/// tab | NLU Leaderboard
|
|
14
|
+
<iframe title="" aria-label="Table" id="datawrapper-chart-qVbA3" src="https://datawrapper.dwcdn.net/qVbA3/1/" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="818" data-external="1"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}}))}();</script>
|
|
15
|
+
///
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "EuroEval"
|
|
3
|
-
version = "15.
|
|
3
|
+
version = "15.10.1"
|
|
4
4
|
description = "The robust European language model benchmark."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
authors = [
|
|
@@ -8,7 +8,6 @@ authors = [
|
|
|
8
8
|
]
|
|
9
9
|
maintainers = [
|
|
10
10
|
{name = "Dan Saattrup Nielsen", email = "dan.nielsen@alexandra.dk"},
|
|
11
|
-
{name = "Kenneth Enevoldsen", email = "kenneth.enevoldsen@cas.au.dk"},
|
|
12
11
|
]
|
|
13
12
|
requires-python = ">=3.10,<4.0"
|
|
14
13
|
dependencies = [
|
|
@@ -27,18 +26,17 @@ dependencies = [
|
|
|
27
26
|
"huggingface-hub>=0.30.1",
|
|
28
27
|
"pyinfer>=0.0.3",
|
|
29
28
|
"sentencepiece>=0.1.96",
|
|
30
|
-
"protobuf~=3.20.0",
|
|
31
29
|
"sacremoses>=0.1.1",
|
|
32
30
|
"more-itertools>=10.5.0",
|
|
33
31
|
"tenacity>=9.0.0",
|
|
34
|
-
"litellm>=1.
|
|
32
|
+
"litellm>=1.72.2",
|
|
35
33
|
"rouge-score>=0.1.2",
|
|
36
34
|
"bert-score>=0.3.13",
|
|
37
35
|
"levenshtein>=0.24.0",
|
|
38
36
|
"scikit-learn<1.6.0",
|
|
39
37
|
"setuptools>=75.8.2",
|
|
40
38
|
"demjson3>=3.0.6",
|
|
41
|
-
"ollama>=0.
|
|
39
|
+
"ollama>=0.5.1",
|
|
42
40
|
"peft>=0.15.0",
|
|
43
41
|
]
|
|
44
42
|
|
|
@@ -46,7 +44,7 @@ dependencies = [
|
|
|
46
44
|
generative = [
|
|
47
45
|
"outlines>=0.1.11",
|
|
48
46
|
"bitsandbytes>=0.43.1; platform_system == 'Linux'",
|
|
49
|
-
"vllm>=0.9.
|
|
47
|
+
"vllm>=0.9.1; platform_system == 'Linux'",
|
|
50
48
|
"fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
|
|
51
49
|
]
|
|
52
50
|
human_evaluation = [
|
|
@@ -55,7 +53,7 @@ human_evaluation = [
|
|
|
55
53
|
all = [
|
|
56
54
|
"outlines>=0.1.11",
|
|
57
55
|
"bitsandbytes>=0.43.1; platform_system == 'Linux'",
|
|
58
|
-
"vllm>=0.9.
|
|
56
|
+
"vllm>=0.9.1; platform_system == 'Linux'",
|
|
59
57
|
"fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
|
|
60
58
|
"gradio>=4.26.0",
|
|
61
59
|
]
|
|
@@ -150,6 +148,8 @@ ignore = [
|
|
|
150
148
|
"ANN101",
|
|
151
149
|
# Type annotations for "cls" arguments
|
|
152
150
|
"ANN102",
|
|
151
|
+
# Type annotations for *args
|
|
152
|
+
"ANN002",
|
|
153
153
|
# Type annotations for **kwargs
|
|
154
154
|
"ANN003",
|
|
155
155
|
# Docstrings for **kwargs
|
|
@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
378
378
|
tokenizer=self._tokenizer,
|
|
379
379
|
),
|
|
380
380
|
batched=True,
|
|
381
|
-
batch_size=
|
|
381
|
+
batch_size=10,
|
|
382
382
|
remove_columns=dataset["train"].column_names,
|
|
383
383
|
load_from_cache_file=False,
|
|
384
384
|
keep_in_memory=True,
|
|
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
389
389
|
tokenizer=self._tokenizer,
|
|
390
390
|
),
|
|
391
391
|
batched=True,
|
|
392
|
-
batch_size=
|
|
392
|
+
batch_size=10,
|
|
393
393
|
remove_columns=dataset["val"].column_names,
|
|
394
394
|
load_from_cache_file=False,
|
|
395
395
|
keep_in_memory=True,
|
|
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
400
400
|
tokenizer=self._tokenizer,
|
|
401
401
|
),
|
|
402
402
|
batched=True,
|
|
403
|
-
batch_size=
|
|
403
|
+
batch_size=10,
|
|
404
404
|
remove_columns=dataset["test"].column_names,
|
|
405
405
|
load_from_cache_file=False,
|
|
406
406
|
keep_in_memory=True,
|