EuroEval 15.15.0__tar.gz → 15.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/bug.yaml +1 -1
- {euroeval-15.15.0 → euroeval-15.16.0}/.github/workflows/ci.yaml +4 -2
- {euroeval-15.15.0 → euroeval-15.16.0}/.pre-commit-config.yaml +2 -2
- {euroeval-15.15.0 → euroeval-15.16.0}/CHANGELOG.md +17 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/PKG-INFO +3 -2
- {euroeval-15.15.0 → euroeval-15.16.0}/README.md +1 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/pyproject.toml +2 -2
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/litellm.py +155 -105
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/vllm.py +10 -3
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmarker.py +10 -11
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/finetuning.py +2 -1
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/metrics.py +6 -4
- {euroeval-15.15.0 → euroeval-15.16.0}/uv.lock +5 -5
- {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/.gitignore +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/CITATION.cff +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/CONTRIBUTING.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/Dockerfile.cuda +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/LICENSE +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/CNAME +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/README.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/README.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/danish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/dutch.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/english.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/faroese.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/finnish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/french.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/german.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/icelandic.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/italian.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/portuguese.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/spanish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/swedish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/faq.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/gfx/favicon.png +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/README.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/methodology.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/python-package.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/README.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/speed.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/summarization.md +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.png +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.xcf +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/gfx/scandeval.png +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/makefile +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/mkdocs.yaml +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/hf.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/cli.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/constants.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_models.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/danish.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/dutch.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/portuguese.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/enums.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation_utils.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/human_evaluation.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/languages.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_config.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/scores.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tasks.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tokenization_utils.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/types.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/utils.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/constants.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dane.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fone.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_harem.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multi_wiki_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norec.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norne.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_publico.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scala.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/versioning.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/conftest.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmarker.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_callbacks.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_cli.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_constants.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_models.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_enums.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_exceptions.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_finetuning.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_generation.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_languages.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_cache.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_config.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_loading.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_scores.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tasks.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tokenization_utils.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_types.py +0 -0
- {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_utils.py +0 -0
{euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/bug.yaml

@@ -55,7 +55,7 @@ body:
     attributes:
       label: EuroEval version
       description: What version of EuroEval are you using?
-      placeholder: Output of `pip list | grep
+      placeholder: Output of `pip list | grep euroeval`
     validations:
       required: true
   - type: input
{euroeval-15.15.0 → euroeval-15.16.0}/.github/workflows/ci.yaml

@@ -57,7 +57,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
@@ -66,6 +66,8 @@ jobs:
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

       - name: Delete EuroEval cache
         run: rm -rf .euroeval_cache
@@ -88,7 +90,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
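The workflow now launches `ollama serve` in the background right after the install script, so the test suite has a local Ollama endpoint available. A hedged Python sketch of a readiness check one could add before running the tests; the `/api/version` endpoint and the timeout are assumptions, not part of this diff:

```python
import time
import urllib.request


def wait_for_ollama(url: str = "http://localhost:11434/api/version", timeout: float = 60.0) -> None:
    """Poll the local Ollama HTTP endpoint until it answers or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2):
                return
        except OSError:
            time.sleep(1)
    raise TimeoutError(f"Ollama did not become ready at {url}")
```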
{euroeval-15.15.0 → euroeval-15.16.0}/.pre-commit-config.yaml

@@ -4,13 +4,13 @@ repos:
    hooks:
      - id: python-use-type-annotations
  - repo: https://github.com/pre-commit/pre-commit-hooks
-   rev:
+   rev: v6.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
-   rev: v0.12.
+   rev: v0.12.8
    hooks:
      - id: ruff
        args:
{euroeval-15.15.0 → euroeval-15.16.0}/CHANGELOG.md

@@ -10,6 +10,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v15.16.0] - 2025-08-12
+### Added
+- Added metadata for GPT-5 models.
+
+### Changed
+- Updated `transformers` dependency to `>=4.55.0`.
+
+### Fixed
+- If the model uses 'mxfp4' quantisation then we allow the dtype to be bfloat16, rather
+  than forcing float16. This caused issues with the new GPT-OSS models.
+- Prevent multiple `Model <model-id> does not exist` logs when evaluating a model
+  that does not exist - now only logs this once.
+- Cleaner error message when attempting to benchmark a generative model without having a
+  GPU available.
+- Now raises error if an inference API is used with a parameter that is not supported.
+
+
 ## [v15.15.0] - 2025-08-06
 ### Added
 - Added the common-sense reasoning dataset GoldenSwag for the following
{euroeval-15.15.0 → euroeval-15.16.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.16.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,7 +56,7 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers>=4.55.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
@@ -233,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
{euroeval-15.15.0 → euroeval-15.16.0}/README.md

@@ -159,6 +159,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
{euroeval-15.15.0 → euroeval-15.16.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.
+version = "15.16.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -14,7 +14,7 @@ dependencies = [
     "torch>=2.6.0",
     "pandas>=2.2.0",
     "numpy>=1.23.0,<2.0.0",
-    "transformers>=4.
+    "transformers>=4.55.0",
     "accelerate>=1.9.0",
     "evaluate>=0.4.1",
     "datasets>=3.5.0",
{euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/litellm.py

@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep

 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
     RateLimitError,
     ServiceUnavailableError,
     Timeout,
+    UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {

 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {

 NUM_PARAMS_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {

 ALLOWED_PARAMS = {
     # OpenAI models
+    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
     r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
             generative_type=self.generative_type,
         )

-        #
-
-            model=self.model_config.model_id,
-            max_completion_tokens=(
-                REASONING_MAX_TOKENS
-                if self.generative_type == GenerativeType.REASONING
-                else self.dataset_config.max_generated_tokens
-            ),
-            stop=[],
-            temperature=0.0,
-            seed=4242,
-            api_key=self.benchmark_config.api_key,
-            api_base=self.benchmark_config.api_base,
-            api_version=self.benchmark_config.api_version,
-            max_retries=3,
-        )
-
-        # Set up the `response_format` generation argument if we are dealing with a task
-        # using structured generation
+        # Sanity check that "JSON" is included in the prompt, as some models require
+        # this
         if self.dataset_config.task in TASKS_USING_JSON:
-            # Sanity check that "JSON" is included in the prompt, as some models require
-            # this
             for conversation in conversations:
                 if not conversation:
                     raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
                         "Prompt must contain 'json' for JSON tasks."
                     )

-        if self.generative_type == GenerativeType.REASONING:
-            log_once(
-                f"The model {self.model_config.model_id!r} is a reasoning model "
-                "and thus does not support structured generation, so we do not "
-                "enable it.",
-                level=logging.DEBUG,
-            )
-        elif supports_response_schema(model=self.model_config.model_id):
-            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-            keys_and_their_types: dict[str, t.Any] = {
-                tag_name: (conlist(str, max_length=5), ...)
-                for tag_name in ner_tag_names
-            }
-            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-            generation_kwargs["response_format"] = pydantic_class
-            log_once(
-                "Enabling structured generation for model "
-                f"{self.model_config.model_id!r} with the JSON schema "
-                f"{pydantic_class.model_json_schema()}",
-                level=logging.DEBUG,
-            )
-        else:
-            generation_kwargs["response_format"] = dict(type="json_object")
-            log_once(
-                "Enabling structured JSON generation for model "
-                f"{self.model_config.model_id!r} with no custom JSON schema, as "
-                "the model does not support schemas.",
-                level=logging.DEBUG,
-            )
-
-        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
-        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-            generation_kwargs["think"] = True
-            log_once(
-                "Enabling thinking mode for Ollama model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Handle manually set parameters
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-        if self.model_config.revision == "thinking":
-            generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-            )
-            log_once(
-                f"Enabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(budget_tokens=0)
-            log_once(
-                f"Disabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision in {"low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
-            log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Drop generation kwargs that are not supported by the model
-        litellm.drop_params = True
-
-        # First attempt is a test run with a single conversation to handle errors
-        # quickly
-        test_conversation = conversations[0]
-        _, failures = safe_run(
-            self._generate_async(
-                model_id=self.model_config.model_id,
-                conversations=[test_conversation],
-                **generation_kwargs,
-            )
-        )
-        for _, error in failures:
-            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
                 self._generate_async(
                     model_id=self.model_config.model_id,
                     conversations=list(batch_conversations),
-                    **
+                    **self.get_generation_kwargs(dataset_config=self.dataset_config),
                 )
             )

@@ -431,7 +336,12 @@ class LiteLLMModel(BenchmarkModule):
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(
+                self._handle_exception(
+                    error=error,
+                    generation_kwargs=self.get_generation_kwargs(
+                        dataset_config=self.dataset_config
+                    ),
+                )

             # Sleep for a second to avoid pinging the API server too quickly
             sleep(1)
@@ -484,6 +394,7 @@ class LiteLLMModel(BenchmarkModule):
             "`temperature` may only be set to 1",
             "'temperature' does not support 0.0 with this model. Only the default "
             "(1) value is supported",
+            "Only temperature=1 is supported",
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@ class LiteLLMModel(BenchmarkModule):
             )
             sleep(5)
             return
+        elif isinstance(error, UnsupportedParamsError):
+            unsupported_param_match = re.search(
+                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+                string=error.message,
+            )
+            if unsupported_param_match is None:
+                raise InvalidModel(error.message)
+            else:
+                unsupported_param = unsupported_param_match.group(0)
+                raise InvalidModel(
+                    f"The model {model_id!r} does not support the parameter "
+                    f"{unsupported_param!r}. Try again without this parameter. "
+                    "Skipping this model."
+                )
         elif isinstance(error, (APIConnectionError, OSError)):
             # If there are too many I/O connections, we increase the number of allowed
             # file descriptors
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

+    @cache
+    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+        """Get the generation arguments for the model.
+
+        Args:
+            dataset_config:
+                The dataset configuration, which is used to determine the generative
+                type of the model. We use this as an argument here rather than using
+                `self.dataset_config` to ensure that that the cache is updated when the
+                dataset configuration changes.
+
+        Returns:
+            The generation arguments for the model.
+        """
+        # Set the core generation arguments
+        generation_kwargs: dict[str, t.Any] = dict(
+            model=self.model_config.model_id,
+            max_completion_tokens=(
+                REASONING_MAX_TOKENS
+                if self.generative_type == GenerativeType.REASONING
+                else dataset_config.max_generated_tokens
+            ),
+            stop=[],
+            temperature=0.0,
+            seed=4242,
+            api_key=self.benchmark_config.api_key,
+            api_base=self.benchmark_config.api_base,
+            api_version=self.benchmark_config.api_version,
+            max_retries=3,
+        )
+
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
+        if dataset_config.task in TASKS_USING_JSON:
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif supports_response_schema(model=self.model_config.model_id):
+                ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
+
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly. We repeat this multiple times to deal with different types of
+        # errors, and stop if we get a successful response.
+        test_conversation = [
+            litellm.ChatCompletionUserMessage(role="user", content="Test message")
+        ]
+        for _ in range(5):
+            _, failures = safe_run(
+                self._generate_async(
+                    model_id=self.model_config.model_id,
+                    conversations=[test_conversation],
+                    **generation_kwargs,
+                )
+            )
+            if not failures:
+                break
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+        return generation_kwargs
+

 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
             msg += " No parameters are allowed."
             raise InvalidModel(msg)
         return
+    else:
+        raise InvalidModel(
+            f"The parameter {param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )


 def try_download_ollama_model(model_id: str) -> bool:
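The new `UnsupportedParamsError` branch pulls the offending parameter name out of LiteLLM's error message with the look-behind/look-ahead pattern shown above. A minimal sketch of how that pattern behaves, using a made-up error message (the exact wording produced by LiteLLM may differ):

```python
import re

# Hypothetical error text shaped the way the pattern expects; the real message
# comes from litellm's UnsupportedParamsError.
message = "gpt-5-mini does not support parameters: ['temperature']"

match = re.search(
    pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
    string=message,
)
print(match.group(0) if match else None)  # -> temperature
```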
{euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/vllm.py

@@ -168,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):

     def __del__(self) -> None:
         """Clean up the model and tokenizer."""
-
+        if importlib.util.find_spec("vllm") is not None:
+            clear_vllm()
         if hasattr(self, "_model"):
             del self._model
         if hasattr(self, "_tokenizer"):
@@ -690,8 +691,14 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16

-    # If the model is a quantized model, we need to
-    if quantization
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        logger.debug(
+            "You are loading a quantized model where `torch_dtype` has not been set. "
+            f"Setting dtype to {dtype!r}."
+        )
+    elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
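Read in isolation, the dtype rule added in this hunk amounts to the following. This is an illustrative sketch with a hypothetical helper name, not the actual `load_model_and_tokenizer` code:

```python
import torch


def pick_vllm_dtype(quantization: str | None, configured_dtype: "torch.dtype | None") -> torch.dtype:
    """Sketch of the new rule: mxfp4-quantised models (e.g. the GPT-OSS checkpoints)
    may keep bfloat16 when no dtype was configured, while other quantised models
    are still forced to float16 for vLLM."""
    if quantization == "mxfp4" and configured_dtype is None:
        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    if quantization is not None:
        return torch.float16
    # Unquantised models simply keep the configured dtype (assumed fallback here).
    return configured_dtype if configured_dtype is not None else torch.float16
```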
{euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmarker.py

@@ -379,7 +379,16 @@ class Benchmarker:

         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-
+            # Load the model configuration, or skip the model if it is invalid
+            try:
+                model_config = get_model_config(
+                    model_id=model_id, benchmark_config=benchmark_config
+                )
+            except InvalidModel as e:
+                logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
+                continue
+
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@ class Benchmarker:
                     num_finished_benchmarks += 1
                     continue

-                if model_config is None:
-                    try:
-                        model_config = get_model_config(
-                            model_id=model_id, benchmark_config=benchmark_config
-                        )
-                    except InvalidModel as e:
-                        logger.info(e.message)
-                        num_finished_benchmarks += len(dataset_configs)
-                        continue
-
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
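Taken together, the two hunks above move model-configuration loading out of the dataset loop: the configuration is resolved once per model, an invalid model is reported a single time, and all of its datasets are skipped at once. A schematic of that control flow, with hypothetical names rather than the actual Benchmarker internals:

```python
def run_benchmarks(model_ids, dataset_configs, get_model_config, benchmark):
    """Minimal sketch of the restructured loop; not EuroEval's actual code."""
    num_finished = 0
    for model_id in model_ids:
        # Resolve the model configuration up front; on failure, report once and
        # skip every dataset for this model.
        try:
            model_config = get_model_config(model_id)
        except ValueError as err:  # stand-in for EuroEval's InvalidModel
            print(err)
            num_finished += len(dataset_configs)
            continue
        for dataset_config in dataset_configs:
            benchmark(model_config, dataset_config)
            num_finished += 1
    return num_finished
```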
{euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/finetuning.py

@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial

 import torch
 from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
-        compute_metrics=model.compute_metrics,
+        compute_metrics=partial(model.compute_metrics, dataset=None),
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
         preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
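The switch to `partial(model.compute_metrics, dataset=None)` keeps the callable handed to the `transformers` Trainer a one-argument function while satisfying the metric interface's new `dataset: "Dataset | None"` parameter. A small sketch of the pattern with generic names, not EuroEval's classes:

```python
from functools import partial


def compute_metrics(eval_pred, dataset=None):
    # `dataset` is optional extra context; during finetuning it is simply None.
    predictions, references = eval_pred
    return {"accuracy": sum(p == r for p, r in zip(predictions, references)) / len(references)}


# The Trainer calls compute_metrics(eval_pred), so the extra keyword is pre-bound:
metric_fn = partial(compute_metrics, dataset=None)
print(metric_fn(([1, 0, 1], [1, 1, 1])))  # -> accuracy of roughly 0.67
```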
{euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/metrics.py

@@ -51,7 +51,7 @@ class Metric(abc.ABC):

     @abc.abstractmethod
     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -132,7 +132,7 @@ class HuggingFaceMetric(Metric):
         self.metric: "EvaluationModule | None" = None

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -225,7 +225,7 @@ class LLMAsAJudgeMetric(Metric):
         self.system_prompt = system_prompt

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score using the judge model.

@@ -359,7 +359,9 @@ class SpeedMetric(Metric):
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
         )

-    def __call__(
+    def __call__(
+        self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+    ) -> float | None:
         """Not used with the speed metric, but required for consistency."""
         raise NotImplementedError

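With `dataset` now typed as `Dataset | None` across the metric classes, a metric implementation only needs to tolerate a missing dataset. An illustrative metric following the updated signature, not one of EuroEval's built-in metrics:

```python
import typing as t


class ExactMatchMetric:
    """Toy metric matching the updated __call__ signature, where `dataset` may
    be None (as it is when metrics are computed during finetuning)."""

    def __call__(
        self, predictions: t.Sequence, references: t.Sequence, dataset: t.Any | None = None
    ) -> float | None:
        if not references:
            return None
        return sum(p == r for p, r in zip(predictions, references)) / len(references)


print(ExactMatchMetric()([1, 2, 3], [1, 2, 0]))  # -> roughly 0.67
```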
{euroeval-15.15.0 → euroeval-15.16.0}/uv.lock

@@ -1123,7 +1123,7 @@ wheels = [

 [[package]]
 name = "euroeval"
-version = "15.
+version = "15.16.0"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -1246,7 +1246,7 @@ requires-dist = [
     { name = "tenacity", specifier = ">=9.0.0" },
     { name = "termcolor", specifier = ">=2.0.0" },
     { name = "torch", specifier = ">=2.6.0" },
-    { name = "transformers", specifier = ">=4.
+    { name = "transformers", specifier = ">=4.55.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.10.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.10.0" },
 ]
@@ -5376,7 +5376,7 @@ wheels = [

 [[package]]
 name = "transformers"
-version = "4.
+version = "4.55.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -5390,9 +5390,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/
+sdist = { url = "https://files.pythonhosted.org/packages/27/5d/f7dc746eef83336a6b34197311fe0c1da0d1192f637c726c6a5cf0d83502/transformers-4.55.0.tar.gz", hash = "sha256:15aa138a05d07a15b30d191ea2c45e23061ebf9fcc928a1318e03fe2234f3ae1", size = 9569089, upload-time = "2025-08-05T16:13:48.997Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/
+    { url = "https://files.pythonhosted.org/packages/1c/93/bcb22fb52ed65084c0199270832aa4cdd4b41296d896f3e7ade188bccb68/transformers-4.55.0-py3-none-any.whl", hash = "sha256:29d9b8800e32a4a831bb16efb5f762f6a9742fef9fce5d693ed018d19b106490", size = 11267905, upload-time = "2025-08-05T16:13:34.814Z" },
 ]

 [[package]]