EuroEval 15.4.0__tar.gz → 15.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- {euroeval-15.4.0 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/bug.yaml +17 -2
- {euroeval-15.4.0 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +1 -11
- {euroeval-15.4.0 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +21 -10
- {euroeval-15.4.0 → euroeval-15.4.2}/.github/workflows/ci.yaml +0 -2
- {euroeval-15.4.0 → euroeval-15.4.2}/CHANGELOG.md +54 -1
- {euroeval-15.4.0 → euroeval-15.4.2}/PKG-INFO +4 -3
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/danish.md +4 -5
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/french.md +2 -2
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/spanish.md +1 -1
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/swedish.md +4 -5
- {euroeval-15.4.0 → euroeval-15.4.2}/pyproject.toml +4 -4
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/hf.py +68 -37
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/vllm.py +47 -8
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/constants.py +3 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/data_models.py +7 -2
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/dataset_configs.py +5 -5
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/generation.py +17 -3
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/sequence_classification.py +35 -10
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/types.py +3 -3
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/utils.py +32 -29
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_mlsum_de.py +1 -1
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_mlsum_es.py +1 -1
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_constants.py +1 -1
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_utils.py +0 -11
- {euroeval-15.4.0 → euroeval-15.4.2}/uv.lock +478 -440
- {euroeval-15.4.0 → euroeval-15.4.2}/.gitignore +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/.pre-commit-config.yaml +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/CITATION.cff +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/CONTRIBUTING.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/Dockerfile.cuda +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/LICENSE +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/README.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/CNAME +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/README.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/README.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/dutch.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/english.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/faroese.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/german.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/icelandic.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/italian.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/faq.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/gfx/favicon.png +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/leaderboards/README.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/methodology.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/python-package.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/README.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/speed.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/docs/tasks/summarization.md +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/gfx/euroeval.png +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/gfx/euroeval.xcf +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/gfx/scandeval.png +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/makefile +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/mkdocs.yaml +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmark_modules/litellm.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/benchmarker.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/cli.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/data_loading.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/enums.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/finetuning.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/human_evaluation.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/languages.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/model_config.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/scores.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/multiple_choice_classification.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/question_answering.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/text_to_text.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/task_utils/token_classification.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/euroeval/tasks.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/constants.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_arc.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_dane.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_dutch_social.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_fone.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_norec.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_norne.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_scala.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_squad.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/src/scripts/versioning.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/conftest.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_benchmarker.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_callbacks.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_cli.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_data_loading.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_data_models.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_enums.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_exceptions.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_finetuning.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_generation.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_languages.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_model_cache.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_model_config.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_model_loading.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_scores.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_tasks.py +0 -0
- {euroeval-15.4.0 → euroeval-15.4.2}/tests/test_types.py +0 -0
.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml

@@ -2,6 +2,7 @@ name: 📚 Benchmark Dataset Request
 description: Do you think a particular benchmark dataset is missing in EuroEval?
 title: "[BENCHMARK DATASET REQUEST] <dataset-name>"
 labels: "benchmark dataset request"
+type: task

 body:
   - type: input
@@ -30,6 +31,7 @@ body:
         - label: Icelandic
         - label: Italian
         - label: Norwegian (Bokmål or Nynorsk)
+        - label: Spanish
         - label: Swedish
     validations:
       required: true
.github/ISSUE_TEMPLATE/bug.yaml

@@ -1,7 +1,7 @@
 name: 🐛 Bug Report
 description: Have you experienced a bug using the `euroeval` package?
 title: "[BUG] <name-of-bug>"
-
+type: bug

 body:
   - type: markdown
@@ -46,8 +46,9 @@ body:
         - 3.10.x
         - 3.11.x
         - 3.12.x
+        - 3.13.x
         - Older than 3.10.x
-        - Newer than 3.12.x
+        - Newer than 3.13.x
     validations:
       required: true
   - type: input
@@ -57,6 +58,20 @@ body:
       placeholder: Output of `pip list | grep EuroEval`
     validations:
       required: true
+  - type: input
+    attributes:
+      label: Transformers version
+      description: What version of 🤗 transformers are you using?
+      placeholder: Output of `pip list | grep transformers`
+    validations:
+      required: true
+  - type: input
+    attributes:
+      label: vLLM version
+      description: What version of vLLM are you using?
+      placeholder: Output of `pip list | grep vllm`
+    validations:
+      required: true
   - type: markdown
     attributes:
       value: >
.github/ISSUE_TEMPLATE/feature_request.yaml

@@ -1,7 +1,7 @@
 name: 🚀 Feature Request
 description: Is the EuroEval benchmark missing a feature?
 title: "[FEATURE REQUEST] <name-of-feature>"
-
+type: feature

 body:
   - type: textarea
@@ -11,16 +11,6 @@ body:
       A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*.
     validations:
       required: true
-  - type: textarea
-    attributes:
-      label: Alternatives
-      description: >
-        A description of any alternative solutions or features you've considered, if any.
-  - type: textarea
-    attributes:
-      label: Additional context
-      description: >
-        Add any other context or screenshots about the feature request.
   - type: markdown
     attributes:
       value: >
.github/ISSUE_TEMPLATE/model_evaluation_request.yaml

@@ -2,6 +2,7 @@ name: 📊 Model Evaluation Request
 description: Would you like to have a particular model included in the leaderboards?
 title: "[MODEL EVALUATION REQUEST] <model-name>"
 labels: "model evaluation request"
+type: task

 body:
   - type: input
@@ -10,16 +11,6 @@ body:
      description: What is the Hugging Face model ID?
     validations:
       required: true
-  - type: dropdown
-    attributes:
-      label: Model type
-      description: What is the architecture of the model?
-      options:
-        - Decoder model (e.g., GPT)
-        - Encoder model (e.g., BERT)
-        - Sequence-to-sequence model (e.g., T5)
-    validations:
-      required: true
   - type: checkboxes
     attributes:
       label: Evaluation languages
@@ -36,9 +27,29 @@ body:
         - label: Icelandic
         - label: Italian
         - label: Norwegian (Bokmål or Nynorsk)
+        - label: Spanish
         - label: Swedish
     validations:
       required: true
+  - type: dropdown
+    attributes:
+      label: Model type
+      description: What is the architecture of the model?
+      options:
+        - Decoder model (e.g., GPT)
+        - Encoder model (e.g., BERT)
+        - Sequence-to-sequence model (e.g., T5)
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Model size
+      description: What is the size of the model?
+      options:
+        - Small (<=8B parameters)
+        - Large (>8B parameters)
+    validations:
+      required: true
   - type: dropdown
     attributes:
       label: Merged model
.github/workflows/ci.yaml

@@ -43,7 +43,6 @@ jobs:
       - name: Install uv and set up Python
         uses: astral-sh/setup-uv@v4
         with:
-          enable-cache: true
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
@@ -75,7 +74,6 @@ jobs:
       - name: Install uv and set up Python
         uses: astral-sh/setup-uv@v4
         with:
-          enable-cache: true
           python-version: ${{ matrix.python-version }}

       - name: Install Dependencies
CHANGELOG.md

@@ -10,6 +10,59 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).



+## [v15.4.2] - 2025-03-31
+### Added
+- Now added version metadata to results, to easier track which versions of the various
+  dependencies were used when evaluating a model. This currently includes
+  `transformers`, `torch`, `vllm` and `outlines`.
+
+### Changed
+- Changed the name of the German 'mlsum' summarisation dataset to 'mlsum-de', to reflect
+  that it is the German version of the dataset, and to avoid confusion with the Spanish
+  'mlsum-es' dataset.
+
+### Fixed
+- Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
+  compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+- Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
+  [@Alkarex](https://github.com/Alkarex) ✨
+- Evaluating a specific model revision did not work for adapter models, as there was a
+  confusion between the revision of the adapter and the revision of the base model. We
+  now use the revision for the adapter and use the latest revision for the base model.
+- In the (very unlikely) scenario that the model's tokeniser has the same first token
+  for two different labels in a text classification task, we now also use the second
+  token to ensure that we determine the correct label. If this is not possible, then we
+  warn the user.
+- Now catches `TypeError` when trying to generate with vLLM, and retries 3 times before
+  giving up on evaluating the dataset.
+- A bug in `transformers` caused models with the `image-text-to-text` pipeline tag to
+  not be detected as generative models. This has been patched now, and will be fixed
+  properly when [this transformers
+  PR](https://github.com/huggingface/transformers/pull/37107) has been merged.
+- Force `vllm` v0.8.0 for now, as the severe degradation in generation output of some
+  models has not been resolved in versions v0.8.2 and v0.8.3.
+- Only accepts the local labels for text classification tasks when evaluating decoder
+  models now, where we before accepted both the local and English labels. The reason is
+  that this caused a confusion at times when there was a unique local label starting
+  with a particular letter, but a different English label starting with the same letter,
+  causing some models to be evaluated on the wrong label.
+- When fetching the model information from the Hugging Face API we now attempt 3 times,
+  as the API sometimes fails. If it still fails after 3 attempts, we raise the
+  `HuggingFaceHubDown` exception.
+- Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
+  compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+- Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
+  the splits were made by considering the original train/validation/test splits.
+
+
+## [v15.4.1] - 2025-03-25
+### Fixed
+- Disallow `vllm` v0.8.1, as it causes severe degradation in generation output of
+  some models, resulting in artificially low scores.
+- Fixed an issue with text classification tasks if the first token of multiple labels
+  are identical, when tokenising with the model's tokeniser.
+
+
 ## [v15.4.0] - 2025-03-24
 ### Added
 - Added support for Spanish! 🇪🇸 This includes two reading comprehension datasets:
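The version-metadata entry in the v15.4.2 section above is easy to picture in code. Below is a minimal sketch, not EuroEval's actual implementation, of collecting such dependency versions with the standard library; the function name is hypothetical:

```python
from importlib.metadata import PackageNotFoundError, version


def collect_dependency_versions(
    packages: tuple[str, ...] = ("transformers", "torch", "vllm", "outlines"),
) -> dict[str, str | None]:
    """Map each dependency to its installed version (None if not installed)."""
    versions: dict[str, str | None] = {}
    for package in packages:
        try:
            versions[package] = version(package)
        except PackageNotFoundError:
            versions[package] = None
    return versions


# Example: attach the collected versions to an evaluation results record.
results = {"model_id": "<model-id>", **collect_dependency_versions()}
```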
CHANGELOG.md (continued)

@@ -203,7 +256,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

 ### Added
 - Added support for French! 🇫🇷 This includes the sentiment classification dataset
-  [
+  [AlloCiné](https://hf.co/datasets/tblard/allocine), the linguistic acceptability
   dataset ScaLA with the [French Universal
   Dependencies](https://github.com/UniversalDependencies/UD_French-GSD), the reading
   comprehension dataset [FQuAD](https://hf.co/datasets/illuin/fquad) (and unofficially
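The first-token label fix described in the v15.4.1 and v15.4.2 entries is subtle, so here is a minimal sketch of how such a collision can be detected, assuming the 🤗 `transformers` tokeniser API; the model and labels are hypothetical stand-ins, not EuroEval's actual code:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
labels = ["positiv", "positivt"]  # two labels that may share a first token

# Token IDs per label, without special tokens.
ids = {label: tokenizer(label, add_special_tokens=False).input_ids for label in labels}

first_tokens = [token_ids[0] for token_ids in ids.values()]
if len(set(first_tokens)) < len(labels):
    # First tokens collide, so compare second tokens instead; if a label has
    # no second token, disambiguation fails and the user should be warned.
    second_tokens = {
        label: token_ids[1] if len(token_ids) > 1 else None
        for label, token_ids in ids.items()
    }
    print(f"First-token collision; falling back to second tokens: {second_tokens}")
```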
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
docs/datasets/danish.md

@@ -285,11 +285,10 @@ the translated contexts still contained the answer to the question, potentially
 changing the answers slightly.

 The original full dataset consists of 6,810 / 500 / 500 samples for training,
-validation and testing, respectively
-
-
-
-was sampled from the original training set.
+validation and testing, respectively (so 3,328 samples used in total).
+We use a 1,024 / 256 / 2,048 split for training, validation and testing, respectively,
+where the splits are made by randomly sampling from the full dataset without considering
+the original train/validation/test splits.

 Here are a few examples from the training split:
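The split strategy in the corrected text is simple to sketch: pool every sample and draw the 1,024 / 256 / 2,048 splits at random, ignoring the original boundaries. A stand-in illustration (the seed and the integer stand-ins for samples are hypothetical):

```python
import random

# Stand-in for the 7,810 pooled samples (6,810 + 500 + 500).
full_dataset = list(range(7_810))

rng = random.Random(4242)  # hypothetical seed
rng.shuffle(full_dataset)

# 1,024 / 256 / 2,048 split, i.e. 3,328 samples used in total.
train = full_dataset[:1_024]
val = full_dataset[1_024:1_280]
test = full_dataset[1_280:3_328]
```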
docs/datasets/french.md

@@ -7,11 +7,11 @@ information about what these constitute.

 ## Sentiment Classification

-### 
+### AlloCiné

 This dataset was published in [this Github
 repository](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) and
-features reviews from the French movie review website
+features reviews from the French movie review website [AlloCiné](https://www.allocine.fr/). The reviews range from
 0.5 to 5 (inclusive), with steps of 0.5. The negative samples are reviews with a rating
 of at most 2, and the positive ones are reviews with a rating of at least 4. The reviews
 in between were discarded.
docs/datasets/spanish.md

@@ -475,7 +475,7 @@ $ euroeval --model <model-id> --dataset hellaswag-es

 ## Summarization

-### MLSum-es
+### MLSum-es

 The dataset was published in [this paper](https://aclanthology.org/2020.emnlp-main.647/) and is obtained from online newspapers.
docs/datasets/swedish.md

@@ -231,11 +231,10 @@ the translated contexts still contained the answer to the question, potentially
 changing the answers slightly.

 The original full dataset consists of 6,810 / 500 / 500 samples for training,
-validation and testing, respectively
-
-
-
-was sampled from the original training set.
+validation and testing, respectively (so 3,328 samples used in total).
+We use a 1,024 / 256 / 2,048 split for training, validation and testing, respectively,
+where the splits are made by randomly sampling from the full dataset without considering
+the original train/validation/test splits.

 Here are a few examples from the training split:
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.4.0"
+version = "15.4.2"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -39,13 +39,14 @@ dependencies = [
     "setuptools>=75.8.2",
     "demjson3>=3.0.6",
     "ollama>=0.4.7",
+    "peft>=0.15.0",
 ]

 [project.optional-dependencies]
 generative = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm
+    "vllm==0.8.0; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 human_evaluation = [
@@ -54,7 +55,7 @@ human_evaluation = [
 all = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm
+    "vllm==0.8.0; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
     "gradio>=4.26.0",
 ]
@@ -86,7 +87,6 @@ dev-dependencies = [
     "nbstripout>=0.7.1",
     "coverage>=5.5",
     "lxml>=5.1.0",
-    "peft>=0.13.2",
     "mkdocs-material>=9.5.45",
     "mkdocs-include-markdown-plugin>=7.0.1",
     "mkdocs-include-dir-to-nav>=1.2.0",
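The `; platform_system == 'Linux'` suffixes above are PEP 508 environment markers, evaluated per machine at install time, which is how the vLLM pin applies only on Linux. They can be tested directly with the `packaging` library (a sketch; `packaging` is not part of EuroEval itself):

```python
from packaging.markers import Marker

# The marker guarding the vLLM pin above: the requirement is skipped elsewhere.
marker = Marker("platform_system == 'Linux'")
print(marker.evaluate())  # True on Linux, False on e.g. macOS or Windows
```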
src/euroeval/benchmark_modules/hf.py

@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
     HFValidationError,
     LocalTokenNotFoundError,
 )
+from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
 from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
     Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
     get_class_by_name,
     get_eos_token,
     internet_connection_available,
+    log_once,
 )
 from .base import BenchmarkModule
@@ -727,53 +732,54 @@ def get_model_repo_info(
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub
     if model_info is None:
-        try:
-            model_info = hf_api.model_info(
-                repo_id=model_id, revision=revision, token=token
-            )
-        except (GatedRepoError, LocalTokenNotFoundError) as e:
+        num_attempts = 3
+        for _ in range(num_attempts):
             try:
-                hf_whoami(token=token)
-                logger.warning(
-                    f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                model_info = hf_api.model_info(
+                    repo_id=model_id, revision=revision, token=token
                 )
+                break
+            except (GatedRepoError, LocalTokenNotFoundError) as e:
+                try:
+                    hf_whoami(token=token)
+                    logger.warning(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}."
+                    )
+                    return None
+                except LocalTokenNotFoundError:
+                    raise NeedsAdditionalArgument(
+                        cli_argument="--api-key",
+                        script_argument="api_key=<your-api-key>",
+                        run_with_cli=benchmark_config.run_with_cli,
+                    )
+            except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except LocalTokenNotFoundError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=benchmark_config.run_with_cli,
-                )
-        except (RepositoryNotFoundError, HFValidationError):
-            return None
-        except (OSError, RequestException):
-            if internet_connection_available():
-                raise HuggingFaceHubDown()
-            else:
+            except (OSError, RequestException):
+                if internet_connection_available():
+                    continue
                 raise NoInternetConnection()
+        else:
+            raise HuggingFaceHubDown()

     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
     tags = model_info.tags or list()
-    has_base_model_tag = any(
-        tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
-    )
     base_model_id: str | None = None
-
-
-
-
+    has_adapter_config = model_info.siblings is not None and any(
+        sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+    )
+    if has_adapter_config:
+        adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+        base_model_id = adapter_config.base_model_name_or_path
+        log_once(
+            f"Model {model_id!r} identified as an adapter model, with base model "
+            f"{base_model_id!r}.",
+            level=logging.DEBUG,
         )
-    if has_base_model_tag:
-        base_model_id = [
-            tag.split(":")[1]
-            for tag in tags
-            if tag.startswith("base_model:") and tag.count(":") == 1
-        ][0]
+    if base_model_id is not None:
         base_model_info = hf_api.model_info(
             repo_id=base_model_id,
-            revision=revision,
             token=benchmark_config.api_key
             or os.getenv("HUGGINGFACE_API_KEY")
             or True,
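Both this retry loop and the vLLM generation retry further down rely on Python's `for`/`else`: the `else` branch runs only when the loop finishes without a `break`, i.e. when every attempt failed. A self-contained sketch of the idiom, with a hypothetical flaky call standing in for the Hub request:

```python
import random


def flaky_call() -> str:
    """Stand-in for an unreliable network call (hypothetical)."""
    if random.random() < 0.5:
        raise OSError("transient failure")
    return "ok"


num_attempts = 3
for _ in range(num_attempts):
    try:
        result = flaky_call()
        break  # success: the for-loop's else branch is skipped
    except OSError:
        continue  # transient error: try the next attempt
else:
    # Reached only when no break occurred, i.e. all attempts failed.
    raise RuntimeError(f"Gave up after {num_attempts} attempts.")
```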
src/euroeval/benchmark_modules/hf.py (continued)

@@ -781,12 +787,18 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
         tags = list(set(tags))

+    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+    # when this PR has been merged in and published:
+    # https://github.com/huggingface/transformers/pull/37107
+    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
     if pipeline_tag is None:
         hf_config = load_hf_model_config(
-            model_id=model_id,
+            model_id=base_model_id or model_id,
             num_labels=0,
             id2label=dict(),
             label2id=dict(),
@@ -812,7 +824,6 @@ def get_model_repo_info(
         pipeline_tag = "fill-mask"

     if benchmark_config.only_allow_safetensors:
-        # Check if any file ends with .safetensors
         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
         if not has_safetensors:
@@ -826,6 +837,26 @@ def get_model_repo_info(
             )
             raise InvalidModel(msg)

+        # Also check base model if we are evaluating an adapter
+        if base_model_id is not None:
+            base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+            base_has_safetensors = any(
+                f.endswith(".safetensors") for f in base_repo_files
+            )
+            if not base_has_safetensors:
+                msg = (
+                    f"Base model {base_model_id} does not have safetensors weights "
+                    "available."
+                )
+                if benchmark_config.run_with_cli:
+                    msg += " Skipping since the `--only-allow-safetensors` flag is set."
+                else:
+                    msg += (
+                        " Skipping since the `only_allow_safetensors` argument is set "
+                        "to `True`."
+                    )
+                raise InvalidModel(msg)
+
     return HFModelInfo(
         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
     )
src/euroeval/benchmark_modules/vllm.py

@@ -30,6 +30,7 @@ from ..constants import (
     REASONING_MAX_TOKENS,
     TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -846,13 +864,16 @@ def load_model_and_tokenizer(
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )

     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16

+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
@@ -916,7 +954,7 @@ def load_model_and_tokenizer(
         max_model_len=min(true_max_model_len, 5_000),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@ def load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
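`get_min_cuda_compute_capability` is imported from `src/euroeval/utils.py`, whose diff is not shown on this page. A minimal sketch of what such a helper can look like, assuming the standard `torch.cuda` API rather than EuroEval's actual implementation:

```python
import torch


def get_min_cuda_compute_capability() -> float | None:
    """Lowest compute capability across visible CUDA devices, or None without CUDA."""
    if not torch.cuda.is_available():
        return None
    capabilities = [
        float("{}.{}".format(*torch.cuda.get_device_capability(device)))
        for device in range(torch.cuda.device_count())
    ]
    return min(capabilities)
```

With the constant defined below set to 8.0, pre-Ampere devices such as a T4 (compute capability 7.5) trigger the fall-back from `bf16` to `fp16`.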
src/euroeval/constants.py

@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]

 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0