EuroEval 15.5.0.tar.gz → 15.6.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- {euroeval-15.5.0 → euroeval-15.6.0}/.pre-commit-config.yaml +1 -1
- {euroeval-15.5.0 → euroeval-15.6.0}/CHANGELOG.md +35 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/PKG-INFO +30 -9
- {euroeval-15.5.0 → euroeval-15.6.0}/README.md +24 -3
- {euroeval-15.5.0 → euroeval-15.6.0}/makefile +3 -4
- {euroeval-15.5.0 → euroeval-15.6.0}/pyproject.toml +16 -5
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/base.py +3 -2
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/fresh.py +8 -6
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py +33 -31
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/litellm.py +120 -56
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/vllm.py +41 -26
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmarker.py +23 -21
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/callbacks.py +2 -2
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/constants.py +1 -1
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/data_models.py +257 -42
- euroeval-15.6.0/src/euroeval/dataset_configs/__init__.py +61 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/danish.py +120 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/dutch.py +123 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/english.py +88 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/faroese.py +53 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/french.py +83 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/german.py +91 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/icelandic.py +148 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/italian.py +81 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/norwegian.py +178 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/spanish.py +78 -0
- euroeval-15.6.0/src/euroeval/dataset_configs/swedish.py +100 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/exceptions.py +10 -10
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/finetuning.py +6 -10
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/generation.py +1 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/human_evaluation.py +2 -2
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/languages.py +20 -13
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_cache.py +1 -1
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_loading.py +1 -12
- euroeval-15.6.0/src/euroeval/prompt_templates/__init__.py +8 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval-15.6.0/src/euroeval/prompt_templates/summarization.py +97 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/speed_benchmark.py +1 -1
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/multiple_choice_classification.py +19 -11
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/question_answering.py +31 -30
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/sequence_classification.py +1 -1
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/text_to_text.py +1 -1
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/token_classification.py +3 -2
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/tasks.py +54 -0
- euroeval-15.5.0/src/euroeval/utils.py → euroeval-15.6.0/src/euroeval/tokenization_utils.py +8 -339
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/types.py +3 -1
- euroeval-15.6.0/src/euroeval/utils.py +329 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/conftest.py +4 -4
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmarker.py +13 -33
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_callbacks.py +2 -1
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_data_loading.py +2 -2
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_finetuning.py +2 -1
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_loading.py +4 -4
- euroeval-15.5.0/tests/test_utils.py → euroeval-15.6.0/tests/test_tokenization_utils.py +3 -68
- euroeval-15.6.0/tests/test_utils.py +67 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/uv.lock +302 -275
- euroeval-15.5.0/src/euroeval/dataset_configs.py +0 -2408
- {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/.github/workflows/ci.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/.gitignore +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/CITATION.cff +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/CONTRIBUTING.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/Dockerfile.cuda +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/LICENSE +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/CNAME +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/README.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/README.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/danish.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/dutch.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/english.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/faroese.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/french.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/german.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/icelandic.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/italian.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/spanish.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/swedish.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/faq.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/gfx/favicon.png +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/README.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/methodology.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/python-package.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/README.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/speed.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/summarization.md +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/gfx/euroeval.png +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/gfx/euroeval.xcf +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/gfx/scandeval.png +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/mkdocs.yaml +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/cli.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/data_loading.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/enums.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_config.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/scores.py +0 -0
- {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/constants.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_arc.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dane.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dbrd.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dutch_social.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fone.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norec.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norne.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_scala.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/versioning.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_cli.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_constants.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_data_models.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_enums.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_exceptions.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_generation.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_languages.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_cache.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_config.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_scores.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_tasks.py +0 -0
- {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_types.py +0 -0
{euroeval-15.5.0 → euroeval-15.6.0}/CHANGELOG.md

@@ -10,6 +10,41 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 
 
+## [v15.6.0] - 2025-04-13
+### Added
+- We now support specifying custom inference providers when benchmarking via the Hugging
+  Face inference APIs. This can be done by specifying the model as
+  `huggingface/<inference-provider>/<organisation>/<model>`, as described in [these
+  LiteLLM docs](https://docs.litellm.ai/docs/providers/huggingface).
+
+### Changed
+- Updated `transformers` to `>=4.51.0`, which includes support for Llama-4, Phi-4,
+  Deepseek-v3 and Qwen3. This also includes the `image-text-to-text` pipeline tag
+  properly, so that we do not have to use a custom fix for it anymore.
+- Updated `vllm` to `>=0.8.3`, which includes support for Llama-4.
+- Set the maximum number of logprobs for generative models to 8, as that is the upper
+  bound for xAI models.
+- When benchmarking Ollama models, if the model is not found, we now also check if the
+  model exists when prefixed with 'hf.co/'.
+- Uniformised the prompt templates used for each task, so that they are more
+  consistent across tasks. Evaluation tests across different model types and sizes show
+  no significant performance difference between the new and old templates. This was
+  contributed by [@viggo-gascou](https://github.com/viggo-gascou) ✨
+
+### Fixed
+- Avoid duplicate error messages when a rate limit occurs.
+- ModernBERT models cannot be used on a CPU, which caused an error in our check for
+  maximal context length. In this case we simply skip this check and use the reported
+  maximal context length as-is.
+- Fixed an issue with benchmarking multiple generative models in the same evaluation
+  command, caused by vLLM and Ray not being able to release GPU memory properly; the
+  memory now appears to be released correctly.
+- Now only logs when encoder models are being benchmarked on generative tasks if the
+  `--verbose` flag is set (or `verbose=True` in the `Benchmarker` API).
+- All Spanish NER datasets were mistakenly marked as unofficial. The `conll-es`
+  dataset is now marked as official.
+
+
 ## [v15.5.0] - 2025-04-07
 ### Added
 - Now allows supplying a parameter to API models, which is done by using
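The new model-ID scheme from the `Added` entry above plugs into the existing benchmarking API unchanged. A minimal sketch, assuming the documented `Benchmarker.benchmark(model=..., dataset=...)` entry point; the provider and model names are illustrative, not taken from the diff:

```python
from euroeval import Benchmarker

benchmarker = Benchmarker()

# Route the evaluation through a specific Hugging Face inference provider,
# following the `huggingface/<inference-provider>/<organisation>/<model>`
# scheme described in the changelog entry above.
benchmarker.benchmark(
    model="huggingface/together/meta-llama/Llama-3.3-70B-Instruct",  # illustrative
    dataset="angry-tweets",
)
```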
{euroeval-15.5.0 → euroeval-15.6.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.5.0
+Version: 15.6.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -35,7 +35,7 @@ Requires-Dist: click>=8.1.3
 Requires-Dist: datasets>=2.15.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
-Requires-Dist: huggingface-hub>=0.
+Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
 Requires-Dist: litellm>=1.63.0
 Requires-Dist: more-itertools>=10.5.0
@@ -56,18 +56,18 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.
+Requires-Dist: transformers>=4.51.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -89,7 +89,7 @@ ______________________________________________________________________
 [](https://arxiv.org/abs/2406.13469)
 [](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [](https://github.com/EuroEval/EuroEval/commits/main)
-[](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
 
@@ -206,7 +206,9 @@ sentiment-classification`.
 
 
 ### Reproducing the datasets
-All datasets used in this project are generated using the scripts located in the
+All datasets used in this project are generated using the scripts located in the
+[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+with the following command
 
 ```shell
 $ uv run src/scripts/<name-of-script>.py
@@ -218,8 +220,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
-
-
+## Contributors :pray:
+
+A huge thank you to all the contributors who have helped make this project a success!
+
+<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+### Special Thanks
+- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
 - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
{euroeval-15.5.0 → euroeval-15.6.0}/README.md

@@ -13,7 +13,7 @@ ______________________________________________________________________
 [](https://arxiv.org/abs/2406.13469)
 [](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [](https://github.com/EuroEval/EuroEval/commits/main)
-[](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
 
@@ -130,7 +130,9 @@ sentiment-classification`.
 
 
 ### Reproducing the datasets
-All datasets used in this project are generated using the scripts located in the
+All datasets used in this project are generated using the scripts located in the
+[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+with the following command
 
 ```shell
 $ uv run src/scripts/<name-of-script>.py
@@ -142,8 +144,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
+## Contributors :pray:
 
-
+A huge thank you to all the contributors who have helped make this project a success!
+
+<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+### Special Thanks
+- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
 - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
{euroeval-15.5.0 → euroeval-15.6.0}/makefile

@@ -56,7 +56,6 @@ install-dependencies:
 	@if [ "${NO_FLASH_ATTN}" != "1" ] && [ $$(uname) != "Darwin" ]; then \
 		uv pip install --no-build-isolation flash-attn>=2.7.0.post2; \
 	fi
-	@uv sync -U --only-dev
 
 setup-environment-variables:
 	@uv run python src/scripts/fix_dot_env_file.py
@@ -156,8 +155,8 @@ publish-scandeval:
 	fi
 	@mv src/scandeval src/euroeval
 
-publish-major: bump-major publish ## Publish a major version
+publish-major: install check bump-major publish ## Publish a major version
 
-publish-minor: bump-minor publish ## Publish a minor version
+publish-minor: install check bump-minor publish ## Publish a minor version
 
-publish-patch: bump-patch publish ## Publish a patch version
+publish-patch: install check bump-patch publish ## Publish a patch version
{euroeval-15.5.0 → euroeval-15.6.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.5.0"
+version = "15.6.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -15,7 +15,7 @@ dependencies = [
     "torch>=2.6.0",
     "pandas>=2.2.0",
     "numpy>=1.23.0,<2.0.0",
-    "transformers>=4.
+    "transformers>=4.51.0",
     "accelerate>=0.34.2",
     "evaluate>=0.4.1",
     "datasets>=2.15.0",
@@ -24,7 +24,7 @@ dependencies = [
    "termcolor>=2.0.0",
    "seqeval>=1.2.2",
    "python-dotenv>=1.0.1",
-    "huggingface-hub>=0.
+    "huggingface-hub>=0.30.1",
    "pyinfer>=0.0.3",
    "sentencepiece>=0.1.96",
    "protobuf~=3.20.0",
@@ -46,7 +46,7 @@ dependencies = [
 generative = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.8.
+    "vllm>=0.8.3; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 human_evaluation = [
@@ -55,7 +55,7 @@ human_evaluation = [
 all = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.8.
+    "vllm>=0.8.3; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
     "gradio>=4.26.0",
 ]
@@ -107,6 +107,7 @@ dev-dependencies = [
     "types-setuptools>=75.8.0.20250110",
     "types-ujson>=5.10.0.20240515",
     "types-simplejson>=3.2.0.2025032",
+    "debugpy>=1.8.13",
 ]
 
 [tool.ruff]
@@ -144,6 +145,16 @@ select = [
     # Pyflakes
     "F",
 ]
+ignore = [
+    # Type annotations for "self" arguments
+    "ANN101",
+    # Type annotations for "cls" arguments
+    "ANN102",
+    # Type annotations for **kwargs
+    "ANN003",
+    # Docstrings for **kwargs
+    "D417",
+]
 
 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = [
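For context, a minimal sketch of the patterns these newly ignored Ruff rules would otherwise flag; the class and method names are made up for illustration:

```python
class Example:
    def configure(self, **kwargs) -> None:  # ANN101 (`self`) and ANN003 (`**kwargs`)
        """Configure the example.

        D417 would flag this docstring for not describing `**kwargs`.
        """
        self.options = dict(kwargs)

    @classmethod
    def default(cls) -> "Example":  # ANN102 (`cls`)
        return cls()
```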
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/base.py

@@ -10,7 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 
 from ..data_models import (
     BenchmarkConfig,
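The move away from top-level `transformers` imports (here and in the `fresh.py` and `hf.py` hunks below) is behaviour-preserving: the top-level names are lazy re-exports of the same objects, and the explicit submodule paths are mainly friendlier to static type checkers. A quick sanity check:

```python
# Both import paths resolve to the same class object.
from transformers import PreTrainedTokenizer as top_level
from transformers.tokenization_utils import PreTrainedTokenizer as submodule

assert top_level is submodule
```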
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/base.py

@@ -21,7 +22,7 @@ from ..data_models import (
 )
 from ..enums import BatchingPreference, GenerativeType, TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/fresh.py

@@ -4,19 +4,21 @@ import os
 from functools import cached_property
 from json import JSONDecodeError
 
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.models.electra import (
     ElectraForQuestionAnswering,
     ElectraForSequenceClassification,
     ElectraForTokenClassification,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
+)
+from transformers.models.xlm_roberta import (
     XLMRobertaForQuestionAnswering,
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py

@@ -13,31 +13,29 @@ import torch
 from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
-from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
-from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
-from huggingface_hub.utils import (
+from huggingface_hub.errors import (
     GatedRepoError,
     HFValidationError,
     LocalTokenNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
 )
+from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BatchEncoding,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
-)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
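The `huggingface_hub` part of this hunk consolidates the exception imports under `huggingface_hub.errors`, where `huggingface-hub>=0.30` exposes them all together; catching them works as before. A small sketch (the repository ID is made up):

```python
from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError

try:
    HfApi().model_info("this-org/does-not-exist")  # made-up repository ID
except RepositoryNotFoundError as error:
    print(f"No such repository: {error}")
```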
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py

@@ -65,18 +63,17 @@ from ..exceptions import (
     NoInternetConnection,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
+from ..tokenization_utils import get_bos_token, get_eos_token
 from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
-    get_bos_token,
     get_class_by_name,
-    get_eos_token,
     internet_connection_available,
     log_once,
 )
@@ -690,7 +687,7 @@ def load_model_and_tokenizer(
     assert model is not None, "The model should not be None."
 
     model.eval()
-    model.to(benchmark_config.device)
+    model.to(benchmark_config.device)  # type: ignore[arg-type]
 
     if (
         isinstance(model, PreTrainedModel)
@@ -797,12 +794,6 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
     tags = list(set(tags))
 
-    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-    # when this PR has been merged in and published:
-    # https://github.com/huggingface/transformers/pull/37107
-    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
-
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
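The deleted TEMP block was a workaround for older `transformers` releases missing the `image-text-to-text` entry in `TASK_MAPPING`; per the changelog, the linked PR shipped in `transformers>=4.51.0`, so the entry is now present upstream. A sketch of the invariant the code now relies on, assuming that minimum version:

```python
from transformers.modelcard import TASK_MAPPING

# Present out of the box in transformers>=4.51.0, so no manual patching needed.
assert "image-text-to-text" in TASK_MAPPING
```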
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py

@@ -824,7 +815,7 @@ def get_model_repo_info(
     generative_class_names = [
         class_name
         for tag in GENERATIVE_PIPELINE_TAGS
-        for class_name in TASK_MAPPING.get(tag, dict()).values()
+        for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
     ]
     if class_names is not None and any(
         class_name in generative_class_names for class_name in class_names
@@ -1083,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
     for attribute in attribute_list:
         token_type_embeddings = getattr(token_type_embeddings, attribute)
 
+    token_type_embedding_tensor = token_type_embeddings.weight.data
+    assert isinstance(token_type_embedding_tensor, torch.Tensor)
+
     # If the token type embeddings has shape (1, ...) then set the shape to
     # (2, ...) by randomly initializing the second token type embedding
-    if token_type_embeddings.weight.data.shape[0] == 1:
+    if token_type_embedding_tensor.shape[0] == 1:
         token_type_embeddings.weight.data = torch.cat(
             (
-                token_type_embeddings.weight.data,
-                torch.rand_like(token_type_embeddings.weight.data),
+                token_type_embedding_tensor,
+                torch.rand_like(token_type_embedding_tensor),
             ),
             dim=0,
         )
-        token_type_embeddings.num_embeddings = 2
+        token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]
 
     # Set the model config to use the new type vocab size
     model.config.type_vocab_size = 2
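The refactor above only introduces a local variable for the type checker; the underlying operation is still growing a one-row token-type embedding to two rows. A self-contained sketch of that operation:

```python
import torch
from torch import nn

# A token-type embedding with a single type, shape (1, hidden_size).
embedding = nn.Embedding(num_embeddings=1, embedding_dim=4)

weights = embedding.weight.data
if weights.shape[0] == 1:
    # Append a randomly initialised second row, mirroring the code above.
    embedding.weight.data = torch.cat((weights, torch.rand_like(weights)), dim=0)
    embedding.num_embeddings = 2

assert embedding.weight.data.shape == torch.Size([2, 4])
```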
{euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py

@@ -1160,7 +1154,7 @@ def align_model_and_tokenizer(
     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
     # finding the maximum sequence length of the model
     model_device = model.device
-    model.to(torch.device("cpu"))
+    model.to(torch.device("cpu"))  # type: ignore[arg-type]
 
     # Manually check that this model max length is valid for the model, and adjust
     # otherwise
@@ -1182,8 +1176,16 @@ def align_model_and_tokenizer(
         except IndexError:
             continue
 
+        except ValueError as e:
+            # This happens when the model is using Triton, such as with ModernBERT,
+            # which doesn't work with CPU tensors at all
+            if "cpu tensor" in str(e):
+                break
+            else:
+                raise e
+
     # Move the model back to the original device
-    model.to(model_device)
+    model.to(model_device)  # type: ignore[arg-type]
 
     # If there is a mismatch between the vocab size according to the tokenizer and
     # the vocab size according to the model, we raise an error