EuroEval 16.1.0.tar.gz → 16.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +1 -0
- euroeval-16.2.0/.github/ISSUE_TEMPLATE/language_request.yaml +49 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +1 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/.gitignore +3 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/.pre-commit-config.yaml +1 -1
- {euroeval-16.1.0 → euroeval-16.2.0}/CHANGELOG.md +36 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/PKG-INFO +31 -7
- {euroeval-16.1.0 → euroeval-16.2.0}/README.md +26 -2
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/icelandic.md +10 -10
- {euroeval-16.1.0 → euroeval-16.2.0}/pyproject.toml +10 -7
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/__init__.py +7 -6
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_config_factory.py +4 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/hf.py +31 -16
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/litellm.py +2 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/vllm.py +24 -9
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmarker.py +127 -14
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/cli.py +8 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/data_models.py +4 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/generation.py +3 -1
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/generation_utils.py +10 -4
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/base.py +12 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/huggingface.py +23 -2
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +6 -5
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/named_entity_recognition.py +3 -3
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/sentiment_classification.py +5 -5
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/sequence_classification.py +1 -1
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/tasks.py +3 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/tokenisation_utils.py +12 -13
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/types.py +2 -2
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/utils.py +77 -5
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/conftest.py +1 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmarker.py +56 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_cli.py +2 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_data_loading.py +10 -1
- {euroeval-16.1.0 → euroeval-16.2.0}/uv.lock +668 -522
- euroeval-16.1.0/generated_contracts/employment_contract_001.md +0 -137
- euroeval-16.1.0/generated_contracts/employment_contract_002.md +0 -152
- euroeval-16.1.0/generated_contracts/employment_contract_003.md +0 -144
- euroeval-16.1.0/generated_contracts/employment_contract_004.md +0 -139
- euroeval-16.1.0/generated_contracts/employment_contract_005.md +0 -146
- euroeval-16.1.0/generated_contracts/employment_contract_006.md +0 -127
- euroeval-16.1.0/generated_contracts/employment_contract_007.md +0 -147
- euroeval-16.1.0/generated_contracts/employment_contract_008.md +0 -136
- euroeval-16.1.0/generated_contracts/employment_contract_009.md +0 -143
- euroeval-16.1.0/generated_contracts/employment_contract_010.md +0 -148
- {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/.github/workflows/ci.yaml +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/CITATION.cff +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/CONTRIBUTING.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/Dockerfile.cuda +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/LICENSE +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/CNAME +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/README.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/README.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/danish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/dutch.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/english.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/estonian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/faroese.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/finnish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/french.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/german.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/italian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/latvian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/norwegian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/polish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/portuguese.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/spanish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/swedish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/extras/radial_plotter.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/faq.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/gfx/favicon.png +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/estonian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/portuguese.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/finnic.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/README.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/methodology.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/python-package.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/README.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/knowledge.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/speed.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/summarization.md +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/gfx/euroeval.png +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/gfx/euroeval.xcf +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/gfx/scandeval.png +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/makefile +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/mkdocs.yaml +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/callbacks.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/constants.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/data_loading.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/danish.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/dutch.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/estonian.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/latvian.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/polish.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/portuguese.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/enums.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/exceptions.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/finetuning.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/languages.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/pipeline.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/speed.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_cache.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_config.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_loading.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/scores.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/constants.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_allocine.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_arc.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_arc_is.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_belebele.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_boolq_pt.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_en.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_es.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_copa_lv.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dane.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dansk.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dbrd.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_eltec.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_err_news.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_estner.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_estonian_valence.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_european_values.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_exam_et.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fone.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_foqa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fosent.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fquad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fullstack_ner.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_germanquad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_germeval.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_goldenswag.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_grammar_et.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_harem.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icesum.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_idioms_no.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_jentoft.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_kpwr_ner.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_latvian_lsm_summary.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_life_in_the_uk.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_llmzszl.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mmlu.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mmlu_lv.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_multi_wiki_qa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_no_cola.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norec.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norne.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norquad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nqii.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_polemo2.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_poquad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_psc.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_publico.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_rrn.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sb10k.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scala.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_schibsted.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_it.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sst2_pt.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sst5.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_suc3.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swedish_skolprov.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swedn.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swerec.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_trivia_et.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wikiann_lv.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande_et.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_xquad.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/versioning.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_callbacks.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_constants.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_data_models.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_dataset_configs.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_enums.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_exceptions.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_finetuning.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_languages.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_model_config.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_model_loading.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_scores.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_speed_benchmark.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_tasks.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_tokenisation_utils.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_types.py +0 -0
- {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_utils.py +0 -0
euroeval-16.2.0/.github/ISSUE_TEMPLATE/language_request.yaml

@@ -0,0 +1,49 @@
+name: 🌍 Language Request
+description: Is there a European language missing in EuroEval?
+title: "[LANGUAGE REQUEST] <language-name>"
+labels: "new language"
+type: task
+
+body:
+  - type: input
+    attributes:
+      label: Language name and code
+      description: What is the name and ISO 639 code of the language?
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        Here are some existing evaluation datasets in the language, that could be used:
+  - type: textarea
+    attributes:
+      label: Sentiment classification dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Linguistic acceptability dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Named entity recognition dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Reading comprehension dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Summarisation dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Knowledge dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: textarea
+    attributes:
+      label: Common-sense reasoning dataset
+      description: Link to one or more datasets in the language (leave blank if unknown)
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!
{euroeval-16.1.0 → euroeval-16.2.0}/CHANGELOG.md

@@ -10,6 +10,42 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v16.2.0] - 2025-09-15
+### Added
+- Now supports evaluating models in an offline environment. This is done by first
+  downloading all necessary models, datasets, metrics and other artifacts while online,
+  using the new `--download-only` flag (or `download_only=True` in the `Benchmarker`
+  API). Then you can safely disable internet access and run the evaluation as normal,
+  and it will use the cached models, datasets and metrics. This was contributed by
+  @viggo-gascou ✨
+- Added the `timm` package to the set of `generative` extra dependencies, as it is
+  required to load some multimodal models, such as Gemma-3n.
+
+### Changed
+- Now does not benchmark encoder models on multiple-choice classification tasks, as they
+  get near-random performance and these scores are not used in the leaderboards. We can
+  change this in the future if we find a way to make encoder models work better on these
+  tasks.
+- For generative vLLM models that can swap between reasoning and non-reasoning modes,
+  we previously defaulted to reasoning. We now default to what the model uses by
+  default, which is non-reasoning for most models.
+
+### Fixed
+- Fixed an issue where old evaluation records could not be loaded, as the format had
+  changed. We are now able to load old records again.
+- Fixed some grammatical errors in the Icelandic prompts.
+- Now stores model IDs with parameters (e.g., `o3#low`) correctly in the benchmark
+  results, rather than just the base model ID (e.g., `o3`).
+
+
+## [v16.1.1] - 2025-09-12
+### Fixed
+- Fixed an issue from v16.1.0, where reasoning models were not using the tokeniser's
+  chat template.
+- Fixed an issue with some of the prompts for base decoders, that the list of possible
+  labels for sequence classification tasks was not included in the prompt.
+
+
 ## [v16.1.0] - 2025-09-11
 ### Added
 - Added support for Polish 🇵🇱! This includes the reading comprehension dataset PoQuAD,
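Note on the offline-evaluation entry above: it describes a two-phase workflow — download everything while online, then evaluate with no network access. A minimal sketch of that flow using the `Benchmarker` API is shown below; the `download_only` argument is taken from this release's README further down, and the model ID is a placeholder.

```
>>> from euroeval import Benchmarker
>>> benchmark = Benchmarker()
>>> # Phase 1 (online): cache the model, datasets and metrics
>>> benchmark(model="<model-id>", task="sentiment-classification", language="da", download_only=True)
>>> # Phase 2 (offline): re-run the same call without download_only; cached artifacts are used
>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
```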
{euroeval-16.1.0 → euroeval-16.2.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.1.0
+Version: 16.2.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: timm>=1.0.19; extra == 'all'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: timm>=1.0.19; extra == 'generative'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

 <div align='center'>
@@ -152,13 +152,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```

 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```

 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```

+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.0 → euroeval-16.2.0}/README.md

@@ -80,13 +80,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```

 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```

 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -96,6 +96,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```

+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/icelandic.md

@@ -44,11 +44,11 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
   ```
-
+  Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
   ```
 - Base prompt template:
   ```
-
+  Textabrot: {text}
   Lyndi: {label}
   ```
 - Instruction-tuned prompt template:
@@ -117,13 +117,13 @@ When evaluating generative models, we use the following setup (see the
 - Base prompt template:
   ```
   Setning: {text}
-
+  Nafneiningar: {label}
   ```
 - Instruction-tuned prompt template:
   ```
   Setning: {text}

-
+  Greindu nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
   ```
 - Label mapping:
     - `B-PER` ➡️ `einstaklingur`
@@ -186,7 +186,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
   ```
-
+  Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
   ```
 - Base prompt template:
   ```
@@ -197,7 +197,7 @@ When evaluating generative models, we use the following setup (see the
   ```
   Setning: {text}

-
+  Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
   ```
 - Label mapping:
     - `correct` ➡️ `já`
@@ -249,7 +249,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
   ```
-
+  Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
   ```
 - Base prompt template:
   ```
@@ -260,7 +260,7 @@ When evaluating generative models, we use the following setup (see the
   ```
   Setning: {text}

-
+  Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
   ```
 - Label mapping:
     - `correct` ➡️ `já`
@@ -310,7 +310,7 @@ When evaluating generative models, we use the following setup (see the
 - Number of few-shot examples: 12
 - Prefix prompt:
   ```
-
+  Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
   ```
 - Base prompt template:
   ```
@@ -321,7 +321,7 @@ When evaluating generative models, we use the following setup (see the
   ```
   Setning: {text}

-
+  Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
   ```
 - Label mapping:
     - `correct` ➡️ `já`
{euroeval-16.1.0 → euroeval-16.2.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "16.1.0"
+version = "16.2.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -33,7 +33,7 @@ dependencies = [
     "rouge-score>=0.1.2",
     "bert-score>=0.3.13",
     "levenshtein>=0.24.0",
-    "scikit-learn==1.6.1",
+    "scikit-learn==1.6.1",  # Required for loading European values pipeline
     "setuptools>=75.8.2",
     "demjson3>=3.0.6",
     "ollama>=0.5.1",
@@ -45,15 +45,15 @@ dependencies = [
 [project.optional-dependencies]
 generative = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.10.1; platform_system == 'Linux'",
-    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+    "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+    "timm>=1.0.19",
 ]
 all = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.10.1; platform_system == 'Linux'",
-    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+    "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+    "timm>=1.0.19",
 ]

 [project.urls]
@@ -100,6 +100,8 @@ dev-dependencies = [
     "types-ujson>=5.10.0.20240515",
     "types-simplejson>=3.2.0.2025032",
     "debugpy>=1.8.13",
+    "pytest-socket>=0.7.0",
+    "pytest-dependency>=0.6.0",
 ]

 [tool.ruff]
@@ -170,6 +172,7 @@ addopts = [
     "--cov=src/euroeval",
     "--color=yes",
     "-vvv",
+    "--allow-unix-socket"
 ]
 xfail_strict = true
 filterwarnings = [
@@ -181,7 +184,7 @@ filterwarnings = [
     "ignore::ResourceWarning",
     "ignore::FutureWarning",
 ]
-log_cli_level = "
+log_cli_level = "INFO"
 testpaths = [
     "tests",
     "src/euroeval",
{euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/__init__.py

@@ -12,12 +12,13 @@ import warnings
 from termcolor import colored

 # Block specific warnings before importing anything else, as they can be noisy
-
-warnings.filterwarnings("ignore", category=
-
-logging.getLogger("
-logging.getLogger("
-
+if os.getenv("FULL_LOG") != "1":
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
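As the added block shows, the warning filters and the silencing of httpx/datasets/vllm loggers are now skipped when the `FULL_LOG` environment variable equals "1". A minimal sketch of opting into full output (assumption: the variable must be set before `euroeval` is imported, since the check runs at import time):

```
import os

# Keep all warnings and third-party log output visible; set this before
# importing euroeval, because the filters are applied at import time.
os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402
```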
{euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_config_factory.py

@@ -47,6 +47,7 @@ def build_benchmark_config(
     debug: bool,
     run_with_cli: bool,
     requires_safetensors: bool,
+    download_only: bool,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.

@@ -117,6 +118,8 @@
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow evaluations of models stored as safetensors.
+        download_only:
+            Whether to only download the requested model weights and datasets.

     Returns:
         The benchmark configuration.
@@ -165,6 +168,7 @@
         debug=debug,
         run_with_cli=run_with_cli,
         requires_safetensors=requires_safetensors,
+        download_only=download_only,
     )


{euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/hf.py

@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The number of parameters in the model.
         """
-
-
-        try:
-            repo_info = hf_api.model_info(
-                repo_id=self.model_config.adapter_base_model_id
-                or self.model_config.model_id,
-                revision=self.model_config.revision,
-            )
-        except (
-            RepositoryNotFoundError,
-            RevisionNotFoundError,
-            RequestException,
-            HFValidationError,
-        ):
+        # No need to try to use the API if we have no internet.
+        if not internet_connection_available():
             repo_info = None
+        else:
+            token = get_hf_token(api_key=self.benchmark_config.api_key)
+            hf_api = HfApi(token=token)
+            try:
+                repo_info = hf_api.model_info(
+                    repo_id=self.model_config.adapter_base_model_id
+                    or self.model_config.model_id,
+                    revision=self.model_config.revision,
+                )
+            except (
+                RepositoryNotFoundError,
+                RevisionNotFoundError,
+                RequestException,
+                HFValidationError,
+            ):
+                repo_info = None

         if (
             repo_info is not None
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
         The benchmark configuration

     Returns:
-
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -686,6 +690,7 @@
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
+        model_cache_dir=model_config.model_cache_dir,
     )

     return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
     ):
         model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)

+    # If we have no internet, and the model_id is not a directory for a local model,
+    # we also just create a dummy model info object.
+    elif not internet_connection_available():
+        model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub, if possible
     if model_info is None:
@@ -867,7 +877,10 @@


 def load_tokeniser(
-    model: "PreTrainedModel | None",
+    model: "PreTrainedModel | None",
+    model_id: str,
+    trust_remote_code: bool,
+    model_cache_dir: str,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.

@@ -889,6 +902,7 @@ def load_tokeniser(
         trust_remote_code=trust_remote_code,
         padding_side="right",
         truncation_side="right",
+        cache_dir=model_cache_dir,
     )

     # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
         token=get_hf_token(api_key=api_key),
         trust_remote_code=trust_remote_code,
         cache_dir=model_cache_dir,
+        local_files_only=not internet_connection_available(),
     )
     if config.eos_token_id is not None and config.pad_token_id is None:
         if isinstance(config.eos_token_id, list):
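Several of the hunks above and below gate Hugging Face Hub access on a new `internet_connection_available()` helper imported from `src/euroeval/utils.py`, whose own diff (+77 -5) is not expanded in this view. As a rough illustration only — not the package's actual implementation — such a connectivity check could look like:

```
import socket


def internet_connection_available(timeout: float = 3.0) -> bool:
    """Best-effort connectivity check via a TCP connection to a public DNS resolver."""
    try:
        with socket.create_connection(("8.8.8.8", 53), timeout=timeout):
            return True
    except OSError:
        return False
```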
{euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/litellm.py

@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )

         if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )

         all_max_lengths: list[int] = list()
{euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/vllm.py

@@ -72,7 +72,9 @@ from ..utils import (
     create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
+    internet_connection_available,
     log_once,
+    resolve_model_path,
     split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokeniser=self._tokeniser,
+            model=self._model, tokeniser=self._tokeniser, model_config=model_config
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
             tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(

     clear_vllm()

+    # If we do not have an internet connection, we need to give the path to the folder
+    # that contains the model weights and config files, otherwise vLLM will try to
+    # download them regardless of whether they are already present in the download_dir
+    model_path = resolve_model_path(download_dir)
+
     try:
         model = LLM(
-            model=model_id,
-            tokenizer=model_id,
+            model=model_id if internet_connection_available() else model_path,
+            tokenizer=model_id if internet_connection_available() else model_path,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
@@ -925,6 +932,7 @@ def load_tokeniser(
         cache_dir=model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
+        local_files_only=not internet_connection_available(),
     )
     num_retries = 5
     for _ in range(num_retries):
@@ -937,8 +945,10 @@
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
+                cache_dir=model_cache_dir,
                 config=config,
                 token=token,
+                local_files_only=not internet_connection_available(),
             )
             break
         except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:


 def get_end_of_reasoning_token(
-    model: "LLM", tokeniser: "PreTrainedTokenizer",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

@@ -1005,21 +1015,26 @@
             The vLLM model.
         tokeniser:
             The tokeniser.
-
-            The model
+        model_config:
+            The model configuration.

     Returns:
         The end of reasoning token, or None if it could not be found.
     """
+    model_id = model_config.model_id
+
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
+        extra_kwargs = dict()
+        if model_config.param in {"thinking", "no-thinking"}:
+            extra_kwargs["enable_thinking"] = model_config.param == "thinking"
         templated_prompt = apply_chat_template(
             conversation=[dict(role="user", content=prompt)],
             tokeniser=tokeniser,
             tokenise=False,
             add_generation_prompt=True,
-
+            **extra_kwargs,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -1042,8 +1057,8 @@
     if not bor_reasoning_matches:
         log_once(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
-            "tokens in the prompt or the completion. Assuming the model is not "
-            "
+            "tokens in the prompt or the completion. Assuming the model is not a "
+            "reasoning model.",
             level=logging.DEBUG,
         )
         return None