EuroEval 15.3.1.tar.gz → 15.4.1.tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of EuroEval has been flagged as potentially problematic by the registry.
- {euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/bug.yaml +5 -5
- {euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +1 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/.github/workflows/ci.yaml +10 -4
- {euroeval-15.3.1 → euroeval-15.4.1}/.pre-commit-config.yaml +2 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/CHANGELOG.md +61 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/PKG-INFO +22 -7
- {euroeval-15.3.1 → euroeval-15.4.1}/README.md +13 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/dutch.md +1 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/faroese.md +2 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/german.md +2 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/icelandic.md +1 -1
- euroeval-15.4.1/docs/datasets/spanish.md +529 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/pyproject.toml +9 -7
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/__init__.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_config_factory.py +2 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/hf.py +2 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/litellm.py +124 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/vllm.py +33 -13
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmarker.py +2 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/constants.py +7 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/data_loading.py +2 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/dataset_configs.py +172 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/generation.py +17 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/sequence_classification.py +27 -7
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/token_classification.py +3 -9
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/utils.py +1 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_allocine.py +33 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_arc.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_arc_is.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_belebele.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_cnn_dailymail.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_conll_en.py +10 -0
- euroeval-15.4.1/src/scripts/create_conll_es.py +115 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_conll_nl.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_dane.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_danish_citizen_tests.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_dansk.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_danske_talemaader.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_danske_talemaader_old.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_dbrd.py +33 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_dutch_cola.py +33 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_dutch_social.py +33 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_eltec.py +14 -1
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_fone.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_foqa.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_fosent.py +33 -4
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_fquad.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_germanquad.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_germeval.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_hellaswag.py +12 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_hotter_and_colder_sentiment.py +36 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_ice_linguistic.py +37 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_icelandic_error_corpus.py +40 -6
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_icelandic_knowledge.py +14 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_icelandic_qa.py +12 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_icesum.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_ilpost_sum.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_jentoft.py +38 -4
- euroeval-15.4.1/src/scripts/create_mlqa_es.py +74 -0
- euroeval-15.3.1/src/scripts/create_mlsum.py → euroeval-15.4.1/src/scripts/create_mlsum_de.py +13 -3
- euroeval-15.4.1/src/scripts/create_mlsum_es.py +84 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_mmlu.py +12 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_multinerd-it.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_no_cola.py +38 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_no_sammendrag.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_nor_common_sense_qa.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_nordjylland_news.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_norglm_multiqa.py +12 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_norglm_multisum.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_norne.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_norquad.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_nqii.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_nrk_quiz_qa.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_orange_sum.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_personal_sum.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_rrn.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_sb10k.py +33 -3
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_scala.py +43 -4
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_scandiqa.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_schibsted.py +10 -0
- euroeval-15.3.1/src/scripts/create_sentipolc16.py → euroeval-15.4.1/src/scripts/create_sentiment_headlines_es.py +24 -9
- euroeval-15.4.1/src/scripts/create_sentipolc16.py +106 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_squad.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_squad_it.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_squad_nl.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_squad_nl_old.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_sst5.py +30 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_suc3.py +11 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_swedn.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_swerec.py +12 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_wiki_lingua_nl.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_wikineural-it.py +10 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_winogrande_is.py +11 -0
- euroeval-15.4.1/src/scripts/create_xquad_es.py +80 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/fix_dot_env_file.py +5 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/load_ud_pos.py +26 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/versioning.py +5 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/conftest.py +6 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_config_factory.py +8 -4
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmarker.py +16 -2
- {euroeval-15.3.1 → euroeval-15.4.1}/uv.lock +858 -493
- {euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/.gitignore +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/CITATION.cff +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/CONTRIBUTING.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/Dockerfile.cuda +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/LICENSE +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/CNAME +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/README.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/README.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/danish.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/english.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/french.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/italian.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/swedish.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/faq.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/gfx/favicon.png +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/leaderboards/README.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/methodology.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/python-package.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/README.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/speed.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/docs/tasks/summarization.md +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/gfx/euroeval.png +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/gfx/euroeval.xcf +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/gfx/scandeval.png +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/makefile +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/mkdocs.yaml +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/cli.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/data_models.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/enums.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/finetuning.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/human_evaluation.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/languages.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/model_config.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/scores.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/__init__.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/multiple_choice_classification.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/question_answering.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/task_utils/text_to_text.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/tasks.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/euroeval/types.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/constants.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_norec.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/__init__.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_callbacks.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_cli.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_constants.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_data_loading.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_data_models.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_enums.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_exceptions.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_finetuning.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_generation.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_languages.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_model_cache.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_model_config.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_model_loading.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_scores.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_tasks.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_types.py +0 -0
- {euroeval-15.3.1 → euroeval-15.4.1}/tests/test_utils.py +0 -0

{euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml
@@ -1,5 +1,5 @@
 name: 📚 Benchmark Dataset Request
-description: Do you think a particular benchmark dataset is missing in
+description: Do you think a particular benchmark dataset is missing in EuroEval?
 title: "[BENCHMARK DATASET REQUEST] <dataset-name>"
 labels: "benchmark dataset request"
 
@@ -36,7 +36,7 @@ body:
 - type: textarea
 attributes:
 label: Describe the dataset
-description: Describe what the dataset is measuring, and why you think it is important to include it as a benchmark dataset in
+description: Describe what the dataset is measuring, and why you think it is important to include it as a benchmark dataset in EuroEval.
 validations:
 required: true
 - type: markdown

{euroeval-15.3.1 → euroeval-15.4.1}/.github/ISSUE_TEMPLATE/bug.yaml
@@ -1,5 +1,5 @@
 name: 🐛 Bug Report
-description: Have you experienced a bug using the `
+description: Have you experienced a bug using the `euroeval` package?
 title: "[BUG] <name-of-bug>"
 labels: bug
 
@@ -7,7 +7,7 @@ body:
 - type: markdown
 attributes:
 value: >
-#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/
+#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/EuroEval/EuroEval/issues?q=is%3Aissue).
 - type: textarea
 attributes:
 label: 🐛 Describe the bug
@@ -52,9 +52,9 @@ body:
 required: true
 - type: input
 attributes:
-label:
-description: What version of
-placeholder: Output of `pip list | grep
+label: EuroEval version
+description: What version of EuroEval are you using?
+placeholder: Output of `pip list | grep EuroEval`
 validations:
 required: true
 - type: markdown

{euroeval-15.3.1 → euroeval-15.4.1}/.github/workflows/ci.yaml
@@ -49,6 +49,9 @@ jobs:
 - name: Install Dependencies
 run: uv sync --no-dev --extra test
 
+- name: Start Ollama server
+run: curl -fsSL https://ollama.com/install.sh | sh
+
 - name: Test with pytest
 run: uv run pytest
 env:
@@ -57,8 +60,8 @@ jobs:
 OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
 
-- name: Delete
-run: rm -rf .
+- name: Delete EuroEval cache
+run: rm -rf .euroeval_cache
 
 pytest-macos:
 if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'macos')
@@ -78,6 +81,9 @@ jobs:
 - name: Install Dependencies
 run: uv sync --no-dev --extra test
 
+- name: Start Ollama server
+run: curl -fsSL https://ollama.com/install.sh | sh
+
 - name: Test with pytest
 run: uv run pytest
 env:
@@ -86,5 +92,5 @@ jobs:
 OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
 ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
 
-- name: Delete
-run: rm -rf .
+- name: Delete EuroEval cache
+run: rm -rf .euroeval_cache
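
The two new "Start Ollama server" steps above install the Ollama server via Ollama's official install script before the tests run (the v15.4.0 changelog below notes that EuroEval now downloads Ollama models itself when evaluating them). As a hedged local sketch of the same setup, using standard Ollama CLI commands rather than anything EuroEval-specific:

```shell
# Install the Ollama server (same command as the CI step above).
curl -fsSL https://ollama.com/install.sh | sh

# Sanity checks: confirm the client/server respond and list the models
# already available locally.
ollama --version
ollama list
```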

{euroeval-15.3.1 → euroeval-15.4.1}/.pre-commit-config.yaml
@@ -10,11 +10,12 @@ repos:
 - id: trailing-whitespace
 - id: debug-statements
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.
+rev: v0.11.2
 hooks:
 - id: ruff
 args:
 - --fix
+- --unsafe-fixes
 - --exit-non-zero-on-fix
 types_or:
 - python
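
The pre-commit change above bumps the Ruff hook to v0.11.2 and adds `--unsafe-fixes`, which lets Ruff apply the fixes it classifies as unsafe (they may change program behaviour). A rough local equivalent of what the hook now runs, assuming Ruff is installed on your machine:

```shell
# Lint and auto-fix, including fixes Ruff marks as unsafe, and exit non-zero
# if anything had to be fixed (mirroring the hook's arguments).
ruff check --fix --unsafe-fixes --exit-non-zero-on-fix .
```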

{euroeval-15.3.1 → euroeval-15.4.1}/CHANGELOG.md
@@ -10,9 +10,64 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+## [v15.4.1] - 2025-03-25
+### Fixed
+- Disallow `vllm` v0.8.1, as it causes severe degradation in generation output of
+some models, resulting in artificially low scores.
+- Fixed an issue with text classification tasks if the first token of multiple labels
+are identical, when tokenising with the model's tokeniser.
+
+
+## [v15.4.0] - 2025-03-24
+### Added
+- Added support for Spanish! 🇪🇸This includes two reading comprehension datasets:
+[XQuAD-es](https://huggingface.co/datasets/google/xquad/viewer/xquad.es) and
+[MLQA-es](https://huggingface.co/datasets/facebook/mlqa/viewer/mlqa.es.es),
+[SentimentHeadlines-es](https://huggingface.co/datasets/pysentimiento/spanish-targeted-sentiment-headlines),
+the linguistic acceptability dataset ScaLA with the [Spanish Universal
+Dependencies](https://github.com/UniversalDependencies/UD_Spanish-AnCora),
+[MLSum-es](https://huggingface.co/datasets/reciTAL/mlsum), the knowledge dataset
+[MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
+dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
+named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
+was contributed by [@oliverkinch](https://github.com/oliverkinch) ✨
+- Now extracts number of parameters and context length for Ollama models, using the
+`ollama` package. Vocabulary size is currently not available available in the `ollama`
+package, so this is not extracted for Ollama models. For this reason, the `ollama`
+package has been added to the core dependencies, as it is very small (~10 KB)
+- Now downloads Ollama models when evaluating them.
+
+### Fixed
+- When models output nested JSON dictionaries and structured generation isn't available,
+we use the inner-most dictionary. This caused issues with Anthropic models, since they
+do not support structured generation, and their output are always {"input": actual
+dictionary}. This has been fixed now.
+- Now handles `ReadTimeout`s when loading datasets, rather than aborting evaluations.
+- Benchmark configurations specified when calling `Benchmarker.benchmark` did not
+properly override the default configurations set during initialisation when
+benchmarking generative models. This has been fixed now.
+- Now sets the `VLLM_WORKER_MULTIPROC_METHOD` environment variable to `spawn`, to avoid
+a `RuntimeError` when using newer versions of vLLM with multiple GPUs.
+- Now also detects reasoning tokens specified in the prompt rather than in the
+completion, which is for instance the case for the QwQ reasoning model.
+- Now recognises models with the pipeline tags `image-text-to-text`,
+`audio-text-to-text` and `video-text-to-text` as generative models, which mistakenly
+were detected as encoder models before.
+
+### Changed
+- Update `vllm` to `>=0.8.0`, `transformers` to `>=4.50.0` and `torch` to `>=2.6.0`.
+- Moved the `demjson3` dependency from the `generative` extra to the main dependencies,
+to allow benchmarking API-based models without any extras.
+- Now does not include the speed benchmark by default, as it is not used in the official
+leaderboards. It can still be used by including `--task speed` when benchmarking a
+model, or by using the `task` argument if using the `Benchmarker` API.
+- Do not use sliding window sizes as candidates for maximum context length anymore, as
+this is no longer needed.
+
+
 ## [v15.3.1] - 2025-03-13
 ### Fixed
-- Now handles`ConnectionError`s when loading datasets, rather than aborting evaluations.
+- Now handles `ConnectionError`s when loading datasets, rather than aborting evaluations.
 
 
 ## [v15.3.0] - 2025-03-12
@@ -85,12 +140,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 ## [v15.1.0] - 2025-02-12
+
 ### Added
 - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
 Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
 security on the system running the evaluations, if this is necessary. This was
 contributed by [@Mikeriess](https://github.com/Mikeriess) ✨
 
+
 ### Fixed
 - Regex mismatch caused the wrong sequence length for GPT-4o models. This has been fixed
 now.
@@ -104,6 +161,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 ## [v15.0.0] - 2025-02-02
+
 ### Added
 - Added support for evaluating generative reasoning models, such as OpenAI o1 and
 Deepseek R1. This is done by upping the maximal sequence length to 8,192 tokens, and
@@ -150,6 +208,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 ## [v14.4.0] - 2025-01-22
+
+### Added
 - Added support for French! 🇫🇷This includes the sentiment classification dataset
 [Allocine](https://hf.co/datasets/tblard/allocine), the linguistic acceptability
 dataset ScaLA with the [French Universal
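
Two of the v15.4.0 changes above are directly user-facing: the speed benchmark is no longer part of the default task set, and Ollama models are now downloaded automatically when evaluated. Based only on the changelog wording, a sketch of opting back into the speed task from the CLI, where `<model-id>` is a placeholder for the model you want to benchmark:

```shell
# The speed benchmark is opt-in as of v15.4.0; request it explicitly.
euroeval --model <model-id> --task speed
```

When using the Python API instead, the changelog points to the `task` argument of the `Benchmarker` class for the same purpose.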

{euroeval-15.3.1 → euroeval-15.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.
+Version: 15.4.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -33,12 +33,14 @@ Requires-Dist: accelerate>=0.34.2
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
 Requires-Dist: datasets>=2.15.0
+Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.24.0
 Requires-Dist: levenshtein>=0.24.0
 Requires-Dist: litellm>=1.61.13
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
+Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
@@ -52,19 +54,19 @@ Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
-Requires-Dist: torch>=2.
-Requires-Dist: transformers>=4.
+Requires-Dist: torch>=2.6.0
+Requires-Dist: transformers>=4.50.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist: demjson3>=3.0.6; extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
-Requires-Dist:
+Requires-Dist: outlines>=0.1.11; extra == 'all'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: demjson3>=3.0.6; extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
+Requires-Dist: outlines>=0.1.11; extra == 'generative'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -202,6 +204,19 @@ argument. This could for instance be `--model <model-id> --task
 sentiment-classification`.
 
 
+### Reproducing the datasets
+All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+
+```shell
+$ uv run src/scripts/<name-of-script>.py
+```
+
+Replace <name-of-script> with the specific script you wish to execute, e.g.,
+
+```shell
+$ uv run src/scripts/create_allocine.py
+```
+
 ## Special Thanks :pray:
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
 models on the leaderboards.
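
The dependency changes above move `demjson3` and `ollama` into the core requirements and pin `vllm!=0.8.1,>=0.8.0` together with `outlines>=0.1.11` in the `all` and `generative` extras. A hedged install sketch reflecting those declarations (extra names taken from the `Provides-Extra` fields above; the vllm, bitsandbytes and fbgemm-gpu pins only apply on Linux):

```shell
# Core install: now pulls in demjson3 and ollama, enough for API-based models.
pip install euroeval

# Generative extra: adds outlines and the vllm!=0.8.1,>=0.8.0 constraint.
pip install "euroeval[generative]"
```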

{euroeval-15.3.1 → euroeval-15.4.1}/README.md
@@ -129,6 +129,19 @@ argument. This could for instance be `--model <model-id> --task
 sentiment-classification`.
 
 
+### Reproducing the datasets
+All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+
+```shell
+$ uv run src/scripts/<name-of-script>.py
+```
+
+Replace <name-of-script> with the specific script you wish to execute, e.g.,
+
+```shell
+$ uv run src/scripts/create_allocine.py
+```
+
 ## Special Thanks :pray:
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
 models on the leaderboards.

{euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/dutch.md
@@ -75,7 +75,7 @@ and features Dutch book reviews from [Hebban.nl](https://www.hebban.nl), annotat
 sentiment labels, written by the users of the website.
 
 The original full dataset consists of 20,000 / 2,200 samples for training and testing,
-respectively. We use a 1,
+respectively. We use a 1,014 / 253 / 2,014 split for training, validation and testing,
 respectively (so 3,328 samples used in total). The training and testing splits are
 subsets of the original splits, and the validation split is a disjoint subset of the
 original training split.

{euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/faroese.md
@@ -17,8 +17,8 @@ labels were manually annotated by two native speakers.
 The original full dataset consists of 245 samples, which consisted of both a news
 article, a chosen sentence from the article, and the sentiment label. We use both the
 news article and the chosen sentence as two separate samples, to increase the size of
-the dataset (keeping them within the same dataset split). In total, we use a
-
+the dataset (keeping them within the same dataset split). In total, we use a 72 / 40 /
+279 split for training, validation and testing, respectively.
 
 Here are a few examples from the training split:
 

{euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/german.md
@@ -485,7 +485,7 @@ $ euroeval --model <model-id> --dataset hellaswag-de
 
 ## Summarization
 
-### MLSum
+### MLSum-de
 
 This dataset was published in [this
 paper](https://aclanthology.org/2020.emnlp-main.647/) and features news articles and
@@ -541,5 +541,5 @@ When evaluating generative models, we use the following setup (see the
 You can evaluate this dataset directly as follows:
 
 ```bash
-$ euroeval --model <model-id> --dataset mlsum
+$ euroeval --model <model-id> --dataset mlsum-de
 ```

{euroeval-15.3.1 → euroeval-15.4.1}/docs/datasets/icelandic.md
@@ -13,7 +13,7 @@ This dataset is being published in an upcoming paper, and consists of texts from
 Icelandic blog post, annotated with sentiment labels (and many others) via a
 crowdsourcing platform.
 
-The original full dataset consists of 2,901 samples, and we use a 1,
+The original full dataset consists of 2,901 samples, and we use a 1,021 / 255 / 1,607
 split for training, validation and testing, respectively (so all samples are used in
 total).
 