EuroEval 15.7.0.tar.gz → 15.7.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {euroeval-15.7.0 → euroeval-15.7.1}/.pre-commit-config.yaml +1 -1
- {euroeval-15.7.0 → euroeval-15.7.1}/CHANGELOG.md +12 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/PKG-INFO +1 -1
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/dutch.md +1 -62
- {euroeval-15.7.0 → euroeval-15.7.1}/pyproject.toml +1 -1
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/litellm.py +12 -253
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/vllm.py +13 -303
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmarker.py +1 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/data_models.py +3 -1
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/dutch.py +5 -16
- euroeval-15.7.1/src/euroeval/generation_utils.py +346 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/scores.py +7 -1
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_dbrd.py +22 -22
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_data_loading.py +33 -20
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_scores.py +1 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/uv.lock +1 -1
- euroeval-15.7.0/src/scripts/create_dutch_social.py +0 -114
- {euroeval-15.7.0 → euroeval-15.7.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/.github/workflows/ci.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/.gitignore +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/CITATION.cff +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/CODE_OF_CONDUCT.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/CONTRIBUTING.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/Dockerfile.cuda +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/LICENSE +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/NEW_DATASET_GUIDE.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/README.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/CNAME +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/README.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/README.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/danish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/english.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/faroese.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/finnish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/french.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/german.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/icelandic.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/italian.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/norwegian.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/spanish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/swedish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/extras/radial_plotter.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/faq.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/gfx/favicon.png +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/danish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/english.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/french.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/german.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/italian.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Multilingual/european.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/Multilingual/romance.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/leaderboards/README.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/methodology.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/python-package.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/README.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/common-sense-reasoning.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/knowledge.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/linguistic-acceptability.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/named-entity-recognition.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/reading-comprehension.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/sentiment-classification.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/speed.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/docs/tasks/summarization.md +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/gfx/euroeval.png +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/gfx/euroeval.xcf +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/gfx/scandeval.png +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/makefile +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/mkdocs.yaml +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_config_factory.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/base.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/hf.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/callbacks.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/cli.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/constants.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/data_loading.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/danish.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/english.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/faroese.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/finnish.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/french.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/german.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/italian.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/spanish.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/dataset_configs/swedish.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/enums.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/exceptions.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/finetuning.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/generation.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/human_evaluation.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/languages.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/model_cache.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/model_config.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/model_loading.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/prompt_templates/summarization.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/speed_benchmark.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/tasks.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/tokenization_utils.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/types.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/utils.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/constants.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_allocine.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_angry_tweets.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_arc.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_arc_is.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_belebele.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_cnn_dailymail.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_conll_en.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_conll_es.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_conll_nl.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_dane.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_danish_citizen_tests.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_dansk.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_danske_talemaader.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_danske_talemaader_old.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_dutch_cola.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_eltec.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_fone.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_foqa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_fosent.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_fquad.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_germanquad.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_germeval.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_hellaswag.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_hellaswag_fi.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_ice_linguistic.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_icelandic_knowledge.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_icelandic_qa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_icesum.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_ilpost_sum.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_jentoft.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_mim_gold_ner.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_mlqa_es.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_mlsum_de.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_mlsum_es.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_mmlu.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_multinerd-it.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_no_cola.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_no_sammendrag.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_nordjylland_news.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_norec.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_norglm_multiqa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_norglm_multisum.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_norne.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_norquad.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_nqii.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_orange_sum.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_personal_sum.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_rrn.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_sb10k.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_scala.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_scandiqa.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_scandisent_fi.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_schibsted.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_sentipolc16.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_squad.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_squad_it.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_squad_nl.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_squad_nl_old.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_sst5.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_suc3.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_swedn.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_swerec.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_turku_ner_fi.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_tydiqa_fi.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_wikiann_fo.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_wikineural-it.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_winogrande_is.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_xlsum_fi.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/create_xquad_es.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/fix_dot_env_file.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/load_ud_pos.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/src/scripts/versioning.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/conftest.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_config_factory.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/test_base.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/test_hf.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_benchmarker.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_callbacks.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_cli.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_constants.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_data_models.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_dataset_configs.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_enums.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_exceptions.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_finetuning.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_generation.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_human_evaluation.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_languages.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_model_cache.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_model_config.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_model_loading.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_speed_benchmark.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_task_utils/__init__.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_task_utils/test_question_answering.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_task_utils/test_text_to_text.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_task_utils/test_token_classification.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_tasks.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_tokenization_utils.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_types.py +0 -0
- {euroeval-15.7.0 → euroeval-15.7.1}/tests/test_utils.py +0 -0
{euroeval-15.7.0 → euroeval-15.7.1}/CHANGELOG.md

@@ -10,6 +10,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
 
 
+## [v15.7.1] - 2025-04-29
+### Changed
+- Marked the DBRD Dutch sentiment classification as official, as the quality is
+  substantially better than the previous Dutch Social.
+
+### Fixed
+- Fixed an issue with NER evaluation of instruction-tuned models, which was caused by
+  the "O" label mistakenly being included in the prompt template, causing an error
+  during evaluation. No evaluations were affected by this, only that some evaluations
+  could not be run.
+
+
 ## [v15.7.0] - 2025-04-28
 ### Added
 - Added support for Finnish 🇫🇮! This includes the Finnish part of the reading
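To make the NER fix concrete: EuroEval's token-classification prompts enumerate the entity labels a model may answer with, and per the changelog entry above the "O" (outside) tag was mistakenly included in that enumeration. The sketch below shows the kind of filtering involved; it is illustrative only (the function name and arguments are hypothetical), mirroring the BIO-tag and `prompt_label_mapping` handling visible in the `litellm.py` diff further down.

```python
# Illustrative sketch only: build the label options for a NER prompt template,
# excluding the "O" (outside) tag, which is not an entity type and should not
# be offered to the model as an answer option. Names here are hypothetical.


def prompt_labels(labels: list[str], prompt_label_mapping: dict[str, str]) -> list[str]:
    """Map BIO tags to prompt labels, skipping the 'O' tag and 'I-' duplicates."""
    options: list[str] = []
    for label in labels:
        tag = label.lower()
        if tag == "o" or tag.startswith("i-"):
            continue
        mapped = prompt_label_mapping.get(tag, tag)
        if mapped not in options:
            options.append(mapped)
    return options


print(prompt_labels(["O", "B-PER", "I-PER", "B-LOC"], {"b-per": "persoon", "b-loc": "locatie"}))
# -> ['persoon', 'locatie']
```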
{euroeval-15.7.0 → euroeval-15.7.1}/docs/datasets/dutch.md

@@ -7,68 +7,7 @@ information about what these constitute.
 
 ## Sentiment Classification
 
-### Dutch Social
-
-This dataset consists of Dutch tweets annotated with sentiment labels. It is not sure
-how the sentiment labels were assigned, this information is pending from the authors.
-
-The original full dataset consists of 162,805 / 54,269 / 54,268 samples for training,
-validation and testing, respectively (so 271,342 samples used in total). We use a 1,024
-/ 256 / 1,024 split for training, validation and testing, respectively. All the new
-splits are subsets of the original splits.
-
-Here are a few examples from the training split:
-
-```json
-{
-  "text": 'Novak Djokovic positief getest op coronavirus na eigen tennistoernooi\n\nhttps://t.co/U7VOcjANh9',
-  "label": 'positive'
-}
-```
-```json
-{
-  "text": "via @NYTimes https://t.co/IjbCWIwYvR",
-  "label": "neutral"
-}
-```
-```json
-{
-  "text": "@backinflow 30 min Corona tijd....",
-  "label": "negative"
-}
-```
-
-When evaluating generative models, we use the following setup (see the
-[methodology](/methodology) for more information on how these are used):
-
-- Number of few-shot examples: 12
-- Prefix prompt:
-  ```
-  Hieronder staan tweets en hun sentiment, dat 'positief', 'neutraal' of 'negatief' kan zijn.
-  ```
-- Base prompt template:
-  ```
-  Tweet: {text}
-  Sentiment: {label}
-  ```
-- Instruction-tuned prompt template:
-  ```
-  Tweet: {text}
-
-  Classificeer het sentiment in de tweet. Antwoord met 'positief', 'neutraal' of 'negatief'.
-  ```
-- Label mapping:
-  - `positive` ➡️ `positief`
-  - `neutral` ➡️ `neutraal`
-  - `negative` ➡️ `negatief`
-
-You can evaluate this dataset directly as follows:
-
-```bash
-$ euroeval --model <model-id> --dataset dutch-social
-```
-
-### Unofficial: DBRD
+### DBRD
 
 This dataset was published in [this paper](https://doi.org/10.48550/arXiv.1910.00896)
 and features Dutch book reviews from [Hebban.nl](https://www.hebban.nl), annotated with
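With the Dutch Social section removed, DBRD becomes the official Dutch sentiment classification dataset. The docs' usual invocation pattern would then apply to it as sketched below; the dataset id `dbrd` is an assumption inferred from `src/scripts/create_dbrd.py`, not confirmed by this diff.

```bash
$ euroeval --model <model-id> --dataset dbrd
```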
{euroeval-15.7.0 → euroeval-15.7.1}/src/euroeval/benchmark_modules/litellm.py

@@ -1,11 +1,8 @@
 """Generative models from an inference API, using the LiteLLM framework."""
 
 import collections.abc as c
-import itertools as it
-import json
 import logging
 import os
-import random
 import re
 import typing as t
 from functools import cached_property, partial
@@ -60,6 +57,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -943,14 +941,22 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         if self.benchmark_config.few_shot:
-            few_shot_examples = self._extract_few_shot_examples(
-                dataset=dataset, task=task, itr_idx=itr_idx
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()
 
         dataset["test"] = dataset["test"].map(
-            partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=True,
+                always_populate_text_field=False,
+                tokenizer=None,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
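Both helpers called above now live in the new `src/euroeval/generation_utils.py` (+346 lines), whose contents this diff does not display. A sketch of the signatures as implied by these call sites, for orientation only; the parameter types are guesses.

```python
# Inferred from the litellm.py call sites above; not the actual definitions,
# which are in src/euroeval/generation_utils.py (not shown in this diff).
from typing import Any


def extract_few_shot_examples(
    dataset: Any, dataset_config: Any, itr_idx: int
) -> list[dict[str, Any]]:
    """Sample unique few-shot examples from the training split."""
    ...


def apply_prompt(
    examples: dict[str, Any],
    few_shot_examples: list[dict[str, Any]],
    model_config: Any,
    dataset_config: Any,
    instruction_model: bool,
    always_populate_text_field: bool,
    tokenizer: Any | None,
) -> dict[str, Any]:
    """Apply the prompt template, with few-shot examples, to a batch of examples."""
    ...
```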
@@ -958,253 +964,6 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-            return prompt, label
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        few_shot_messages = [
-            dict(role=role, content=content)
-            for prompt, label in few_shot_sections
-            for role, content in [("user", prompt), ("assistant", label)]
-        ]
-
-        messages_list = [
-            few_shot_messages + [dict(role="user", content=prompt)]
-            for prompt, _ in new_sections
-        ]
-
-        examples["messages"] = messages_list
-        return examples
-
 
 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]