ScandEval 16.7.1__tar.gz → 16.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +3 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +3 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/.pre-commit-config.yaml +7 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/CHANGELOG.md +76 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/PKG-INFO +5 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/README.md +1 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/bosnian.md +2 -2
- scandeval-16.9.0/docs/datasets/catalan.md +536 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/croatian.md +2 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/czech.md +4 -4
- scandeval-16.9.0/docs/datasets/hungarian.md +522 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/lithuanian.md +91 -1
- scandeval-16.9.0/docs/datasets/romanian.md +524 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/serbian.md +2 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/spanish.md +72 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/swedish.md +77 -0
- scandeval-16.9.0/docs/leaderboards/Monolingual/bosnian.md +26 -0
- scandeval-16.9.0/docs/leaderboards/Monolingual/catalan.md +26 -0
- scandeval-16.9.0/docs/leaderboards/Monolingual/hungarian.md +26 -0
- scandeval-16.9.0/docs/leaderboards/Monolingual/slovene.md +26 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/romance.md +1 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/slavic.md +1 -1
- scandeval-16.9.0/docs/python-package.md +394 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/pyproject.toml +9 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/__init__.py +3 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_config_factory.py +8 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/base.py +3 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/fresh.py +4 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/hf.py +38 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/litellm.py +59 -42
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/vllm.py +42 -31
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmarker.py +26 -17
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/cli.py +11 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/data_loading.py +4 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/data_models.py +3 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/__init__.py +15 -4
- scandeval-16.9.0/src/scandeval/dataset_configs/catalan.py +64 -0
- scandeval-16.9.0/src/scandeval/dataset_configs/hungarian.py +64 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/lithuanian.py +15 -4
- scandeval-16.9.0/src/scandeval/dataset_configs/romanian.py +65 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/spanish.py +9 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/swedish.py +9 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/finetuning.py +5 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/generation.py +1 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/generation_utils.py +30 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/logging_utils.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/llm_as_a_judge.py +1 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/pipeline.py +2 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/model_cache.py +19 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/model_loading.py +1 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/linguistic_acceptability.py +38 -25
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/multiple_choice.py +49 -40
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/named_entity_recognition.py +81 -34
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/reading_comprehension.py +66 -74
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/sentiment_classification.py +55 -40
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/summarization.py +43 -24
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/multiple_choice_classification.py +8 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/question_answering.py +30 -19
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/sequence_classification.py +4 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/text_to_text.py +3 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/token_classification.py +6 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/tokenisation_utils.py +21 -21
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/types.py +9 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/utils.py +23 -16
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/constants.py +3 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_allocine.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_arc.py +13 -10
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_arc_is.py +16 -11
- scandeval-16.9.0/src/scripts/create_atsiliepimai.py +83 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_belebele.py +11 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_bg_ner_bsnlp.py +6 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_boolq_pt.py +12 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_cinexio.py +9 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_cnn_dailymail.py +10 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_conll_en.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_conll_es.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_conll_nl.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_copa_lv.py +9 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_cross_domain_uk_reviews.py +16 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_cs_gec.py +16 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_csfd_sentiment.py +8 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_csfd_sentiment_sk.py +6 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_czech_news.py +15 -7
- scandeval-16.9.0/src/scripts/create_dacsa.py +117 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_dane.py +5 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_danish_citizen_tests.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_dansk.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_danske_talemaader.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_danske_talemaader_old.py +10 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_dbrd.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_dutch_cola.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_elner.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_eltec.py +9 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_err_news.py +13 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_estner.py +6 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_estonian_valence.py +7 -10
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_european_values.py +5 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_exam_et.py +10 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_exams_bg.py +11 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_fone.py +7 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_foqa.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_fosent.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_fquad.py +11 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_fullstack_ner.py +23 -14
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_germanquad.py +13 -10
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_germeval.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_global_mmlu.py +14 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_goldenswag.py +14 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_grammar_et.py +9 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_greek_sa.py +12 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_greek_wikipedia.py +10 -5
- scandeval-16.9.0/src/scripts/create_guia_cat.py +126 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_harem.py +11 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_hellaswag.py +12 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_hellaswag_cs.py +12 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_hellaswag_fi.py +16 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_hotter_and_colder_sentiment.py +9 -6
- scandeval-16.9.0/src/scripts/create_hun_sum.py +237 -0
- scandeval-16.9.0/src/scripts/create_husst.py +128 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_ice_linguistic.py +17 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_icelandic_error_corpus.py +30 -20
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_icelandic_knowledge.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_icelandic_qa.py +21 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_icesum.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_idioms_no.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_ilpost_sum.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_jentoft.py +14 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_kpwr_ner.py +10 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_latvian_lsm_summary.py +15 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_latvian_twitter_sentiment.py +16 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_life_in_the_uk.py +12 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_lithuanian_lrytas_summarization.py +15 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_llmzszl.py +14 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_lr_sum.py +12 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_lt_emotions.py +12 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_lt_history.py +10 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mlqa_es.py +9 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mlsum_de.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mlsum_es.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mmlu.py +19 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mmlu_et.py +11 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mmlu_hr.py +12 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mmlu_lv.py +19 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mms.py +10 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_multi_wiki_qa.py +16 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_multinerd-it.py +9 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_ner_uk.py +14 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_no_cola.py +13 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_no_sammendrag.py +12 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_nor_common_sense_qa.py +14 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_nordjylland_news.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_norglm_multiqa.py +18 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_norglm_multisum.py +12 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_norne.py +14 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_norquad.py +12 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_nqii.py +17 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_nrk_quiz_qa.py +15 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_orange_sum.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_personal_sum.py +8 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_polemo2.py +10 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_poner.py +10 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_poquad.py +19 -10
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_psc.py +15 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_publico.py +2 -1
- scandeval-16.9.0/src/scripts/create_ronec.py +166 -0
- scandeval-16.9.0/src/scripts/create_rosent.py +227 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_rrn.py +12 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sb10k.py +11 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_scala.py +33 -13
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_scandiqa.py +13 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_scandisent_fi.py +11 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_schibsted.py +12 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sentiment_headlines_es.py +13 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sentinews.py +14 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sentipolc16.py +11 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_skolprov.py +10 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sqad.py +21 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_squad.py +19 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_squad_it.py +19 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_squad_nl.py +16 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_squad_nl_old.py +15 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_ssj500k_ner.py +12 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sst2_pt.py +25 -11
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_sst5.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_suc3.py +13 -7
- scandeval-16.9.0/src/scripts/create_sumo_ro.py +138 -0
- scandeval-16.9.0/src/scripts/create_swedish_facts.py +246 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_swedn.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_swerec.py +14 -5
- scandeval-16.9.0/src/scripts/create_szeged_ner.py +176 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_trivia_et.py +13 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_turku_ner_fi.py +9 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_tydiqa_fi.py +17 -10
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_umimeto_qa.py +7 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_uner_sk.py +10 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_uner_sr.py +14 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_wiki_lingua_nl.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_wikiann.py +6 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_wikineural-it.py +5 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_winogrande.py +16 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_winogrande_et.py +17 -12
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_winogrande_is.py +11 -7
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_xlsum_fi.py +11 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_xquad.py +15 -8
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/load_ud_pos.py +54 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/conftest.py +15 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_benchmark_config_factory.py +25 -5
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_benchmarker.py +26 -9
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_callbacks.py +1 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_cli.py +3 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_data_loading.py +8 -4
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_data_models.py +3 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_dataset_configs.py +9 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_finetuning.py +2 -2
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_model_config.py +0 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_model_loading.py +12 -6
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_speed_benchmark.py +0 -1
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_tokenisation_utils.py +0 -3
- {scandeval-16.7.1 → scandeval-16.9.0}/uv.lock +30 -67
- scandeval-16.7.1/docs/python-package.md +0 -130
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.github/workflows/ci.yaml +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.gitignore +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/.markdownlint.jsonc +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/CITATION.cff +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/CODE_OF_CONDUCT.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/CONTRIBUTING.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/Dockerfile.cuda +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/LICENSE +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/NEW_DATASET_GUIDE.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/CNAME +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/README.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/README.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/bulgarian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/danish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/dutch.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/english.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/estonian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/faroese.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/finnish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/french.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/german.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/greek.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/icelandic.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/italian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/latvian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/norwegian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/polish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/portuguese.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/slovak.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/slovene.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/datasets/ukrainian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/extras/radial_plotter.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/faq.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/gfx/favicon.png +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/bulgarian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/croatian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/czech.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/danish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/english.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/estonian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/french.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/german.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/greek.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/italian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/latvian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/lithuanian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/polish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/portuguese.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/serbian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/slovak.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Monolingual/ukrainian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/baltic.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/european.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/finnic.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/leaderboards/README.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/methodology.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/README.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/common-sense-reasoning.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/knowledge.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/linguistic-acceptability.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/named-entity-recognition.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/reading-comprehension.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/sentiment-classification.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/speed.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/docs/tasks/summarization.md +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/gfx/euroeval.png +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/gfx/euroeval.xcf +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/gfx/scandeval.png +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/makefile +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/mkdocs.yaml +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/benchmark_modules/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/caching_utils.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/callbacks.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/constants.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/bosnian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/bulgarian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/croatian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/czech.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/danish.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/dutch.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/english.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/estonian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/faroese.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/finnish.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/french.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/german.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/greek.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/icelandic.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/italian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/latvian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/norwegian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/polish.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/portuguese.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/serbian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/slovak.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/slovene.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/dataset_configs/ukrainian.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/enums.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/exceptions.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/languages.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/base.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/huggingface.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/metrics/speed.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/model_config.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/classification.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/prompt_templates/token_classification.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/scores.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/speed_benchmark.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/task_group_utils/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scandeval/tasks.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_angry_tweets.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_mim_gold_ner.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/create_norec.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/fix_dot_env_file.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/src/scripts/versioning.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_benchmark_modules/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_benchmark_modules/test_hf.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_constants.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_enums.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_exceptions.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_languages.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scores.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/__init__.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_create_scala.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/de_gsd-ud-train.conllu.adp_det +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/empty.file +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/en_gum-ud-train.conllu.case +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_01 +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_02 +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_03 +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_types.py +0 -0
- {scandeval-16.7.1 → scandeval-16.9.0}/tests/test_utils.py +0 -0
|
@@ -26,6 +26,7 @@ body:
|
|
|
26
26
|
options:
|
|
27
27
|
- label: Bulgarian
|
|
28
28
|
- label: Bosnian
|
|
29
|
+
- label: Catalan
|
|
29
30
|
- label: Croatian
|
|
30
31
|
- label: Czech
|
|
31
32
|
- label: Danish
|
|
@@ -37,6 +38,7 @@ body:
|
|
|
37
38
|
- label: French
|
|
38
39
|
- label: German
|
|
39
40
|
- label: Greek
|
|
41
|
+
- label: Hungarian
|
|
40
42
|
- label: Icelandic
|
|
41
43
|
- label: Italian
|
|
42
44
|
- label: Latvian
|
|
@@ -44,6 +46,7 @@ body:
|
|
|
44
46
|
- label: Norwegian (Bokmål or Nynorsk)
|
|
45
47
|
- label: Polish
|
|
46
48
|
- label: Portuguese
|
|
49
|
+
- label: Romanian
|
|
47
50
|
- label: Serbian
|
|
48
51
|
- label: Slovak
|
|
49
52
|
- label: Slovenian
|
|
@@ -20,11 +20,12 @@ body:
|
|
|
20
20
|
options:
|
|
21
21
|
- label: Baltic languages (Latvian, Lithuanian)
|
|
22
22
|
- label: Finnic languages (Estonian, Finnish)
|
|
23
|
-
- label:
|
|
24
|
-
- label: Romance languages (French, Italian, Portuguese, Spanish)
|
|
23
|
+
- label: Romance languages (Catalan, French, Italian, Portuguese, Romanian, Spanish)
|
|
25
24
|
- label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
|
|
26
25
|
- label: Slavic languages (Bulgarian, Bosnian, Croatian, Czech, Polish, Serbian, Slovak, Slovenian, Ukrainian)
|
|
27
26
|
- label: West Germanic languages (Dutch, English, German)
|
|
27
|
+
- label: Greek
|
|
28
|
+
- label: Hungarian
|
|
28
29
|
validations:
|
|
29
30
|
required: true
|
|
30
31
|
- type: dropdown
|
|
@@ -10,7 +10,7 @@ repos:
|
|
|
10
10
|
- id: trailing-whitespace
|
|
11
11
|
- id: debug-statements
|
|
12
12
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
-
rev: v0.14.
|
|
13
|
+
rev: v0.14.9
|
|
14
14
|
hooks:
|
|
15
15
|
- id: ruff
|
|
16
16
|
args:
|
|
@@ -33,18 +33,14 @@ repos:
|
|
|
33
33
|
rev: 0.8.2
|
|
34
34
|
hooks:
|
|
35
35
|
- id: nbstripout
|
|
36
|
-
- repo: https://github.com/pre-commit
|
|
37
|
-
rev:
|
|
36
|
+
- repo: https://github.com/facebook/pyrefly-pre-commit
|
|
37
|
+
rev: 0.46.0
|
|
38
38
|
hooks:
|
|
39
|
-
- id:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
- --non-interactive
|
|
43
|
-
- --ignore-missing-imports
|
|
44
|
-
- --show-error-codes
|
|
45
|
-
- --check-untyped-defs
|
|
39
|
+
- id: pyrefly-check
|
|
40
|
+
name: Pyrefly (type checking)
|
|
41
|
+
pass_filenames: true
|
|
46
42
|
- repo: https://github.com/DavidAnson/markdownlint-cli2
|
|
47
|
-
rev: v0.
|
|
43
|
+
rev: v0.20.0
|
|
48
44
|
hooks:
|
|
49
45
|
- id: markdownlint-cli2
|
|
50
46
|
args:
|
|
@@ -7,6 +7,71 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [v16.9.0] - 2025-12-16
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Added the Swedish factual knowledge dataset SwedishFacts, which is based on the
|
|
15
|
+
[liu-nlp/swedish-facts-v1](https://huggingface.co/datasets/liu-nlp/swedish-facts-v1)
|
|
16
|
+
dataset. This was contributed by @oliverkinch ✨
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- When a model has registered the number of parameters wrongly within their safetensors
|
|
21
|
+
files, we collect all the potential parameter counts from the safetensors file and
|
|
22
|
+
pick the largest one.
|
|
23
|
+
- We have now pinned vLLM to v0.11.0, as all future versions (up to and including v0.12.0)
|
|
24
|
+
have breaking changes regarding loading of Mistral models. We aim to unpin this when a
|
|
25
|
+
new vLLM version fixes this.
|
|
26
|
+
- Removed mentions of `hf_transfer` and the associated environment variable
|
|
27
|
+
`HF_HUB_ENABLE_HF_TRANSFER`, since this has been removed from the `transformers`
|
|
28
|
+
library now.
|
|
29
|
+
- Marked the `PleIAs/Pleias-3b-Preview` as requiring the `TRITON_ATTN` backend over the
|
|
30
|
+
default `FLASHINFER` backend, as the model architecture is currently not supported by
|
|
31
|
+
the default backend.
|
|
32
|
+
|
|
33
|
+
## [v16.8.0] - 2025-11-25
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
|
|
37
|
+
- Added support for Romanian 🇷🇴! This includes the sentiment classification dataset
|
|
38
|
+
RoSent, the linguistic acceptability dataset ScaLA-ro, the named entity recognition
|
|
39
|
+
dataset RoNEC, the reading comprehension dataset MultiWikiQA-ro, the summarisation
|
|
40
|
+
dataset SumO-Ro, the knowledge dataset Global-MMLU-ro, and the common-sense
|
|
41
|
+
reasoning dataset Winogrande-ro. This was contributed by @oliverkinch ✨
|
|
42
|
+
- Added support for Hungarian 🇭🇺! This includes the sentiment classification dataset
|
|
43
|
+
HuSST, the linguistic acceptability dataset ScaLA-hu, the named entity recognition
|
|
44
|
+
dataset SzegedNER, the reading comprehension dataset MultiWikiQA-hu, the
|
|
45
|
+
summarisation dataset HunSum, the knowledge dataset MMLU-hu, and the common-sense
|
|
46
|
+
reasoning dataset Winogrande-hu. This was contributed by @oliverkinch ✨
|
|
47
|
+
- Added support for Catalan! This includes the sentiment classification dataset
|
|
48
|
+
GuiaCat, the linguistic acceptability dataset ScaLA-ca, the named entity recognition
|
|
49
|
+
dataset WikiANN-ca, the reading comprehension dataset MultiWikiQA-ca, the summarisation
|
|
50
|
+
dataset DACSA-ca, the knowledge dataset MMLU-ca, and the common-sense reasoning dataset
|
|
51
|
+
Winogrande-ca. This was contributed by @oliverkinch ✨
|
|
52
|
+
- Added Spanish summarisation dataset DACSA-es as an unofficial dataset.
|
|
53
|
+
- Added Lithuanian sentiment classification dataset Atsiliepimai to replace the now
|
|
54
|
+
unofficial Lithuanian Emotions dataset. This was contributed by @oliverkinch ✨
|
|
55
|
+
- Added new `--custom-datasets-file` (`custom_datasets_file` in the `Benchmarker` API)
|
|
56
|
+
argument, which can be used to specify a custom Python file containing custom dataset
|
|
57
|
+
definitions. It defaults to `custom_datasets.py` in the current working directory.
|
|
58
|
+
|
|
59
|
+
### Changed
|
|
60
|
+
|
|
61
|
+
- When evaluating models with the `--debug` flag (`debug=True` in the `Benchmarker`
|
|
62
|
+
API), we now include the full model inputs and outputs in the JSON file stored to the
|
|
63
|
+
current working directory, where we previously only included the model outputs.
|
|
64
|
+
|
|
65
|
+
### Fixed
|
|
66
|
+
|
|
67
|
+
- When encountering rate limits for API inference models, we ended up waiting 10 seconds
|
|
68
|
+
for each request, which was unnecessarily long. We now only wait 10 seconds for each
|
|
69
|
+
batch of requests.
|
|
70
|
+
- Uses the `FLASH_ATTN` backend with vLLM for Gemma-3-1b and Gemma-3n models now and the
|
|
71
|
+
`TRITON_ATTN` backend with the other Gemma-3 models, as their architecture is currently not
|
|
72
|
+
supported by the default `FLASHINFER` backend. Note that this can always be changed
|
|
73
|
+
manually with the `VLLM_ATTENTION_BACKEND` environment variable.
|
|
74
|
+
|
|
10
75
|
## [v16.7.1] - 2025-11-18
|
|
11
76
|
|
|
12
77
|
### Fixed
|
|
@@ -23,7 +88,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
23
88
|
|
|
24
89
|
- Added support for Bosnian 🇧🇦! This includes the sentiment classification dataset
|
|
25
90
|
MMS-bs, the named entity recognition dataset WikiANN-bs, the reading comprehension
|
|
26
|
-
dataset MultiWikiQA-bs, and the summarisation dataset LR-Sum-bs.
|
|
91
|
+
dataset MultiWikiQA-bs, and the summarisation dataset LR-Sum-bs. This was contributed
|
|
92
|
+
by @oliverkinch ✨
|
|
27
93
|
- Now allows the 'low', 'medium' and 'high' reasoning effort parameters for the GPT-OSS
|
|
28
94
|
models, which can be set by appending `#low`, `#medium` or `#high` to the model ID.
|
|
29
95
|
|
|
@@ -65,7 +131,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
65
131
|
- Added support for Croatian 🇭🇷! This includes the sentiment classification dataset
|
|
66
132
|
MMS-hr, the linguistic acceptability dataset ScaLA-hr, the named entity recognition
|
|
67
133
|
dataset WikiANN-hr, the reading comprehension dataset MultiWikiQA-hr, the knowledge
|
|
68
|
-
dataset MMLU-hr, and the common-sense reasoning dataset Winogrande-hr.
|
|
134
|
+
dataset MMLU-hr, and the common-sense reasoning dataset Winogrande-hr. This was
|
|
135
|
+
contributed by @oliverkinch ✨
|
|
69
136
|
- Added a system dependency check for `nvcc` in the `VLLMModel.__init__` method to
|
|
70
137
|
ensure the CUDA Toolkit is installed. Raises an error with installation instructions
|
|
71
138
|
if NVCC is not available in the system PATH.
|
|
@@ -86,11 +153,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
86
153
|
|
|
87
154
|
### Added
|
|
88
155
|
|
|
89
|
-
- Added support for Slovene 🇸🇮! This includes the sentiment classification dataset
|
|
90
|
-
Sentinews, the linguistic acceptability dataset ScaLA-sl, the named entity recognition
|
|
91
|
-
dataset ssj500k-NER, the reading comprehension
|
|
92
|
-
dataset MultiWikiQA-sl, the knowledge dataset MMLU-sl, and the common-sense reasoning
|
|
93
|
-
dataset Winogrande-sl.
|
|
94
156
|
- Added better support for evaluating on custom datasets, by allowing `DatasetConfig`
|
|
95
157
|
objects directly in the `Benchmarker.benchmark` method. We also support custom
|
|
96
158
|
datasets with the CLI, by simply defining the desired `DatasetConfig`s in a
|
|
@@ -99,6 +161,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
99
161
|
with the new `source` argument. This argument can both be the Hugging Face Hub ID of
|
|
100
162
|
the dataset or a dictionary with 'train', 'val' and 'test', and values the paths to
|
|
101
163
|
the CSV files.
|
|
164
|
+
- Added support for Slovene 🇸🇮! This includes the sentiment classification dataset
|
|
165
|
+
Sentinews, the linguistic acceptability dataset ScaLA-sl, the named entity recognition
|
|
166
|
+
dataset ssj500k-NER, the reading comprehension
|
|
167
|
+
dataset MultiWikiQA-sl, the knowledge dataset MMLU-sl, and the common-sense reasoning
|
|
168
|
+
dataset Winogrande-sl. This was contributed by @oliverkinch ✨
|
|
102
169
|
- Added support for Serbian 🇷🇸! This includes the sentiment classification dataset
|
|
103
170
|
MMS-sr, the linguistic acceptability dataset ScaLA-sr, the named entity recognition
|
|
104
171
|
dataset UNER-sr, the reading comprehension dataset MultiWikiQA-sr, the summarisation
|
|
@@ -2691,8 +2758,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
|
|
|
2691
2758
|
|
|
2692
2759
|
### Deprecated
|
|
2693
2760
|
|
|
2694
|
-
- Deprecated support for evaluating finetuned models, as the package was primarily used
|
|
2695
|
-
benchmark pretrained models anyway, and the change in datasets means that many
|
|
2761
|
+
- Deprecated support for evaluating finetuned models, as the package was primarily used
|
|
2762
|
+
to benchmark pretrained models anyway, and the change in datasets means that many
|
|
2696
2763
|
finetuned models would have been trained on (part of) the test sets, resulting in
|
|
2697
2764
|
artificially large scores. For evaluation of finetuned models, please check out the
|
|
2698
2765
|
`aiai_eval` Python package instead (under development).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ScandEval
|
|
3
|
-
Version: 16.7.1
|
|
3
|
+
Version: 16.9.0
|
|
4
4
|
Summary: The robust European language model benchmark.
|
|
5
5
|
Project-URL: Repository, https://github.com/EuroEval/EuroEval
|
|
6
6
|
Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
|
|
@@ -39,6 +39,7 @@ Requires-Dist: evaluate>=0.4.1
|
|
|
39
39
|
Requires-Dist: huggingface-hub>=0.30.1
|
|
40
40
|
Requires-Dist: levenshtein>=0.24.0
|
|
41
41
|
Requires-Dist: litellm>=1.75.6
|
|
42
|
+
Requires-Dist: mistral-common[soundfile]
|
|
42
43
|
Requires-Dist: more-itertools>=10.5.0
|
|
43
44
|
Requires-Dist: numpy>=2.0.0
|
|
44
45
|
Requires-Dist: ollama>=0.5.1
|
|
@@ -62,12 +63,12 @@ Provides-Extra: all
|
|
|
62
63
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
|
|
63
64
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
|
|
64
65
|
Requires-Dist: timm>=1.0.19; extra == 'all'
|
|
65
|
-
Requires-Dist: vllm[flashinfer]
|
|
66
|
+
Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'all'
|
|
66
67
|
Provides-Extra: generative
|
|
67
68
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
|
|
68
69
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
|
|
69
70
|
Requires-Dist: timm>=1.0.19; extra == 'generative'
|
|
70
|
-
Requires-Dist: vllm[flashinfer]
|
|
71
|
+
Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'generative'
|
|
71
72
|
Description-Content-Type: text/markdown
|
|
72
73
|
|
|
73
74
|
<!-- This disables the requirement that the first line is a top-level heading -->
|
|
@@ -92,7 +93,7 @@ ______________________________________________________________________
|
|
|
92
93
|
[](https://arxiv.org/abs/2406.13469)
|
|
93
94
|
[](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
|
|
94
95
|
[](https://github.com/EuroEval/EuroEval/commits/main)
|
|
95
|
-
[](https://github.com/EuroEval/EuroEval/tree/main/tests)
|
|
96
97
|
[](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
|
|
97
98
|
|
|
98
99
|
## Maintainer
|
|
@@ -20,7 +20,7 @@ ______________________________________________________________________
|
|
|
20
20
|
[](https://arxiv.org/abs/2406.13469)
|
|
21
21
|
[](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
|
|
22
22
|
[](https://github.com/EuroEval/EuroEval/commits/main)
|
|
23
|
-
[](https://github.com/EuroEval/EuroEval/tree/main/tests)
|
|
24
24
|
[](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
|
|
25
25
|
|
|
26
26
|
## Maintainer
|
|
@@ -9,8 +9,8 @@ information about what these constitute.
|
|
|
9
9
|
### MMS-bs
|
|
10
10
|
|
|
11
11
|
This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2306.07902).
|
|
12
|
-
The corpus consists of 79 manually selected datasets from over 350 datasets reported in
|
|
13
|
-
scientific literature based on strict quality criteria.
|
|
12
|
+
The corpus consists of 79 manually selected datasets from over 350 datasets reported in
|
|
13
|
+
the scientific literature based on strict quality criteria.
|
|
14
14
|
|
|
15
15
|
The original dataset contains a single split with 36,183 Bosnian samples.
|
|
16
16
|
We use 1,024 / 256 / 2,048 samples for our training, validation, and test splits,
|