ScandEval 16.8.0__tar.gz → 16.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (370) hide show
  1. {scandeval-16.8.0 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +3 -2
  2. {scandeval-16.8.0 → scandeval-16.9.0}/.pre-commit-config.yaml +4 -4
  3. {scandeval-16.8.0 → scandeval-16.9.0}/CHANGELOG.md +25 -2
  4. {scandeval-16.8.0 → scandeval-16.9.0}/PKG-INFO +4 -3
  5. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/bosnian.md +2 -2
  6. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/croatian.md +2 -2
  7. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/czech.md +4 -4
  8. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/lithuanian.md +3 -3
  9. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/serbian.md +2 -2
  10. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/swedish.md +77 -0
  11. scandeval-16.9.0/docs/leaderboards/Monolingual/bosnian.md +26 -0
  12. scandeval-16.9.0/docs/leaderboards/Monolingual/catalan.md +26 -0
  13. scandeval-16.9.0/docs/leaderboards/Monolingual/hungarian.md +26 -0
  14. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/romance.md +1 -1
  15. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/slavic.md +1 -1
  16. scandeval-16.9.0/docs/python-package.md +394 -0
  17. {scandeval-16.8.0 → scandeval-16.9.0}/makefile +1 -1
  18. {scandeval-16.8.0 → scandeval-16.9.0}/pyproject.toml +9 -4
  19. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/hf.py +18 -3
  20. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/vllm.py +13 -6
  21. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmarker.py +0 -11
  22. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/swedish.py +9 -0
  23. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/llm_as_a_judge.py +1 -3
  24. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/model_config.py +2 -2
  25. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/question_answering.py +30 -19
  26. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/sequence_classification.py +4 -4
  27. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/text_to_text.py +3 -4
  28. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/token_classification.py +6 -8
  29. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/tokenisation_utils.py +7 -1
  30. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/types.py +7 -1
  31. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_allocine.py +7 -4
  32. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_arc.py +13 -10
  33. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_arc_is.py +16 -11
  34. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_atsiliepimai.py +9 -4
  35. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_belebele.py +11 -8
  36. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_bg_ner_bsnlp.py +6 -4
  37. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_boolq_pt.py +12 -6
  38. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_cinexio.py +9 -6
  39. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_cnn_dailymail.py +10 -7
  40. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_conll_en.py +5 -3
  41. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_conll_es.py +5 -3
  42. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_conll_nl.py +5 -3
  43. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_copa_lv.py +9 -6
  44. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_cross_domain_uk_reviews.py +16 -8
  45. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_cs_gec.py +16 -4
  46. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_csfd_sentiment.py +8 -4
  47. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_csfd_sentiment_sk.py +6 -4
  48. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_czech_news.py +15 -7
  49. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_dacsa.py +10 -6
  50. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_dane.py +5 -6
  51. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_danish_citizen_tests.py +7 -4
  52. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_dansk.py +7 -4
  53. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_danske_talemaader.py +7 -4
  54. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_danske_talemaader_old.py +10 -7
  55. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_dbrd.py +7 -4
  56. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_dutch_cola.py +7 -4
  57. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_elner.py +5 -3
  58. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_eltec.py +9 -7
  59. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_err_news.py +13 -8
  60. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_estner.py +6 -2
  61. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_estonian_valence.py +7 -10
  62. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_european_values.py +5 -2
  63. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_exam_et.py +10 -9
  64. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_exams_bg.py +11 -8
  65. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_fone.py +7 -5
  66. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_foqa.py +5 -3
  67. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_fosent.py +7 -4
  68. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_fquad.py +11 -8
  69. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_fullstack_ner.py +23 -14
  70. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_germanquad.py +13 -10
  71. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_germeval.py +5 -3
  72. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_global_mmlu.py +13 -10
  73. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_goldenswag.py +14 -9
  74. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_grammar_et.py +9 -7
  75. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_greek_sa.py +12 -7
  76. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_greek_wikipedia.py +10 -5
  77. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_guia_cat.py +15 -5
  78. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_harem.py +11 -9
  79. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_hellaswag.py +12 -9
  80. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_hellaswag_cs.py +12 -9
  81. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_hellaswag_fi.py +16 -11
  82. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_hotter_and_colder_sentiment.py +9 -6
  83. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_hun_sum.py +21 -7
  84. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_husst.py +13 -4
  85. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_ice_linguistic.py +17 -8
  86. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_icelandic_error_corpus.py +30 -20
  87. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_icelandic_knowledge.py +11 -4
  88. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_icelandic_qa.py +21 -11
  89. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_icesum.py +7 -4
  90. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_idioms_no.py +11 -4
  91. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_ilpost_sum.py +11 -4
  92. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_jentoft.py +14 -9
  93. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_kpwr_ner.py +10 -4
  94. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_latvian_lsm_summary.py +15 -6
  95. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_latvian_twitter_sentiment.py +16 -8
  96. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_life_in_the_uk.py +12 -9
  97. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_lithuanian_lrytas_summarization.py +15 -6
  98. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_llmzszl.py +14 -9
  99. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_lr_sum.py +12 -5
  100. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_lt_emotions.py +12 -5
  101. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_lt_history.py +10 -6
  102. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mlqa_es.py +9 -5
  103. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mlsum_de.py +11 -4
  104. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mlsum_es.py +11 -4
  105. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mmlu.py +17 -11
  106. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mmlu_et.py +11 -8
  107. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mmlu_hr.py +12 -6
  108. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mmlu_lv.py +19 -11
  109. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mms.py +10 -4
  110. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_multi_wiki_qa.py +13 -9
  111. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_multinerd-it.py +9 -3
  112. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_ner_uk.py +14 -4
  113. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_no_cola.py +13 -8
  114. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_no_sammendrag.py +12 -4
  115. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_nor_common_sense_qa.py +14 -7
  116. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_nordjylland_news.py +11 -4
  117. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_norglm_multiqa.py +18 -8
  118. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_norglm_multisum.py +12 -4
  119. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_norne.py +14 -4
  120. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_norquad.py +12 -8
  121. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_nqii.py +17 -9
  122. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_nrk_quiz_qa.py +15 -8
  123. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_orange_sum.py +11 -4
  124. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_personal_sum.py +8 -5
  125. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_polemo2.py +10 -7
  126. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_poner.py +10 -3
  127. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_poquad.py +19 -10
  128. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_psc.py +15 -6
  129. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_publico.py +2 -1
  130. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_ronec.py +11 -5
  131. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_rosent.py +17 -5
  132. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_rrn.py +12 -4
  133. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sb10k.py +11 -5
  134. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_scala.py +27 -13
  135. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_scandiqa.py +13 -9
  136. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_scandisent_fi.py +11 -7
  137. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_schibsted.py +12 -5
  138. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sentiment_headlines_es.py +13 -4
  139. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sentinews.py +14 -4
  140. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sentipolc16.py +11 -5
  141. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_skolprov.py +10 -7
  142. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sqad.py +21 -7
  143. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_squad.py +19 -9
  144. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_squad_it.py +19 -9
  145. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_squad_nl.py +16 -9
  146. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_squad_nl_old.py +15 -9
  147. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_ssj500k_ner.py +12 -6
  148. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sst2_pt.py +25 -11
  149. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sst5.py +7 -4
  150. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_suc3.py +13 -7
  151. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_sumo_ro.py +14 -7
  152. scandeval-16.9.0/src/scripts/create_swedish_facts.py +246 -0
  153. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_swedn.py +11 -4
  154. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_swerec.py +14 -5
  155. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_szeged_ner.py +11 -4
  156. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_trivia_et.py +13 -4
  157. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_turku_ner_fi.py +9 -4
  158. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_tydiqa_fi.py +17 -10
  159. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_umimeto_qa.py +7 -4
  160. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_uner_sk.py +10 -4
  161. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_uner_sr.py +14 -4
  162. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_wiki_lingua_nl.py +11 -4
  163. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_wikiann.py +5 -3
  164. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_wikineural-it.py +5 -3
  165. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_winogrande.py +13 -9
  166. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_winogrande_et.py +17 -12
  167. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_winogrande_is.py +11 -7
  168. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_xlsum_fi.py +11 -4
  169. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_xquad.py +15 -8
  170. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_benchmarker.py +1 -6
  171. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_data_loading.py +0 -2
  172. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_data_models.py +0 -1
  173. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_model_config.py +0 -1
  174. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_model_loading.py +0 -3
  175. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_speed_benchmark.py +0 -1
  176. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_tokenisation_utils.py +0 -3
  177. {scandeval-16.8.0 → scandeval-16.9.0}/uv.lock +30 -18
  178. scandeval-16.8.0/AGENTS.md +0 -121
  179. scandeval-16.8.0/docs/python-package.md +0 -130
  180. {scandeval-16.8.0 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  181. {scandeval-16.8.0 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  182. {scandeval-16.8.0 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  183. {scandeval-16.8.0 → scandeval-16.9.0}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
  184. {scandeval-16.8.0 → scandeval-16.9.0}/.github/workflows/ci.yaml +0 -0
  185. {scandeval-16.8.0 → scandeval-16.9.0}/.gitignore +0 -0
  186. {scandeval-16.8.0 → scandeval-16.9.0}/.markdownlint.jsonc +0 -0
  187. {scandeval-16.8.0 → scandeval-16.9.0}/CITATION.cff +0 -0
  188. {scandeval-16.8.0 → scandeval-16.9.0}/CODE_OF_CONDUCT.md +0 -0
  189. {scandeval-16.8.0 → scandeval-16.9.0}/CONTRIBUTING.md +0 -0
  190. {scandeval-16.8.0 → scandeval-16.9.0}/Dockerfile.cuda +0 -0
  191. {scandeval-16.8.0 → scandeval-16.9.0}/LICENSE +0 -0
  192. {scandeval-16.8.0 → scandeval-16.9.0}/NEW_DATASET_GUIDE.md +0 -0
  193. {scandeval-16.8.0 → scandeval-16.9.0}/README.md +0 -0
  194. {scandeval-16.8.0 → scandeval-16.9.0}/docs/CNAME +0 -0
  195. {scandeval-16.8.0 → scandeval-16.9.0}/docs/README.md +0 -0
  196. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/README.md +0 -0
  197. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/bulgarian.md +0 -0
  198. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/catalan.md +0 -0
  199. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/danish.md +0 -0
  200. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/dutch.md +0 -0
  201. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/english.md +0 -0
  202. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/estonian.md +0 -0
  203. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/faroese.md +0 -0
  204. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/finnish.md +0 -0
  205. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/french.md +0 -0
  206. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/german.md +0 -0
  207. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/greek.md +0 -0
  208. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/hungarian.md +0 -0
  209. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/icelandic.md +0 -0
  210. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/italian.md +0 -0
  211. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/latvian.md +0 -0
  212. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/norwegian.md +0 -0
  213. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/polish.md +0 -0
  214. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/portuguese.md +0 -0
  215. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/romanian.md +0 -0
  216. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/slovak.md +0 -0
  217. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/slovene.md +0 -0
  218. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/spanish.md +0 -0
  219. {scandeval-16.8.0 → scandeval-16.9.0}/docs/datasets/ukrainian.md +0 -0
  220. {scandeval-16.8.0 → scandeval-16.9.0}/docs/extras/radial_plotter.md +0 -0
  221. {scandeval-16.8.0 → scandeval-16.9.0}/docs/faq.md +0 -0
  222. {scandeval-16.8.0 → scandeval-16.9.0}/docs/gfx/favicon.png +0 -0
  223. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/bulgarian.md +0 -0
  224. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/croatian.md +0 -0
  225. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/czech.md +0 -0
  226. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  227. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  228. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/english.md +0 -0
  229. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/estonian.md +0 -0
  230. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  231. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  232. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/french.md +0 -0
  233. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/german.md +0 -0
  234. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/greek.md +0 -0
  235. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  236. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  237. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/latvian.md +0 -0
  238. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/lithuanian.md +0 -0
  239. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  240. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/polish.md +0 -0
  241. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  242. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/serbian.md +0 -0
  243. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/slovak.md +0 -0
  244. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/slovene.md +0 -0
  245. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  246. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  247. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Monolingual/ukrainian.md +0 -0
  248. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/baltic.md +0 -0
  249. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/european.md +0 -0
  250. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/finnic.md +0 -0
  251. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  252. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  253. {scandeval-16.8.0 → scandeval-16.9.0}/docs/leaderboards/README.md +0 -0
  254. {scandeval-16.8.0 → scandeval-16.9.0}/docs/methodology.md +0 -0
  255. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/README.md +0 -0
  256. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/common-sense-reasoning.md +0 -0
  257. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/knowledge.md +0 -0
  258. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/linguistic-acceptability.md +0 -0
  259. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/named-entity-recognition.md +0 -0
  260. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/reading-comprehension.md +0 -0
  261. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/sentiment-classification.md +0 -0
  262. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/speed.md +0 -0
  263. {scandeval-16.8.0 → scandeval-16.9.0}/docs/tasks/summarization.md +0 -0
  264. {scandeval-16.8.0 → scandeval-16.9.0}/gfx/euroeval.png +0 -0
  265. {scandeval-16.8.0 → scandeval-16.9.0}/gfx/euroeval.xcf +0 -0
  266. {scandeval-16.8.0 → scandeval-16.9.0}/gfx/scandeval.png +0 -0
  267. {scandeval-16.8.0 → scandeval-16.9.0}/mkdocs.yaml +0 -0
  268. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/__init__.py +0 -0
  269. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_config_factory.py +0 -0
  270. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/__init__.py +0 -0
  271. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/base.py +0 -0
  272. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/fresh.py +0 -0
  273. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/benchmark_modules/litellm.py +0 -0
  274. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/caching_utils.py +0 -0
  275. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/callbacks.py +0 -0
  276. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/cli.py +0 -0
  277. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/constants.py +0 -0
  278. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/data_loading.py +0 -0
  279. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/data_models.py +0 -0
  280. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/__init__.py +0 -0
  281. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/bosnian.py +0 -0
  282. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/bulgarian.py +0 -0
  283. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/catalan.py +0 -0
  284. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/croatian.py +0 -0
  285. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/czech.py +0 -0
  286. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/danish.py +0 -0
  287. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/dutch.py +0 -0
  288. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/english.py +0 -0
  289. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/estonian.py +0 -0
  290. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/faroese.py +0 -0
  291. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/finnish.py +0 -0
  292. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/french.py +0 -0
  293. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/german.py +0 -0
  294. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/greek.py +0 -0
  295. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/hungarian.py +0 -0
  296. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/icelandic.py +0 -0
  297. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/italian.py +0 -0
  298. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/latvian.py +0 -0
  299. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/lithuanian.py +0 -0
  300. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/norwegian.py +0 -0
  301. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/polish.py +0 -0
  302. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/portuguese.py +0 -0
  303. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/romanian.py +0 -0
  304. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/serbian.py +0 -0
  305. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/slovak.py +0 -0
  306. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/slovene.py +0 -0
  307. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/spanish.py +0 -0
  308. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/dataset_configs/ukrainian.py +0 -0
  309. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/enums.py +0 -0
  310. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/exceptions.py +0 -0
  311. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/finetuning.py +0 -0
  312. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/generation.py +0 -0
  313. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/generation_utils.py +0 -0
  314. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/languages.py +0 -0
  315. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/logging_utils.py +0 -0
  316. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/__init__.py +0 -0
  317. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/base.py +0 -0
  318. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/huggingface.py +0 -0
  319. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/pipeline.py +0 -0
  320. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/metrics/speed.py +0 -0
  321. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/model_cache.py +0 -0
  322. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/model_loading.py +0 -0
  323. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/__init__.py +0 -0
  324. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/classification.py +0 -0
  325. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/linguistic_acceptability.py +0 -0
  326. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/multiple_choice.py +0 -0
  327. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/named_entity_recognition.py +0 -0
  328. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/reading_comprehension.py +0 -0
  329. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/sentiment_classification.py +0 -0
  330. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/summarization.py +0 -0
  331. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/prompt_templates/token_classification.py +0 -0
  332. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/scores.py +0 -0
  333. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/speed_benchmark.py +0 -0
  334. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/__init__.py +0 -0
  335. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/task_group_utils/multiple_choice_classification.py +0 -0
  336. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/tasks.py +0 -0
  337. {scandeval-16.8.0 → scandeval-16.9.0}/src/scandeval/utils.py +0 -0
  338. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/__init__.py +0 -0
  339. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/constants.py +0 -0
  340. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_angry_tweets.py +0 -0
  341. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_mim_gold_ner.py +0 -0
  342. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/create_norec.py +0 -0
  343. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/fix_dot_env_file.py +0 -0
  344. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/load_ud_pos.py +0 -0
  345. {scandeval-16.8.0 → scandeval-16.9.0}/src/scripts/versioning.py +0 -0
  346. {scandeval-16.8.0 → scandeval-16.9.0}/tests/__init__.py +0 -0
  347. {scandeval-16.8.0 → scandeval-16.9.0}/tests/conftest.py +0 -0
  348. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_benchmark_config_factory.py +0 -0
  349. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_benchmark_modules/__init__.py +0 -0
  350. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  351. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_callbacks.py +0 -0
  352. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_cli.py +0 -0
  353. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_constants.py +0 -0
  354. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_dataset_configs.py +0 -0
  355. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_enums.py +0 -0
  356. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_exceptions.py +0 -0
  357. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_finetuning.py +0 -0
  358. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_languages.py +0 -0
  359. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scores.py +0 -0
  360. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/__init__.py +0 -0
  361. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/__init__.py +0 -0
  362. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_create_scala.py +0 -0
  363. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/de_gsd-ud-train.conllu.adp_det +0 -0
  364. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/empty.file +0 -0
  365. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/en_gum-ud-train.conllu.case +0 -0
  366. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_01 +0 -0
  367. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_02 +0 -0
  368. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_03 +0 -0
  369. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_types.py +0 -0
  370. {scandeval-16.8.0 → scandeval-16.9.0}/tests/test_utils.py +0 -0
@@ -20,11 +20,12 @@ body:
20
20
  options:
21
21
  - label: Baltic languages (Latvian, Lithuanian)
22
22
  - label: Finnic languages (Estonian, Finnish)
23
- - label: Greek
24
23
  - label: Romance languages (Catalan, French, Italian, Portuguese, Romanian, Spanish)
25
24
  - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
26
- - label: Slavic languages (Bulgarian, Bosnian, Croatian, Czech, Hungarian, Polish, Serbian, Slovak, Slovenian, Ukrainian)
25
+ - label: Slavic languages (Bulgarian, Bosnian, Croatian, Czech, Polish, Serbian, Slovak, Slovenian, Ukrainian)
27
26
  - label: West Germanic languages (Dutch, English, German)
27
+ - label: Greek
28
+ - label: Hungarian
28
29
  validations:
29
30
  required: true
30
31
  - type: dropdown
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.14.6
13
+ rev: v0.14.9
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -34,13 +34,13 @@ repos:
34
34
  hooks:
35
35
  - id: nbstripout
36
36
  - repo: https://github.com/facebook/pyrefly-pre-commit
37
- rev: 0.0.1
37
+ rev: 0.46.0
38
38
  hooks:
39
- - id: pyrefly-typecheck-system
39
+ - id: pyrefly-check
40
40
  name: Pyrefly (type checking)
41
41
  pass_filenames: true
42
42
  - repo: https://github.com/DavidAnson/markdownlint-cli2
43
- rev: v0.19.1
43
+ rev: v0.20.0
44
44
  hooks:
45
45
  - id: markdownlint-cli2
46
46
  args:
@@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [v16.9.0] - 2025-12-16
11
+
12
+ ### Added
13
+
14
+ - Added the Swedish factual knowledge dataset SwedishFacts, which is based on the
15
+ [liu-nlp/swedish-facts-v1](https://huggingface.co/datasets/liu-nlp/swedish-facts-v1)
16
+ dataset. This was contributed by @oliverkinch ✨
17
+
18
+ ### Fixed
19
+
20
+ - When a model has registered the number of parameters wrongly within their safetensors
21
+ files, we collect all the potential parameter counts from the safetensors file and
22
+ pick the largest one.
23
+ - We now pin vLLM to v0.11.0, as all later versions (up to and including v0.12.0)
24
+ have breaking changes regarding loading of Mistral models. We aim to unpin this when a
25
+ new vLLM version fixes this.
26
+ - Removed mentions of `hf_transfer` and the associated environment variable
27
+ `HF_HUB_ENABLE_HF_TRANSFER`, since this has been removed from the `transformers`
28
+ library now.
29
+ - Marked `PleIAs/Pleias-3b-Preview` as requiring the `TRITON_ATTN` backend over the
30
+ default `FLASHINFER` backend, as the model architecture is currently not supported by
31
+ the default backend.
32
+
10
33
  ## [v16.8.0] - 2025-11-25
11
34
 
12
35
  ### Added
@@ -2735,8 +2758,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2735
2758
 
2736
2759
  ### Deprecated
2737
2760
 
2738
- - Deprecated support for evaluating finetuned models, as the package was primarily used to
2739
- benchmark pretrained models anyway, and the change in datasets means that many
2761
+ - Deprecated support for evaluating finetuned models, as the package was primarily used
2762
+ to benchmark pretrained models anyway, and the change in datasets means that many
2740
2763
  finetuned models would have been trained on (part of) the test sets, resulting in
2741
2764
  artificially large scores. For evaluation of finetuned models, please check out the
2742
2765
  `aiai_eval` Python package instead (under development).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ScandEval
3
- Version: 16.8.0
3
+ Version: 16.9.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -39,6 +39,7 @@ Requires-Dist: evaluate>=0.4.1
39
39
  Requires-Dist: huggingface-hub>=0.30.1
40
40
  Requires-Dist: levenshtein>=0.24.0
41
41
  Requires-Dist: litellm>=1.75.6
42
+ Requires-Dist: mistral-common[soundfile]
42
43
  Requires-Dist: more-itertools>=10.5.0
43
44
  Requires-Dist: numpy>=2.0.0
44
45
  Requires-Dist: ollama>=0.5.1
@@ -62,12 +63,12 @@ Provides-Extra: all
62
63
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
63
64
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
64
65
  Requires-Dist: timm>=1.0.19; extra == 'all'
65
- Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'all'
66
+ Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'all'
66
67
  Provides-Extra: generative
67
68
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
69
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
70
  Requires-Dist: timm>=1.0.19; extra == 'generative'
70
- Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'generative'
71
+ Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'generative'
71
72
  Description-Content-Type: text/markdown
72
73
 
73
74
  <!-- This disables the requirement that the first line is a top-level heading -->
@@ -9,8 +9,8 @@ information about what these constitute.
9
9
  ### MMS-bs
10
10
 
11
11
  This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2306.07902).
12
- The corpus consists of 79 manually selected datasets from over 350 datasets reported in the
13
- scientific literature based on strict quality criteria.
12
+ The corpus consists of 79 manually selected datasets from over 350 datasets reported in
13
+ the scientific literature based on strict quality criteria.
14
14
 
15
15
  The original dataset contains a single split with 36,183 Bosnian samples.
16
16
  We use 1,024 / 256 / 2,048 samples for our training, validation, and test splits,
@@ -9,8 +9,8 @@ information about what these constitute.
9
9
  ### MMS-hr
10
10
 
11
11
  This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2306.07902).
12
- The corpus consists of 79 manually selected datasets from over 350 datasets reported in the
13
- scientific literature based on strict quality criteria.
12
+ The corpus consists of 79 manually selected datasets from over 350 datasets reported in
13
+ the scientific literature based on strict quality criteria.
14
14
 
15
15
  The original dataset contains a single split with 77,594 Croatian samples.
16
16
  We use 1,024 / 256 / 2,048 samples for our training, validation, and test splits,
@@ -12,10 +12,10 @@ This dataset was published in [this paper](https://aclanthology.org/R13-1016/) a
12
12
  consists of reviews from the Czech Movie
13
13
  Database (CSFD).
14
14
 
15
- The original dataset contains 85,948 / 894 / 1503 samples for the training, validation, and
16
- and test splits, respectively. We use 1,024 / 256 / 2,048 samples for our training,
17
- validation and test splits, respectively. The train and validation splits are subsets
18
- of the original splits. For the test split, we use all available test samples and
15
+ The original dataset contains 85,948 / 894 / 1503 samples for the training, validation,
16
+ and test splits, respectively. We use 1,024 / 256 / 2,048 samples for our training,
17
+ validation and test splits, respectively. The train and validation splits are subsets of
18
+ the original splits. For the test split, we use all available test samples and
19
19
  supplement with additional samples from the training set to reach 2,048 samples in
20
20
  total.
21
21
 
@@ -9,9 +9,9 @@ information about what these constitute.
9
9
  ### Atsiliepimai
10
10
 
11
11
  This dataset was published
12
- [here](https://huggingface.co/datasets/alexandrainst/lithuanian-sentiment-analysis). It was
13
- scraped from [atsiliepimai.lt](https://atsiliepimai.lt/) and
14
- contains reviews similar to trustpilot reviews.
12
+ [here](https://huggingface.co/datasets/alexandrainst/lithuanian-sentiment-analysis). It
13
+ was scraped from [atsiliepimai.lt](https://atsiliepimai.lt/) and contains reviews
14
+ similar to Trustpilot reviews.
15
15
 
16
16
  The original dataset consists of 1,796 samples. We use 512 / 256 / 1,028
17
17
  samples for our training, validation and test splits, respectively.
@@ -9,8 +9,8 @@ information about what these constitute.
9
9
  ### MMS-sr
10
10
 
11
11
  This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2306.07902).
12
- The corpus consists of 79 manually selected datasets from over 350 datasets reported in the
13
- scientific literature based on strict quality criteria.
12
+ The corpus consists of 79 manually selected datasets from over 350 datasets reported in
13
+ the scientific literature based on strict quality criteria.
14
14
 
15
15
  The original dataset contains a single split with 76,368 Serbian samples. We use
16
16
  1,024 / 256 / 2,048 samples for our training, validation and test splits, respectively.
@@ -699,6 +699,83 @@ You can evaluate this dataset directly as follows:
699
699
  euroeval --model <model-id> --dataset skolprov
700
700
  ```
701
701
 
702
+ ### Unofficial: SwedishFacts
703
+
704
+ This is a benchmark for factual knowledge about Sweden.
705
+ The questions are based on topics related to the hosts of the Swedish radio program
706
+ [Sommar i P1](https://www.sverigesradio.se/sommar-i-p1) as well as Swedish sporting
707
+ events, such as those featured in [En Svensk Klassiker](https://ensvenskklassiker.se).
708
+ In the [dataset card](https://huggingface.co/datasets/liu-nlp/swedish-facts-v1)
709
+ it is mentioned that a paper with more information is coming soon.
710
+
711
+ Since the dataset does not include candidate answers, we generate them using GPT-4o.
712
+ The original dataset consists of 1,289 samples. We
713
+ use a 128 / 64 / 1,097 split for training, validation and testing, respectively.
714
+
715
+ Here are a few examples from the training split:
716
+
717
+ ```json
718
+ {
719
+ "text": "Hur många gånger befodrades Micael Bydén till en högre militär grad under 1990-talet?\nSvarsalternativ:\na. Tre, 3\nb. Fyra\nc. Fem\nd. Två",
720
+ "label": "a"
721
+ }
722
+ ```
723
+
724
+ ```json
725
+ {
726
+ "text": "Vad heter skivbolaget Titiyo Jah kontrakt med år 1988?\nSvarsalternativ:\na. Virgin Records\nb. Telegram\nc. Sony Music\nd. Warner Music",
727
+ "label": "b"
728
+ }
729
+ ```
730
+
731
+ ```json
732
+ {
733
+ "text": "I vilken ort föddes PM Nilsson?\nSvarsalternativ:\na. Göteborg\nb. Lund\nc. Helsingborg\nd. Malmö",
734
+ "label": "b"
735
+ }
736
+ ```
737
+
738
+ When evaluating generative models, we use the following setup (see the
739
+ [methodology](/methodology) for more information on how these are used):
740
+
741
+ - Number of few-shot examples: 5
742
+ - Prefix prompt:
743
+
744
+ ```text
745
+ Följande är flervalsfrågor (med svar).
746
+ ```
747
+
748
+ - Base prompt template:
749
+
750
+ ```text
751
+ Fråga: {text}
752
+ Svarsalternativ:
753
+ a. {option_a}
754
+ b. {option_b}
755
+ c. {option_c}
756
+ d. {option_d}
757
+ Svar: {label}
758
+ ```
759
+
760
+ - Instruction-tuned prompt template:
761
+
762
+ ```text
763
+ Fråga: {text}
764
+ Svarsalternativ:
765
+ a. {option_a}
766
+ b. {option_b}
767
+ c. {option_c}
768
+ d. {option_d}
769
+
770
+ Besvara följande fråga med 'a', 'b', 'c' eller 'd', och inget annat.
771
+ ```
772
+
773
+ You can evaluate this dataset directly as follows:
774
+
775
+ ```bash
776
+ euroeval --model <model-id> --dataset swedish-facts
777
+ ```
778
+
702
779
  ## Common-sense Reasoning
703
780
 
704
781
  ### HellaSwag-sv
@@ -0,0 +1,26 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+ # 🇧🇦 Bosnian
6
+
7
+ See the [leaderboard page](/leaderboards) for more information about all the columns.
8
+
9
+ /// tab | Generative Leaderboard
10
+ <iframe title="" aria-label="Table" id="datawrapper-chart-Qs9Zq" src="https://datawrapper.dwcdn.net/Qs9Zq" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="1016" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
11
+ ///
12
+
13
+ /// tab | NLU Leaderboard
14
+ <iframe title="" aria-label="Table" id="datawrapper-chart-PrIYR" src="https://datawrapper.dwcdn.net/PrIYR" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="950" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
15
+ ///
16
+
17
+ /// tab | Generative Scatter Plot
18
+ <iframe title="Few-shot Performance of Generative Language Models on Bosnian Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-nLYsL" src="https://datawrapper.dwcdn.net/nLYsL" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
19
+ ///
20
+
21
+ /// tab | NLU Scatter Plot
22
+ <iframe title="Few-shot Performance of Language Models on Bosnian NLU Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-ibhn6" src="https://datawrapper.dwcdn.net/ibhn6" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
23
+ ///
24
+
25
+ <!-- This disables the requirement that all lines must be shorter than 88 characters -->
26
+ <!-- markdownlint-configure-file { "MD013": false } -->
@@ -0,0 +1,26 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+ # <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/c/ce/Flag_of_Catalonia.svg/960px-Flag_of_Catalonia.svg.png" width="35" alt="Flag of Catalonia"/> Catalan
6
+
7
+ See the [leaderboard page](/leaderboards) for more information about all the columns.
8
+
9
+ /// tab | Generative Leaderboard
10
+ <iframe title="" aria-label="Table" id="datawrapper-chart-pC6Eu" src="https://datawrapper.dwcdn.net/pC6Eu/2/" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="886" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
11
+ ///
12
+
13
+ /// tab | NLU Leaderboard
14
+ <iframe title="" aria-label="Table" id="datawrapper-chart-rYG8W" src="https://datawrapper.dwcdn.net/rYG8W/2/" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="902" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
15
+ ///
16
+
17
+ /// tab | Generative Scatter Plot
18
+ <iframe title="Performance of Generative Language Models on Catalan Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-noEdf" src="https://datawrapper.dwcdn.net/noEdf" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
19
+ ///
20
+
21
+ /// tab | NLU Scatter Plot
22
+ <iframe title="Performance of Language Models on Catalan NLU Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-DH0vi" src="https://datawrapper.dwcdn.net/DH0vi" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
23
+ ///
24
+
25
+ <!-- This disables the requirement that all lines must be shorter than 88 characters -->
26
+ <!-- markdownlint-configure-file { "MD013": false } -->
@@ -0,0 +1,26 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+ # 🇭🇺 Hungarian
6
+
7
+ See the [leaderboard page](/leaderboards) for more information about all the columns.
8
+
9
+ /// tab | Generative Leaderboard
10
+ <iframe title="" aria-label="Table" id="datawrapper-chart-H91LQ" src="https://datawrapper.dwcdn.net/H91LQ" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="855" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
11
+ ///
12
+
13
+ /// tab | NLU Leaderboard
14
+ <iframe title="" aria-label="Table" id="datawrapper-chart-bLogV" src="https://datawrapper.dwcdn.net/bLogV" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="826" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
15
+ ///
16
+
17
+ /// tab | Generative Scatter Plot
18
+ <iframe title="Performance of Generative Language Models on Hungarian Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-7Qn2I" src="https://datawrapper.dwcdn.net/7Qn2I" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
19
+ ///
20
+
21
+ /// tab | NLU Scatter Plot
22
+ <iframe title="Performance of Language Models on Hungarian NLU Tasks by Model Size" aria-label="Scatter Plot" id="datawrapper-chart-F5I8e" src="https://datawrapper.dwcdn.net/F5I8e" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="687" data-external="1"></iframe><script type="text/javascript">window.addEventListener("message",function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}});</script>
23
+ ///
24
+
25
+ <!-- This disables the requirement that all lines must be shorter than 88 characters -->
26
+ <!-- markdownlint-configure-file { "MD013": false } -->
@@ -2,7 +2,7 @@
2
2
  hide:
3
3
  - toc
4
4
  ---
5
- # 🇫🇷🇮🇹🇵🇹🇪🇸 Romance
5
+ # <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/c/ce/Flag_of_Catalonia.svg/960px-Flag_of_Catalonia.svg.png" width="35" alt="Flag of Catalonia"/>🇫🇷🇮🇹🇵🇹🇪🇸 Romance
6
6
 
7
7
  See the [leaderboard page](/leaderboards) for more information about all the columns.
8
8
 
@@ -2,7 +2,7 @@
2
2
  hide:
3
3
  - toc
4
4
  ---
5
- # 🇧🇬🇭🇷🇨🇿🇵🇱🇷🇸🇸🇰🇸🇮🇺🇦 Slavic
5
+ # 🇧🇦🇧🇬🇭🇷🇨🇿🇵🇱🇷🇸🇸🇰🇸🇮🇺🇦 Slavic
6
6
 
7
7
  See the [leaderboard page](/leaderboards) for more information about all the columns.
8
8