EuroEval 15.4.2.tar.gz → 15.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

Files changed (232)
  1. {euroeval-15.4.2 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +6 -12
  2. {euroeval-15.4.2 → euroeval-15.6.0}/.github/workflows/ci.yaml +2 -0
  3. {euroeval-15.4.2 → euroeval-15.6.0}/.gitignore +4 -0
  4. {euroeval-15.4.2 → euroeval-15.6.0}/.pre-commit-config.yaml +1 -1
  5. {euroeval-15.4.2 → euroeval-15.6.0}/CHANGELOG.md +86 -12
  6. {euroeval-15.4.2 → euroeval-15.6.0}/PKG-INFO +31 -9
  7. {euroeval-15.4.2 → euroeval-15.6.0}/README.md +25 -3
  8. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/danish.md +4 -2
  9. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/dutch.md +1 -1
  10. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/english.md +1 -1
  11. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/faroese.md +4 -4
  12. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/icelandic.md +17 -13
  13. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/italian.md +5 -6
  14. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/norwegian.md +18 -9
  15. {euroeval-15.4.2 → euroeval-15.6.0}/makefile +4 -6
  16. {euroeval-15.4.2 → euroeval-15.6.0}/pyproject.toml +19 -7
  17. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/__init__.py +2 -2
  18. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/base.py +3 -2
  19. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/fresh.py +8 -6
  20. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py +44 -33
  21. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/litellm.py +314 -120
  22. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/vllm.py +99 -59
  23. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmarker.py +52 -21
  24. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/callbacks.py +2 -2
  25. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/constants.py +9 -2
  26. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/data_models.py +258 -44
  27. euroeval-15.6.0/src/euroeval/dataset_configs/__init__.py +61 -0
  28. euroeval-15.6.0/src/euroeval/dataset_configs/danish.py +120 -0
  29. euroeval-15.6.0/src/euroeval/dataset_configs/dutch.py +123 -0
  30. euroeval-15.6.0/src/euroeval/dataset_configs/english.py +88 -0
  31. euroeval-15.6.0/src/euroeval/dataset_configs/faroese.py +53 -0
  32. euroeval-15.6.0/src/euroeval/dataset_configs/french.py +83 -0
  33. euroeval-15.6.0/src/euroeval/dataset_configs/german.py +91 -0
  34. euroeval-15.6.0/src/euroeval/dataset_configs/icelandic.py +148 -0
  35. euroeval-15.6.0/src/euroeval/dataset_configs/italian.py +81 -0
  36. euroeval-15.6.0/src/euroeval/dataset_configs/norwegian.py +178 -0
  37. euroeval-15.6.0/src/euroeval/dataset_configs/spanish.py +78 -0
  38. euroeval-15.6.0/src/euroeval/dataset_configs/swedish.py +100 -0
  39. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/exceptions.py +10 -10
  40. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/finetuning.py +6 -10
  41. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/generation.py +1 -0
  42. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/human_evaluation.py +2 -2
  43. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/languages.py +20 -13
  44. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/model_cache.py +1 -1
  45. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/model_loading.py +1 -12
  46. euroeval-15.6.0/src/euroeval/prompt_templates/__init__.py +8 -0
  47. euroeval-15.6.0/src/euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  48. euroeval-15.6.0/src/euroeval/prompt_templates/multiple_choice.py +97 -0
  49. euroeval-15.6.0/src/euroeval/prompt_templates/named_entity_recognition.py +257 -0
  50. euroeval-15.6.0/src/euroeval/prompt_templates/reading_comprehension.py +118 -0
  51. euroeval-15.6.0/src/euroeval/prompt_templates/sentiment_classification.py +137 -0
  52. euroeval-15.6.0/src/euroeval/prompt_templates/summarization.py +97 -0
  53. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/speed_benchmark.py +1 -1
  54. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/multiple_choice_classification.py +19 -11
  55. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/question_answering.py +31 -30
  56. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/sequence_classification.py +45 -10
  57. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/text_to_text.py +1 -1
  58. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/token_classification.py +3 -2
  59. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/tasks.py +54 -0
  60. euroeval-15.6.0/src/euroeval/tokenization_utils.py +343 -0
  61. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/types.py +3 -1
  62. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/utils.py +5 -254
  63. {euroeval-15.4.2 → euroeval-15.6.0}/tests/conftest.py +16 -4
  64. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmarker.py +42 -33
  65. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_callbacks.py +2 -1
  66. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_data_loading.py +2 -2
  67. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_data_models.py +4 -0
  68. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_finetuning.py +2 -1
  69. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_model_loading.py +4 -4
  70. euroeval-15.4.2/tests/test_utils.py → euroeval-15.6.0/tests/test_tokenization_utils.py +3 -68
  71. euroeval-15.6.0/tests/test_utils.py +67 -0
  72. {euroeval-15.4.2 → euroeval-15.6.0}/uv.lock +822 -739
  73. euroeval-15.4.2/src/euroeval/dataset_configs.py +0 -2408
  74. {euroeval-15.4.2 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  75. {euroeval-15.4.2 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  76. {euroeval-15.4.2 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  77. {euroeval-15.4.2 → euroeval-15.6.0}/CITATION.cff +0 -0
  78. {euroeval-15.4.2 → euroeval-15.6.0}/CODE_OF_CONDUCT.md +0 -0
  79. {euroeval-15.4.2 → euroeval-15.6.0}/CONTRIBUTING.md +0 -0
  80. {euroeval-15.4.2 → euroeval-15.6.0}/Dockerfile.cuda +0 -0
  81. {euroeval-15.4.2 → euroeval-15.6.0}/LICENSE +0 -0
  82. {euroeval-15.4.2 → euroeval-15.6.0}/docs/CNAME +0 -0
  83. {euroeval-15.4.2 → euroeval-15.6.0}/docs/README.md +0 -0
  84. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/README.md +0 -0
  85. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/french.md +0 -0
  86. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/german.md +0 -0
  87. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/spanish.md +0 -0
  88. {euroeval-15.4.2 → euroeval-15.6.0}/docs/datasets/swedish.md +0 -0
  89. {euroeval-15.4.2 → euroeval-15.6.0}/docs/extras/radial_plotter.md +0 -0
  90. {euroeval-15.4.2 → euroeval-15.6.0}/docs/faq.md +0 -0
  91. {euroeval-15.4.2 → euroeval-15.6.0}/docs/gfx/favicon.png +0 -0
  92. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  93. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  94. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/english.md +0 -0
  95. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  96. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/french.md +0 -0
  97. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/german.md +0 -0
  98. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  99. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  100. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  101. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  102. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Multilingual/european.md +0 -0
  103. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  104. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  105. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  106. {euroeval-15.4.2 → euroeval-15.6.0}/docs/leaderboards/README.md +0 -0
  107. {euroeval-15.4.2 → euroeval-15.6.0}/docs/methodology.md +0 -0
  108. {euroeval-15.4.2 → euroeval-15.6.0}/docs/python-package.md +0 -0
  109. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/README.md +0 -0
  110. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/common-sense-reasoning.md +0 -0
  111. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/knowledge.md +0 -0
  112. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/linguistic-acceptability.md +0 -0
  113. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/named-entity-recognition.md +0 -0
  114. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/reading-comprehension.md +0 -0
  115. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/sentiment-classification.md +0 -0
  116. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/speed.md +0 -0
  117. {euroeval-15.4.2 → euroeval-15.6.0}/docs/tasks/summarization.md +0 -0
  118. {euroeval-15.4.2 → euroeval-15.6.0}/gfx/euroeval.png +0 -0
  119. {euroeval-15.4.2 → euroeval-15.6.0}/gfx/euroeval.xcf +0 -0
  120. {euroeval-15.4.2 → euroeval-15.6.0}/gfx/scandeval.png +0 -0
  121. {euroeval-15.4.2 → euroeval-15.6.0}/mkdocs.yaml +0 -0
  122. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_config_factory.py +0 -0
  123. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  124. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/cli.py +0 -0
  125. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/data_loading.py +0 -0
  126. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/enums.py +0 -0
  127. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/model_config.py +0 -0
  128. {euroeval-15.4.2 → euroeval-15.6.0}/src/euroeval/scores.py +0 -0
  129. {euroeval-15.4.2/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/__init__.py +0 -0
  130. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/constants.py +0 -0
  131. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_allocine.py +0 -0
  132. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_angry_tweets.py +0 -0
  133. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_arc.py +0 -0
  134. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_arc_is.py +0 -0
  135. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_belebele.py +0 -0
  136. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_cnn_dailymail.py +0 -0
  137. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_conll_en.py +0 -0
  138. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_conll_es.py +0 -0
  139. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_conll_nl.py +0 -0
  140. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_dane.py +0 -0
  141. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  142. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_dansk.py +0 -0
  143. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_danske_talemaader.py +0 -0
  144. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  145. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_dbrd.py +0 -0
  146. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_dutch_cola.py +0 -0
  147. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_dutch_social.py +0 -0
  148. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_eltec.py +0 -0
  149. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_fone.py +0 -0
  150. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_foqa.py +0 -0
  151. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_fosent.py +0 -0
  152. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_fquad.py +0 -0
  153. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_germanquad.py +0 -0
  154. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_germeval.py +0 -0
  155. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_hellaswag.py +0 -0
  156. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  157. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_ice_linguistic.py +0 -0
  158. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  159. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  160. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_icelandic_qa.py +0 -0
  161. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_icesum.py +0 -0
  162. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_ilpost_sum.py +0 -0
  163. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_jentoft.py +0 -0
  164. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_mim_gold_ner.py +0 -0
  165. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_mlqa_es.py +0 -0
  166. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_mlsum_de.py +0 -0
  167. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_mlsum_es.py +0 -0
  168. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_mmlu.py +0 -0
  169. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_multinerd-it.py +0 -0
  170. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_no_cola.py +0 -0
  171. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_no_sammendrag.py +0 -0
  172. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  173. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_nordjylland_news.py +0 -0
  174. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_norec.py +0 -0
  175. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_norglm_multiqa.py +0 -0
  176. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_norglm_multisum.py +0 -0
  177. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_norne.py +0 -0
  178. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_norquad.py +0 -0
  179. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_nqii.py +0 -0
  180. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  181. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_orange_sum.py +0 -0
  182. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_personal_sum.py +0 -0
  183. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_rrn.py +0 -0
  184. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_sb10k.py +0 -0
  185. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_scala.py +0 -0
  186. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_scandiqa.py +0 -0
  187. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_schibsted.py +0 -0
  188. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  189. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_sentipolc16.py +0 -0
  190. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_squad.py +0 -0
  191. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_squad_it.py +0 -0
  192. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_squad_nl.py +0 -0
  193. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_squad_nl_old.py +0 -0
  194. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_sst5.py +0 -0
  195. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_suc3.py +0 -0
  196. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_swedn.py +0 -0
  197. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_swerec.py +0 -0
  198. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  199. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_wikiann_fo.py +0 -0
  200. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_wikineural-it.py +0 -0
  201. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_winogrande_is.py +0 -0
  202. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/create_xquad_es.py +0 -0
  203. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/fix_dot_env_file.py +0 -0
  204. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/load_ud_pos.py +0 -0
  205. {euroeval-15.4.2 → euroeval-15.6.0}/src/scripts/versioning.py +0 -0
  206. {euroeval-15.4.2 → euroeval-15.6.0}/tests/__init__.py +0 -0
  207. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_cli.py +0 -0
  215. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_constants.py +0 -0
  216. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_dataset_configs.py +0 -0
  217. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_enums.py +0 -0
  218. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_exceptions.py +0 -0
  219. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_generation.py +0 -0
  220. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_human_evaluation.py +0 -0
  221. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_languages.py +0 -0
  222. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_model_cache.py +0 -0
  223. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_model_config.py +0 -0
  224. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_scores.py +0 -0
  225. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_speed_benchmark.py +0 -0
  226. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_task_utils/__init__.py +0 -0
  227. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_task_utils/test_question_answering.py +0 -0
  228. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  229. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  230. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_task_utils/test_token_classification.py +0 -0
  231. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_tasks.py +0 -0
  232. {euroeval-15.4.2 → euroeval-15.6.0}/tests/test_types.py +0 -0
@@ -8,7 +8,7 @@ body:
  - type: input
  attributes:
  label: Model ID
- description: What is the Hugging Face model ID?
+ description: What is the model ID, either on the Hugging Face Hub or on LiteLLM?
  validations:
  required: true
  - type: checkboxes
@@ -18,17 +18,9 @@ body:
  What languages should this model be evaluated on? Tick all that apply. If the
  model is multilingual (e.g., Mistral, Llama), then tick all the languages.
  options:
- - label: Danish
- - label: Dutch
- - label: English
- - label: Faroese
- - label: French
- - label: German
- - label: Icelandic
- - label: Italian
- - label: Norwegian (Bokmål or Nynorsk)
- - label: Spanish
- - label: Swedish
+ - label: Romance languages (French, Italian, Spanish)
+ - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
+ - label: West Germanic languages (Dutch, English, German)
  validations:
  required: true
  - type: dropdown
@@ -48,6 +40,7 @@ body:
  options:
  - Small (<=8B parameters)
  - Large (>8B parameters)
+ - N/A
  validations:
  required: true
  - type: dropdown
@@ -57,6 +50,7 @@ body:
  options:
  - Not a merged model
  - Merged model
+ - N/A
  validations:
  required: true
  - type: markdown
@@ -89,6 +89,8 @@ jobs:
  HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+ GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+ XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

  - name: Delete EuroEval cache
  run: rm -rf .euroeval_cache
@@ -115,3 +115,7 @@ site/

  # Helper files for docs
  docs/datasets/dataset_example_commands.txt
+
+ # Various graphics
+ gfx/euroeval-italian.png
+ gfx/euroeval-italian.xcf
@@ -10,7 +10,7 @@ repos:
  - id: trailing-whitespace
  - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.2
+ rev: v0.11.5
  hooks:
  - id: ruff
  args:
@@ -10,6 +10,79 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v15.6.0] - 2025-04-13
+ ### Added
+ - We now support specifying custom inference providers when benchmarking via the Hugging
+ Face inference APIs. This can be done by specifying the model as
+ `huggingface/<inference-provider>/<organisation>/<model>`, as described in [these
+ LiteLLM docs](https://docs.litellm.ai/docs/providers/huggingface).
+
+ ### Changed
+ - Updated `transformers` to `>=4.51.0`, which includes support for Llama-4, Phi-4,
+ Deepseek-v3 and Qwen3. This also includes the `image-text-to-text` pipeline tag
+ properly, so that we do not have to use a custom fix for it anymore.
+ - Updated `vllm` to `>=0.8.3`, which includes support for Llama-4.
+ - Set the maximum amount of logprobs for generative models to 8, as that is the upper
+ bound for xAI models.
+ - When benchmarking Ollama models, if the model is not found, we now also check if the
+ model exists if prefixed with 'hf.co/'.
+ - Uniformised the prompt templates used for each task, so that they are more
+ consistent across tasks. Evaluation tests across different model types and sizes show
+ no significant performance difference between the new and old templates. This was
+ contributed by [@viggo-gascou](https://github.com/viggo-gascou) ✨
+
+ ### Fixed
+ - Avoid duplicate error messages when a rate limit occurs.
+ - ModernBERT models cannot be used on a CPU, which caused an error in our check for
+ maximal context length. In this case we simply skip this check and use the reported
+ maximal context length as-is.
+ - Fixed issue with benchmarking multiple generative models in the same evaluation
+ command. This was caused by vLLM and Ray not being able to release GPU memory
+ properly, but this seems to be released properly now.
+ - Now only logs when encoder models are being benchmarked on generative tasks if the
+ `--verbose` flag is set (or `verbose=True` in the `Benchmarker` API).
+ - All Spanish NER datasets were mistakenly marked as unofficial. The `conll-es` is now
+ marked as official.
+
+
+ ## [v15.5.0] - 2025-04-07
+ ### Added
+ - Now allows supplying a parameter to API models, which is done by using
+ `<model-id>@<parameter>` as the model ID (only a single parameter is supported). The
+ parameters allowed are "low" and "high" for OpenAI models (which is the reasoning
+ effort of the model, supported by the o1- and o3-series, default is "medium"), and
+ "thinking" for Anthropic models, to enable thinking mode (supported for
+ Claude-Sonnet-3.7+). These will appear in the leaderboards as
+ `<model-id>@<parameter>`.
+ - Added metadata for Google Gemini and xAI Grok models.
+ - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
+ has been resolved.
+ - Added overall progress indicator during evaluation. This was contributed by
+ [@mathiasesn](https://github.com/mathiasesn) ✨
+
+ ### Changed
+ - Now does not use logprobs in text classification tasks with Google VertexAI models, as
+ they heavily rate limit logprobs usage. This shouldn't affect the scores significantly
+ in any case, as the models are very confident in their predictions.
+ - Updated `litellm` to `>=1.63.0`, allowing better support for reasoning models.
+
+ ### Fixed
+ - The Gemini-2.5-pro model uses different error messages than the other Gemini models,
+ which caused an error when evaluating it. This has been fixed now.
+ - Now registers the Gemini-2.5-pro model series as reasoning models, as otherwise they
+ did not generate any text as they were just generating reasoning tokens.
+ - Previously, if there were multiple labels whose first tokens were identical and that
+ the (generative) model did not output the label as the first output token, we would
+ randomly choose one of the labels, resulting in an evaluation error. This is very
+ rare, but *does* happen for very particular (model, dataset) pairs. If we are in this
+ case, we now resort to choosing the label with closest word edit distance instead of
+ relying on logprobs of the first token.
+ - Now defaults to BF16 if the model is registered as using FP32, assuming that BF16 is
+ supported by the GPU.
+ - Improved model existence pipeline for Ollama model IDs with multiple forward slashes
+ in the name, which caused some models to not be detected as existing.
+
+
  ## [v15.4.2] - 2025-03-31
  ### Added
  - Now added version metadata to results, to easier track which versions of the various
@@ -23,7 +96,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

  ### Fixed
  - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
- compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+ compatibility < 8.0. This was contributed by
+ [@marksverdhei](https://github.com/marksverdhei) ✨
  - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
  [@Alkarex](https://github.com/Alkarex) ✨
  - Evaluating a specific model revision did not work for adapter models, as there was a
@@ -50,7 +124,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  as the API sometimes fails. If it still fails after 3 attempts, we raise the
  `HuggingFaceHubDown` exception.
  - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
- compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+ compatibility < 8.0. This was contributed by
+ [@marksverdhei](https://github.com/marksverdhei) ✨
  - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
  the splits were made by considering the original train/validation/test splits.

@@ -118,18 +193,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ## [v15.3.0] - 2025-03-12
  ### Added
  - Added support for evaluating Italian 🇮🇹! This includes the reading comprehension
- dataset [SQuAD-it](https://hf.co/datasets/crux82/squad_it), the summarization
- dataset [IlPost](https://hf.co/datasets/ARTeLab/ilpost), the sentiment
- classification
- [Sentipolc-16](https://hf.co/datasets/cardiffnlp/tweet_sentiment_multilingual),
- the common-sense reasoning dataset
- [HellaSwag-it](https://hf.co/datasets/alexandrainst/m_hellaswag), the linguistic acceptability
- dataset ScaLA with the [Italian Universal Dependencies
+ dataset [SQuAD-it](https://hf.co/datasets/crux82/squad_it), the summarization dataset
+ [IlPost](https://hf.co/datasets/ARTeLab/ilpost), the sentiment classification
+ [Sentipolc-16](https://hf.co/datasets/cardiffnlp/tweet_sentiment_multilingual), the
+ common-sense reasoning dataset
+ [HellaSwag-it](https://hf.co/datasets/alexandrainst/m_hellaswag), the linguistic
+ acceptability dataset ScaLA with the [Italian Universal Dependencies
  treebank](https://github.com/UniversalDependencies/UD_Italian-ISDT), the knowledge
  dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
- recognition dataset [MultiNERD
- IT](https://hf.co/datasets/Babelscape/multinerd) (and unofficially
- [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was contributed by [@viggo-gascou](https://github.com/viggo-gascou) ✨
+ recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
+ unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
+ contributed by [@viggo-gascou](https://github.com/viggo-gascou) ✨
  - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
  Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
  into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces
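For quick reference, here is a hedged sketch of how the new model-ID forms described in the v15.6.0 and v15.5.0 changelog entries above could be passed to the CLI. The `euroeval --model <model-id> --dataset <dataset>` form is the one shown in the README; the concrete model IDs and the `dbrd` dataset below are illustrative placeholders, not values confirmed by this release.

```shell
# Hypothetical invocations; the model IDs below are illustrative placeholders.

# Custom Hugging Face inference provider (v15.6.0), following the
# huggingface/<inference-provider>/<organisation>/<model> format from the LiteLLM docs:
$ euroeval --model huggingface/<inference-provider>/<organisation>/<model> --dataset dbrd

# Single parameter appended to an API model ID (v15.5.0): "low"/"high" reasoning effort
# for OpenAI o1-/o3-series models, or "thinking" for Anthropic Claude-Sonnet-3.7+ models:
$ euroeval --model o3-mini@high --dataset dbrd
$ euroeval --model claude-3-7-sonnet-20250219@thinking --dataset dbrd
```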
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.4.2
+ Version: 15.6.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -35,9 +35,9 @@ Requires-Dist: click>=8.1.3
  Requires-Dist: datasets>=2.15.0
  Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
- Requires-Dist: huggingface-hub>=0.24.0
+ Requires-Dist: huggingface-hub>=0.30.1
  Requires-Dist: levenshtein>=0.24.0
- Requires-Dist: litellm>=1.61.13
+ Requires-Dist: litellm>=1.63.0
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: numpy<2.0.0,>=1.23.0
  Requires-Dist: ollama>=0.4.7
@@ -56,18 +56,18 @@ Requires-Dist: setuptools>=75.8.2
  Requires-Dist: tenacity>=9.0.0
  Requires-Dist: termcolor>=2.0.0
  Requires-Dist: torch>=2.6.0
- Requires-Dist: transformers>=4.50.0
+ Requires-Dist: transformers>=4.51.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
  Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -89,7 +89,7 @@ ______________________________________________________________________
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
- [![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+ [![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)


@@ -206,7 +206,9 @@ sentiment-classification`.


  ### Reproducing the datasets
- All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+ All datasets used in this project are generated using the scripts located in the
+ [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+ with the following command

  ```shell
  $ uv run src/scripts/<name-of-script>.py
@@ -218,7 +220,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
  $ uv run src/scripts/create_allocine.py
  ```

- ## Special Thanks :pray:
+ ## Contributors :pray:
+
+ A huge thank you to all the contributors who have helped make this project a success!
+
+ <a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+ <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+ <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+ <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+ <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+ <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+ <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+ <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+ <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+ <a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+ <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+ <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+ <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+ ### Special Thanks
+ - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+ [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
  models on the leaderboards.
  - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
@@ -13,7 +13,7 @@ ______________________________________________________________________
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
- [![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+ [![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)


@@ -130,7 +130,9 @@ sentiment-classification`.


  ### Reproducing the datasets
- All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+ All datasets used in this project are generated using the scripts located in the
+ [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+ with the following command

  ```shell
  $ uv run src/scripts/<name-of-script>.py
@@ -142,7 +144,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
  $ uv run src/scripts/create_allocine.py
  ```

- ## Special Thanks :pray:
+ ## Contributors :pray:
+
+ A huge thank you to all the contributors who have helped make this project a success!
+
+ <a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+ <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+ <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+ <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+ <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+ <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+ <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+ <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+ <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+ <a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+ <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+ <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+ <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+ ### Special Thanks
+ - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+ [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
  models on the leaderboards.
  - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
@@ -450,12 +450,14 @@ Here are a few examples from the training split:
  {
  "text": "Hvilket af følgende områder har kommunerne ansvaret for driften af?\nSvarmuligheder:\na. Domstole\nb. Vuggestuer\nc. Sygehuse",
  "label": "b"
- }```
+ }
+ ```
  ```json
  {
  "text": "Hvilken organisation blev Danmark medlem af i 1945?\nSvarmuligheder:\na. Verdenshandelsorganisationen (WTO)\nb. Den Europæiske Union (EU)\nc. De Forenede Nationer (FN)",
  "label": "c"
- }```
+ }
+ ```

  When evaluating generative models, we use the following setup (see the
  [methodology](/methodology) for more information on how these are used):
@@ -133,7 +133,7 @@ $ euroeval --model <model-id> --dataset dbrd

  ## Named Entity Recognition

- ### CoNLL-2002-nl
+ ### CoNLL-nl

  This dataset was published in [this paper](https://aclanthology.org/W02-2024/) and
  consists of named entity recognition annotations of the Belgian newspaper "De Morgen" of
@@ -81,7 +81,7 @@ $ euroeval --model <model-id> --dataset sst5

  ## Named Entity Recognition

- ### CoNLL-2003-En
+ ### CoNLL-en

  This dataset was published in [this paper](https://aclanthology.org/W03-0419/) and was
  part of the CoNNL-2003 shared task. The data comes from the [Reuters
@@ -282,10 +282,10 @@ $ euroeval --model <model-id> --dataset scala-fo

  ### FoQA

- This dataset will be published in an upcoming paper and is based on the Faroese
- Wikipedia. The questions and answers were automatically generated using GPT-4-turbo,
- which were verified by a native speaker, and some of them were also corrected by the
- same native speaker.
+ This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2502.07642)
+ and is based on the Faroese Wikipedia. The questions and answers were automatically
+ generated using GPT-4-turbo, which were verified by a native speaker, and some of them
+ were also corrected by the same native speaker.

  The original full dataset consists of 2,000 samples, and we split these into 848 / 128 /
  1,024 samples for training, validation and testing, respectively.
@@ -9,9 +9,9 @@ information about what these constitute.

  ### Hotter and Colder Sentiment

- This dataset is being published in an upcoming paper, and consists of texts from
- Icelandic blog post, annotated with sentiment labels (and many others) via a
- crowdsourcing platform.
+ This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2502.16987),
+ and consists of texts from Icelandic blog post, annotated with sentiment labels (and
+ many others) via a crowdsourcing platform.

  The original full dataset consists of 2,901 samples, and we use a 1,021 / 255 / 1,607
  split for training, validation and testing, respectively (so all samples are used in
@@ -73,13 +73,14 @@ $ euroeval --model <model-id> --dataset hotter-and-colder-sentiment

  ### MIM-GOLD-NER

- This dataset was published in [this paper]() and is based on the [Tagged Icelandic
- Corpus (MIM)](https://clarin.is/en/resources/mim/), which consists of Icelandic books,
- news articles, periodicals, parliament speeches, legal texts, adjudications and
- government websites. It has been annotated with named entities in a semi-automated
- fashion, where each labels has been manually verified. The entity types in the dataset
- is a superset of the CoNLL-2003 tags, with the following additional labels: `DATE`,
- `TIME`, `MONEY`, `PERCENT`. These labels have been removed.
+ This dataset was published in [this
+ paper](https://repository.clarin.is/repository/xmlui/handle/20.500.12537/230) and is
+ based on the [Tagged Icelandic Corpus (MIM)](https://clarin.is/en/resources/mim/), which
+ consists of Icelandic books, news articles, periodicals, parliament speeches, legal
+ texts, adjudications and government websites. It has been annotated with named entities
+ in a semi-automated fashion, where each labels has been manually verified. The entity
+ types in the dataset is a superset of the CoNLL-2003 tags, with the following additional
+ labels: `DATE`, `TIME`, `MONEY`, `PERCENT`. These labels have been removed.

  The original full dataset consists of 1,000,000 tokens. We use a 1,024 / 256 / 2,048
  split for training, validation and testing, respectively.
@@ -526,17 +527,20 @@ Here are a few examples from the training split:
  {
  "text": "Hver var talinn heilagur maður eftir dauða sinn, er tákngervingur alþýðuhreyfingar vestanlands og talinn góður til áheita?\nSvarmöguleikar:\na. Þórður Jónsson helgi\nb. Guðmundur Arason\nc. Snorri Þorgrímsson\nd. Jón Hreggviðsson",
  "label": "a"
- }```
+ }
+ ```
  ```json
  {
  "text": "Í kringum hvaða ár hófst verslun á Arngerðareyri?\nSvarmöguleikar:\na. 1895\nb. 1884\nc. 1870\nd. 1902",
  "label": "b"
- }```
+ }
+ ```
  ```json
  {
  "text": "Hvenær var ákveðið að uppstigningardagur skyldi vera kirkjudagur aldraðra á Íslandi?\nSvarmöguleikar:\na. Árið 1975\nb. Árið 1985\nc. Árið 1982\nd. Árið 1990",
  "label": "c"
- }```
+ }
+ ```

  When evaluating generative models, we use the following setup (see the
  [methodology](/methodology) for more information on how these are used):
@@ -71,11 +71,10 @@ $ euroeval --model <model-id> --dataset sentipolc16
  ### MultiNERD IT

  This dataset was published in [this
- paper](https://aclanthology.org/2022.findings-naacl.60/) and
- consists of sentences from Wikipedia and Wikinews in 10 different languages. It is an
- extension of the combination of
- (WikiNEuRal)[https://www.github.com/Babelscape/wikineural] and
- (NER4EL)[https://www.github.com/Babelscape/ner4el]. The original test set was created
+ paper](https://aclanthology.org/2022.findings-naacl.60/) and consists of sentences from
+ Wikipedia and Wikinews in 10 different languages. It is an extension of the combination
+ of [WikiNEuRal](https://www.github.com/Babelscape/wikineural) and
+ [NER4EL](https://www.github.com/Babelscape/ner4el). The original test set was created
  from manual annotations, while the training set is based on an automatic annotation
  pipeline.

@@ -519,7 +518,7 @@ $ euroeval --model <model-id> --dataset hellaswag-it

  ## Summarization

- ### IlPost-sum
+ ### IlPost-Sum

  This dataset was published in [this paper](https://www.mdpi.com/2078-2489/13/5/228) and
  consists of news articles from [Il Post](https://www.ilpost.it/). The summaries were
@@ -388,17 +388,20 @@ Here are a few examples from the training split:
  {
  "text": "Vi har hatt krig i nesten ti år. Jeg føler meg noen ganger trist fordi jeg har mistet flere venner og min far på grunn av krigen.",
  "label": "correct"
- }```
+ }
+ ```
  ```json
  {
  "text": "Hvis jeg ikke sier in n genting, kan han spille hele dagen.",
  "label": "incorrect"
- }```
+ }
+ ```
  ```json
  {
  "text": "De føler at samfunnet trenger ikke dem.",
  "label": "incorrect"
- }```
+ }
+ ```

  When evaluating generative models, we use the following setup (see the
  [methodology](/methodology) for more information on how these are used):
@@ -660,17 +663,20 @@ Here are a few examples from the training split:
  {
  "text": "Gunnar har hatt plutselige og sterke smerteanfall siden han var liten gutt. Det var vondt å tisse og det gjorde vondt i ryggen og magen. Det hjalp litt å drikke vann. Reseptbelagte medisiner kan være nødvendig under anfall.\nSvaralternativer:\na. Nyrestein, kronisk\nb. Irritabel tarmsyndrom\nc. Angst\nd. Urinveisinfeksjon",
  "label": "a"
- }```
+ }
+ ```
  ```json
  {
  "text": "80 år gamle Harrison Ford er nok ein gong aktuell i rolla som Indiana Jones. Kva heiter filmen?\nSvaralternativer:\na. Indiana Jones and the Nasty Nazis\nb. Indiana Jones and the Dial of Destiny\nc. Indiana Jones and the Hunt for Power\nd. Indiana Jones Forever",
  "label": "b"
- }```
+ }
+ ```
  ```json
  {
  "text": "I 1980 måtte denne bassisten overnatte ni netter i fengsel i Japan fordi han prøvde å få med seg ca. 200 gram marihuana inn i landet. Hvem var det?\nSvaralternativer:\na. Sting\nb. Lemmy Kilmister\nc. Paul McCartney\nd. Bootsy Collins",
  "label": "c"
- }```
+ }
+ ```

  When evaluating generative models, we use the following setup (see the
  [methodology](/methodology) for more information on how these are used):
@@ -868,17 +874,20 @@ Here are a few examples from the training split:
  {
  "text": "Hvor er det sannsynlig at en fugl lager hjemmet sitt?\nSvaralternativer:\na. I skogen\nb. I et rede\nc. På taket\nd. På blader\ne. I himmelen",
  "label": "a"
- }```
+ }
+ ```
  ```json
  {
  "text": "Hvis et hjem har et abonnoment, hva får de sannsyneligvis hver dag i posten?\nSvaralternativer:\na. Delestykker\nb. En avis\nc. En gate\nd. En vaskemaskin\ne. Jordas overflate",
  "label": "b"
- }```
+ }
+ ```
  ```json
  {
  "text": "Når du ikke klarer å gjøre noe ferdig, hva feilet du i da?\nSvaralternativer:\na. Å vinne\nb. Å bestå\nc. Å fullfør\nd. Å gjøre det bra\ne. Å lykkes",
  "label": "c"
- }```
+ }
+ ```

  When evaluating generative models, we use the following setup (see the
  [methodology](/methodology) for more information on how these are used):
@@ -56,7 +56,6 @@ install-dependencies:
  @if [ "${NO_FLASH_ATTN}" != "1" ] && [ $$(uname) != "Darwin" ]; then \
  uv pip install --no-build-isolation flash-attn>=2.7.0.post2; \
  fi
- @uv sync -U --only-dev

  setup-environment-variables:
  @uv run python src/scripts/fix_dot_env_file.py
@@ -127,8 +126,7 @@ publish:
  echo "No PyPI API token specified in the '.env' file, so cannot publish."; \
  else \
  echo "Publishing to PyPI..."; \
- $(MAKE) --quiet check \
- && $(MAKE) --quiet publish-euroeval \
+ $(MAKE) --quiet publish-euroeval \
  && $(MAKE) --quiet publish-scandeval \
  && $(MAKE) --quiet publish-docs \
  && $(MAKE) --quiet add-dev-version \
@@ -157,8 +155,8 @@ publish-scandeval:
  fi
  @mv src/scandeval src/euroeval

- publish-major: bump-major publish ## Publish a major version
+ publish-major: install check bump-major publish ## Publish a major version

- publish-minor: bump-minor publish ## Publish a minor version
+ publish-minor: install check bump-minor publish ## Publish a minor version

- publish-patch: bump-patch publish ## Publish a patch version
+ publish-patch: install check bump-patch publish ## Publish a patch version