EuroEval 15.15.0.tar.gz → 16.0.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (278)
  1. {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
  2. {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/bug.yaml +1 -1
  3. {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +2 -1
  4. {euroeval-15.15.0 → euroeval-16.0.0}/.github/workflows/ci.yaml +26 -16
  5. {euroeval-15.15.0 → euroeval-16.0.0}/.pre-commit-config.yaml +5 -2
  6. {euroeval-15.15.0 → euroeval-16.0.0}/CHANGELOG.md +108 -36
  7. {euroeval-15.15.0 → euroeval-16.0.0}/PKG-INFO +12 -14
  8. {euroeval-15.15.0 → euroeval-16.0.0}/README.md +3 -1
  9. euroeval-16.0.0/docs/datasets/estonian.md +544 -0
  10. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/icelandic.md +11 -11
  11. euroeval-16.0.0/docs/datasets/latvian.md +536 -0
  12. euroeval-16.0.0/docs/leaderboards/Monolingual/portuguese.md +23 -0
  13. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/README.md +1 -1
  14. {euroeval-15.15.0 → euroeval-16.0.0}/makefile +2 -2
  15. {euroeval-15.15.0 → euroeval-16.0.0}/pyproject.toml +11 -17
  16. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/__init__.py +3 -7
  17. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_config_factory.py +3 -7
  18. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/base.py +35 -19
  19. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/fresh.py +24 -19
  20. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/hf.py +136 -154
  21. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/litellm.py +323 -193
  22. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/vllm.py +166 -112
  23. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmarker.py +59 -33
  24. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/cli.py +3 -3
  25. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/constants.py +13 -15
  26. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/data_loading.py +33 -28
  27. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/data_models.py +53 -7
  28. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/__init__.py +2 -0
  29. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/danish.py +38 -1
  30. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/dutch.py +38 -1
  31. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/english.py +38 -1
  32. euroeval-16.0.0/src/euroeval/dataset_configs/estonian.py +95 -0
  33. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/faroese.py +38 -0
  34. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/finnish.py +39 -1
  35. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/french.py +38 -1
  36. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/german.py +38 -1
  37. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/icelandic.py +39 -1
  38. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/italian.py +38 -1
  39. euroeval-16.0.0/src/euroeval/dataset_configs/latvian.py +81 -0
  40. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/norwegian.py +38 -1
  41. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/portuguese.py +38 -1
  42. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/spanish.py +38 -1
  43. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/dataset_configs/swedish.py +38 -1
  44. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/enums.py +0 -6
  45. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/finetuning.py +8 -7
  46. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/generation.py +25 -14
  47. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/generation_utils.py +46 -14
  48. euroeval-16.0.0/src/euroeval/languages.py +966 -0
  49. euroeval-16.0.0/src/euroeval/metrics/__init__.py +6 -0
  50. euroeval-16.0.0/src/euroeval/metrics/base.py +76 -0
  51. euroeval-16.0.0/src/euroeval/metrics/huggingface.py +192 -0
  52. euroeval-16.0.0/src/euroeval/metrics/llm_as_a_judge.py +257 -0
  53. euroeval-16.0.0/src/euroeval/metrics/pipeline.py +234 -0
  54. euroeval-16.0.0/src/euroeval/metrics/speed.py +51 -0
  55. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  56. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/multiple_choice.py +23 -2
  57. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/named_entity_recognition.py +65 -2
  58. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/reading_comprehension.py +42 -2
  59. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/sentiment_classification.py +46 -2
  60. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/summarization.py +24 -4
  61. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/scores.py +7 -2
  62. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/speed_benchmark.py +6 -6
  63. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  64. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/question_answering.py +35 -28
  65. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/sequence_classification.py +96 -23
  66. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/text_to_text.py +7 -3
  67. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/token_classification.py +47 -75
  68. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/tasks.py +31 -6
  69. euroeval-16.0.0/src/euroeval/tokenization_utils.py +586 -0
  70. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/utils.py +118 -34
  71. euroeval-16.0.0/src/scripts/create_copa_lv.py +143 -0
  72. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danish_citizen_tests.py +3 -2
  73. euroeval-16.0.0/src/scripts/create_err_news.py +83 -0
  74. euroeval-16.0.0/src/scripts/create_estner.py +115 -0
  75. euroeval-16.0.0/src/scripts/create_estonian_valence.py +86 -0
  76. euroeval-16.0.0/src/scripts/create_european_values.py +283 -0
  77. euroeval-16.0.0/src/scripts/create_exam_et.py +136 -0
  78. euroeval-16.0.0/src/scripts/create_fullstack_ner.py +248 -0
  79. euroeval-16.0.0/src/scripts/create_grammar_et.py +74 -0
  80. euroeval-16.0.0/src/scripts/create_latvian_lsm_summary.py +92 -0
  81. euroeval-16.0.0/src/scripts/create_latvian_twitter_sentiment.py +109 -0
  82. euroeval-16.0.0/src/scripts/create_mmlu_lv.py +263 -0
  83. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_multi_wiki_qa.py +1 -0
  84. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scala.py +4 -0
  85. euroeval-16.0.0/src/scripts/create_wikiann_lv.py +116 -0
  86. euroeval-16.0.0/src/scripts/create_winogrande_et.py +90 -0
  87. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/load_ud_pos.py +36 -0
  88. {euroeval-15.15.0 → euroeval-16.0.0}/tests/conftest.py +2 -19
  89. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_hf.py +10 -13
  90. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmarker.py +0 -44
  91. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_cli.py +2 -2
  92. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_data_loading.py +15 -8
  93. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_data_models.py +2 -2
  94. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_scores.py +1 -1
  95. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_tokenization_utils.py +7 -7
  96. {euroeval-15.15.0 → euroeval-16.0.0}/uv.lock +1335 -2204
  97. euroeval-15.15.0/src/euroeval/human_evaluation.py +0 -738
  98. euroeval-15.15.0/src/euroeval/languages.py +0 -206
  99. euroeval-15.15.0/src/euroeval/metrics.py +0 -468
  100. euroeval-15.15.0/src/euroeval/tokenization_utils.py +0 -498
  101. euroeval-15.15.0/tests/test_human_evaluation.py +0 -8
  102. {euroeval-15.15.0 → euroeval-16.0.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  103. {euroeval-15.15.0 → euroeval-16.0.0}/.gitignore +0 -0
  104. {euroeval-15.15.0 → euroeval-16.0.0}/CITATION.cff +0 -0
  105. {euroeval-15.15.0 → euroeval-16.0.0}/CODE_OF_CONDUCT.md +0 -0
  106. {euroeval-15.15.0 → euroeval-16.0.0}/CONTRIBUTING.md +0 -0
  107. {euroeval-15.15.0 → euroeval-16.0.0}/Dockerfile.cuda +0 -0
  108. {euroeval-15.15.0 → euroeval-16.0.0}/LICENSE +0 -0
  109. {euroeval-15.15.0 → euroeval-16.0.0}/NEW_DATASET_GUIDE.md +0 -0
  110. {euroeval-15.15.0 → euroeval-16.0.0}/docs/CNAME +0 -0
  111. {euroeval-15.15.0 → euroeval-16.0.0}/docs/README.md +0 -0
  112. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/README.md +0 -0
  113. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/danish.md +0 -0
  114. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/dutch.md +0 -0
  115. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/english.md +0 -0
  116. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/faroese.md +0 -0
  117. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/finnish.md +0 -0
  118. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/french.md +0 -0
  119. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/german.md +0 -0
  120. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/italian.md +0 -0
  121. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/norwegian.md +0 -0
  122. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/portuguese.md +0 -0
  123. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/spanish.md +0 -0
  124. {euroeval-15.15.0 → euroeval-16.0.0}/docs/datasets/swedish.md +0 -0
  125. {euroeval-15.15.0 → euroeval-16.0.0}/docs/extras/radial_plotter.md +0 -0
  126. {euroeval-15.15.0 → euroeval-16.0.0}/docs/faq.md +0 -0
  127. {euroeval-15.15.0 → euroeval-16.0.0}/docs/gfx/favicon.png +0 -0
  128. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  129. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  130. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/english.md +0 -0
  131. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  132. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  133. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/french.md +0 -0
  134. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/german.md +0 -0
  135. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  136. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  137. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  138. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  139. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  140. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/european.md +0 -0
  141. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  142. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  143. {euroeval-15.15.0 → euroeval-16.0.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  144. {euroeval-15.15.0 → euroeval-16.0.0}/docs/methodology.md +0 -0
  145. {euroeval-15.15.0 → euroeval-16.0.0}/docs/python-package.md +0 -0
  146. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/README.md +0 -0
  147. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/common-sense-reasoning.md +0 -0
  148. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/knowledge.md +0 -0
  149. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/linguistic-acceptability.md +0 -0
  150. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/named-entity-recognition.md +0 -0
  151. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/reading-comprehension.md +0 -0
  152. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/sentiment-classification.md +0 -0
  153. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/speed.md +0 -0
  154. {euroeval-15.15.0 → euroeval-16.0.0}/docs/tasks/summarization.md +0 -0
  155. {euroeval-15.15.0 → euroeval-16.0.0}/gfx/euroeval.png +0 -0
  156. {euroeval-15.15.0 → euroeval-16.0.0}/gfx/euroeval.xcf +0 -0
  157. {euroeval-15.15.0 → euroeval-16.0.0}/gfx/scandeval.png +0 -0
  158. {euroeval-15.15.0 → euroeval-16.0.0}/mkdocs.yaml +0 -0
  159. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  160. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/callbacks.py +0 -0
  161. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/exceptions.py +0 -0
  162. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_cache.py +0 -0
  163. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_config.py +0 -0
  164. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/model_loading.py +0 -0
  165. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/prompt_templates/__init__.py +0 -0
  166. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/task_group_utils/__init__.py +0 -0
  167. {euroeval-15.15.0 → euroeval-16.0.0}/src/euroeval/types.py +0 -0
  168. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/constants.py +0 -0
  169. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_allocine.py +0 -0
  170. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_angry_tweets.py +0 -0
  171. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_arc.py +0 -0
  172. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_arc_is.py +0 -0
  173. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_belebele.py +0 -0
  174. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_boolq_pt.py +0 -0
  175. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_cnn_dailymail.py +0 -0
  176. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_en.py +0 -0
  177. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_es.py +0 -0
  178. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_conll_nl.py +0 -0
  179. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dane.py +0 -0
  180. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dansk.py +0 -0
  181. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader.py +0 -0
  182. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  183. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dbrd.py +0 -0
  184. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_dutch_cola.py +0 -0
  185. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_eltec.py +0 -0
  186. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fone.py +0 -0
  187. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_foqa.py +0 -0
  188. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fosent.py +0 -0
  189. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_fquad.py +0 -0
  190. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_germanquad.py +0 -0
  191. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_germeval.py +0 -0
  192. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_goldenswag.py +0 -0
  193. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_harem.py +0 -0
  194. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hellaswag.py +0 -0
  195. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hellaswag_fi.py +0 -0
  196. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  197. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_ice_linguistic.py +0 -0
  198. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  199. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  200. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icelandic_qa.py +0 -0
  201. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_icesum.py +0 -0
  202. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_idioms_no.py +0 -0
  203. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_ilpost_sum.py +0 -0
  204. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_jentoft.py +0 -0
  205. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_life_in_the_uk.py +0 -0
  206. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mim_gold_ner.py +0 -0
  207. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlqa_es.py +0 -0
  208. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlsum_de.py +0 -0
  209. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mlsum_es.py +0 -0
  210. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_mmlu.py +0 -0
  211. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_multinerd-it.py +0 -0
  212. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_no_cola.py +0 -0
  213. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_no_sammendrag.py +0 -0
  214. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  215. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nordjylland_news.py +0 -0
  216. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norec.py +0 -0
  217. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norglm_multiqa.py +0 -0
  218. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norglm_multisum.py +0 -0
  219. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norne.py +0 -0
  220. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_norquad.py +0 -0
  221. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nqii.py +0 -0
  222. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  223. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_orange_sum.py +0 -0
  224. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_personal_sum.py +0 -0
  225. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_publico.py +0 -0
  226. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_rrn.py +0 -0
  227. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sb10k.py +0 -0
  228. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scandiqa.py +0 -0
  229. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_scandisent_fi.py +0 -0
  230. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_schibsted.py +0 -0
  231. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  232. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sentipolc16.py +0 -0
  233. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad.py +0 -0
  234. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_it.py +0 -0
  235. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_nl.py +0 -0
  236. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_squad_nl_old.py +0 -0
  237. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sst2_pt.py +0 -0
  238. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_sst5.py +0 -0
  239. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_suc3.py +0 -0
  240. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_swedn.py +0 -0
  241. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_swerec.py +0 -0
  242. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_turku_ner_fi.py +0 -0
  243. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_tydiqa_fi.py +0 -0
  244. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  245. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wikiann_fo.py +0 -0
  246. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_wikineural-it.py +0 -0
  247. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_winogrande_is.py +0 -0
  248. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_xlsum_fi.py +0 -0
  249. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/create_xquad_es.py +0 -0
  250. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/fix_dot_env_file.py +0 -0
  251. {euroeval-15.15.0 → euroeval-16.0.0}/src/scripts/versioning.py +0 -0
  252. {euroeval-15.15.0 → euroeval-16.0.0}/tests/__init__.py +0 -0
  253. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_config_factory.py +0 -0
  254. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/__init__.py +0 -0
  255. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_base.py +0 -0
  256. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  257. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  258. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  259. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_callbacks.py +0 -0
  260. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_constants.py +0 -0
  261. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_dataset_configs.py +0 -0
  262. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_enums.py +0 -0
  263. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_exceptions.py +0 -0
  264. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_finetuning.py +0 -0
  265. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_generation.py +0 -0
  266. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_languages.py +0 -0
  267. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_cache.py +0 -0
  268. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_config.py +0 -0
  269. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_model_loading.py +0 -0
  270. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_speed_benchmark.py +0 -0
  271. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/__init__.py +0 -0
  272. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_question_answering.py +0 -0
  273. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  274. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  275. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_task_utils/test_token_classification.py +0 -0
  276. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_tasks.py +0 -0
  277. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_types.py +0 -0
  278. {euroeval-15.15.0 → euroeval-16.0.0}/tests/test_utils.py +0 -0
@@ -25,12 +25,14 @@ body:
  - label: Danish
  - label: Dutch
  - label: English
+ - label: Estonian
  - label: Faroese
  - label: Finnish
  - label: French
  - label: German
  - label: Icelandic
  - label: Italian
+ - label: Latvian
  - label: Norwegian (Bokmål or Nynorsk)
  - label: Portuguese
  - label: Spanish
@@ -55,7 +55,7 @@ body:
  attributes:
  label: EuroEval version
  description: What version of EuroEval are you using?
- placeholder: Output of `pip list | grep EuroEval`
+ placeholder: Output of `pip list | grep euroeval`
  validations:
  required: true
  - type: input
@@ -21,7 +21,8 @@ body:
  - label: Romance languages (French, Italian, Portuguese, Spanish)
  - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
  - label: West Germanic languages (Dutch, English, German)
- - label: Finnish
+ - label: Finnic languages (Estonian, Finnish)
+ - label: Latvian
  validations:
  required: true
  - type: dropdown
@@ -22,16 +22,19 @@ jobs:
  pull-requests: write
  runs-on: ubuntu-latest
  steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
  with:
  persist-credentials: false
- - uses: actions/setup-python@v5
+ ref: main
+
+ - name: Install uv and set up Python
+ uses: astral-sh/setup-uv@v6
  with:
+ enable-cache: false
  python-version: "3.11"
- - run: python -m pip install pre-commit
- shell: bash
- - run: pre-commit run --show-diff-on-failure --color=always --all-files
- shell: bash
+
+ - name: Run pre-commit hooks
+ uses: pre-commit/action@v3.0.1

  pytest-linux:
  if: github.event.pull_request.draft == false
@@ -40,24 +43,25 @@ jobs:
  pull-requests: write
  strategy:
  matrix:
- python-version: ["3.10", "3.11", "3.12"]
+ python-version: ["3.11", "3.12", "3.13"]
  runs-on: ubuntu-latest
  steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
  with:
  persist-credentials: false
+ ref: main

  - name: Install uv and set up Python
- uses: astral-sh/setup-uv@v5
+ uses: astral-sh/setup-uv@v6
  with:
  enable-cache: false
  python-version: ${{ matrix.python-version }}

  - name: Install Dependencies
- run: uv sync --no-dev --extra test
+ run: uv sync --no-dev

  - name: Start Ollama server
- run: curl -fsSL https://ollama.com/install.sh | sh
+ run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

  - name: Test with pytest
  run: uv run pytest
@@ -66,6 +70,8 @@ jobs:
  HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+ GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+ XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

  - name: Delete EuroEval cache
  run: rm -rf .euroeval_cache
@@ -77,21 +83,25 @@ jobs:
  pull-requests: write
  runs-on: macos-latest
  steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v5
+ with:
+ persist-credentials: false
+ ref: main

  - name: Install uv and set up Python
- uses: astral-sh/setup-uv@v4
+ uses: astral-sh/setup-uv@v6
  with:
+ enable-cache: false
  python-version: ${{ matrix.python-version }}

  - name: Install Dependencies
- run: uv sync --no-dev --extra test
+ run: uv sync --no-dev

  - name: Start Ollama server
- run: curl -fsSL https://ollama.com/install.sh | sh
+ run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

  - name: Test with pytest
- run: uv run pytest
+ run: uv run pytest -vvv
  env:
  HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
  HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
@@ -4,24 +4,27 @@ repos:
  hooks:
  - id: python-use-type-annotations
  - repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v5.0.0
+ rev: v6.0.0
  hooks:
  - id: end-of-file-fixer
  - id: trailing-whitespace
  - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.12.7
+ rev: v0.12.12
  hooks:
  - id: ruff
  args:
  - --fix
  - --unsafe-fixes
  - --exit-non-zero-on-fix
+ - --no-cache
  types_or:
  - python
  - pyi
  - jupyter
  - id: ruff-format
+ args:
+ - --no-cache
  types_or:
  - python
  - pyi
@@ -10,12 +10,85 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v16.0.0] - 2025-09-05
+ ### Added
+ - Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
+ Latvian Twitter Sentiment, the linguistic acceptability dataset ScaLA-lv, the named
+ entity recognition datasets FullStack-NER-lv and WikiANN-lv, the reading comprehension
+ dataset MultiWikiQA, the knowledge dataset MMLU-lv, the common-sense reasoning
+ dataset COPA-lv, and the summarisation dataset LSM.
+ - Added support for Estonian 🇪🇪! It includes the sentiment classification dataset
+ Estonian Valence, the linguistic acceptability datasets Grammar-et and ScaLA-et, the
+ named entity recognition dataset EstNER, the reading comprehension dataset
+ MultiWikiQA-et, the summarisation dataset ERRNews, the knowledge dataset Exam-et,
+ and the common-sense reasoning dataset Winogrande-et. This was contributed by
+ @slowwavesleep ✨
+ - It is now possible to evaluate how much a model adheres to European values! 🇪🇺 This
+ probes 53 questions from the European Values Survey, which have been chosen based on
+ an optimisation procedure that maximises agreement across the EU. We then measure how
+ well the model's answers align with the distribution of answers across the EU, using
+ tree-based kernel density estimation. This can only be used zero-shot, and only with
+ instruction-based decoder models (including reasoning models).
+
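The European-values entry above packs a lot of method into a few lines. As a rough illustration of the scoring idea, here is a hedged sketch that fits a tree-backed kernel density estimate on a stand-in for the EU-wide answer distribution and scores a model's answers by their log-density under it. All data, dimensions and parameters are hypothetical, and EuroEval's actual implementation (under `src/euroeval/metrics/`) may differ.

```python
# Hedged sketch of tree-based KDE alignment scoring; NOT EuroEval's actual code.
import numpy as np
from sklearn.neighbors import KernelDensity  # scikit-learn is pinned in this release

rng = np.random.default_rng(seed=4242)

# Hypothetical EU-wide answer distribution over the 53 survey questions.
eu_answers = rng.normal(loc=0.0, scale=1.0, size=(10_000, 53))

# Fit a KDE backed by a k-d tree, which is one reading of "tree-based KDE".
kde = KernelDensity(kernel="gaussian", bandwidth=0.5, algorithm="kd_tree")
kde.fit(eu_answers)

# Score a model's answers: a higher log-density means the answers look more
# like the typical answer pattern across the EU.
model_answers = rng.normal(loc=0.2, scale=1.0, size=(1, 53))
print(f"alignment (log-density): {kde.score_samples(model_answers)[0]:.2f}")
```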
+ ### Changed
+ - When evaluating classification tasks, we now force the model to output one of the
+ labels. This is done directly with open models, and done via a JSON schema for API
+ models. This won't change the results for existing tasks, as logprobs are used, but
+ this was required to measure the European values.
+ - Updated `vllm` dependency to `>=0.10.1`, which includes GPT-OSS support.
+ - Updated `numpy` dependency to `>=2.0.0`, as the previous clash no longer applies.
+ - Updated `transformers` dependency to `>=4.56.0`, which includes support for more
+ models.
+ - Now requires Python >=3.11, as Python 3.10 does not support structured generation
+ with a dynamic set of choices (`Literal[*list_of_choices]` is not supported).
+
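The forced-label change and the new Python floor are two sides of the same change: building a schema whose answer field only admits a dynamic list of labels uses `Literal[*list_of_choices]`, which is a syntax error before Python 3.11. A minimal sketch with a hypothetical label set; pydantic is used here purely for illustration and may not be how EuroEval builds the schema.

```python
# Hedged sketch: constrain an API model's output to the task's labels via a
# JSON schema. `Literal[*labels]` only parses on Python >= 3.11.
from typing import Literal

from pydantic import create_model

labels = ["positive", "neutral", "negative"]  # hypothetical sentiment labels

Answer = create_model("Answer", label=(Literal[*labels], ...))

# The schema can be handed to any API that supports structured outputs.
schema = Answer.model_json_schema()
print(schema["properties"]["label"]["enum"])  # ['positive', 'neutral', 'negative']
```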
+ ### Fixed
+ - Enabled support for evaluating Mistral models with their custom `mistral-common`
+ tokeniser, which includes all recent Mistral models. Note that we currently assume
+ that all of these models are instruction-tuned decoder models (which _is_ true
+ currently), which can lead to errors in case they publish different types of models in
+ the future.
+ - Now disables the `seed` parameter if the API inference model does not support it,
+ which previously prevented evaluating some models.
+ - Now correctly detects an API inference model as non-existing, even if LiteLLM *does*
+ see it as existing. We have an additional check during evaluation to ensure this now.
+ - Catch an `ImportError` that sometimes happens when finishing the evaluation of a
+ vLLM model, during shutdown.
+ - Now uses `litellm>=1.75.6`, which fixes an issue related to evaluation of GPT-5 models
+ using Ollama.
+ - Now always uses the `multiprocessing` backend when evaluating vLLM models, rather than
+ reverting to `ray` when using multiple GPUs, as `ray` led to evaluations of several
+ models freezing.
+ - Now does not require the user to be logged in to Hugging Face to benchmark models on
+ the Hugging Face Hub, if the models are public.
+
+ ### Removed
+ - Removed support for human evaluation, as it was not actively maintained and not used.
+
+
+ ## [v15.16.0] - 2025-08-12
+ ### Added
+ - Added metadata for GPT-5 models.
+
+ ### Changed
+ - Updated `transformers` dependency to `>=4.55.0`.
+
+ ### Fixed
+ - If the model uses 'mxfp4' quantisation then we allow the dtype to be bfloat16, rather
+ than forcing float16. This caused issues with the new GPT-OSS models.
+ - Prevent multiple `Model <model-id> does not exist` logs when evaluating a model
+ that does not exist; now only logs this once.
+ - Cleaner error message when attempting to benchmark a generative model without having a
+ GPU available.
+ - Now raises an error if an inference API is used with a parameter that is not supported.
+
+
  ## [v15.15.0] - 2025-08-06
  ### Added
  - Added the common-sense reasoning dataset GoldenSwag for the following
  languages: Danish, German, Spanish, Finnish, French, Italian, Dutch, Swedish.
  The datasets are unofficial for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨

  ### Changed
  - Now allows metadata to be included in metrics, allowing more flexibility when
@@ -71,7 +144,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  acceptability dataset ScaLA-pt. The machine translated ones include the sentiment
  classification dataset SST-2, the multiple choice reading comprehension dataset BoolQ,
  the knowledge dataset MMLU, and the common-sense reasoning dataset GoldenSwag. This
- was contributed by [@duarteocarmo](https://github.com/duarteocarmo)
+ was contributed by @duarteocarmo ✨
  - Added `--gpu-memory-utilization` argument (`gpu_memory_utilization` in the
  `Benchmarker` API), which can be lowered in case the user is experiencing OOM errors
  when evaluating models. The default is 0.9 (same as previously), which means that vLLM
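For reference, the `gpu_memory_utilization` knob above can be used along these lines; the keyword name is taken from the entry itself, while the surrounding call pattern is an assumption rather than documented API.

```python
# Hedged sketch: lower vLLM's GPU memory utilisation to dodge OOM errors.
from euroeval import Benchmarker

# The keyword comes from the changelog entry above; the default is 0.9.
benchmarker = Benchmarker(gpu_memory_utilization=0.8)
```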
@@ -91,11 +164,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Added the English knowledge dataset Life in the UK, which has been added as an
  official dataset, replacing the existing English knowledge dataset MMLU, which in turn
  has been marked as unofficial now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨
  - Added the Norwegian knowledge dataset Idioms-no, which is a multiple-choice question
  dataset where the alternative answers have been generated using GPT-4o. This has been
  added as an official dataset, and was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨
  - Added new `LLMAsAJudgeMetric`, which allows evaluating the performance of a model with
  another judge model. This is useful for evaluating models in a reference-free manner,
  or if the metric is sufficiently complex. It is currently not used in any task, but
@@ -199,11 +272,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ### Added
  - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
  unofficial for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨

  ### Changed
  - Now uses asynchronous requests when dealing with API models, speeding up the generation
- immensely. This was contributed by [@mathiasesn](https://github.com/mathiasesn)
+ immensely. This was contributed by @mathiasesn ✨

  ### Fixed
  - Add HellaSwag-fi back in, as the issue with the labels in the test split has been
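The asynchronous-requests change above is the classic gather-instead-of-loop pattern. This generic sketch, with a hypothetical `query` coroutine standing in for an API call, shows why it speeds up generation so much:

```python
# Hedged sketch: overlap all API requests instead of awaiting them one by one.
import asyncio


async def query(prompt: str) -> str:
    # Hypothetical stand-in for a single API-model call.
    await asyncio.sleep(0.1)  # simulate network latency
    return f"answer to {prompt!r}"


async def main() -> None:
    prompts = [f"question {i}" for i in range(100)]
    # All 100 requests run concurrently: ~0.1 s instead of ~10 s sequentially.
    answers = await asyncio.gather(*(query(p) for p in prompts))
    print(len(answers), "answers")


asyncio.run(main())
```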
@@ -255,7 +328,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  dataset [XL-Sum-fi](https://huggingface.co/datasets/TurkuNLP/xlsum-fi), and the
  common-sense reasoning dataset
  [HellaSwag-fi](https://huggingface.co/datasets/Finnish-NLP/hellaswag-fi-google-translate).
- This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+ This was contributed by @oliverkinch ✨
  - Added metadata for GPT-4.1 and Grok-3 models.
  - Marked Gemini-2.5-flash and Grok-3-mini as reasoning models, giving them more tokens
  to think.
@@ -298,7 +371,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ## [v15.6.1] - 2025-04-14
  ### Changed
  - Added more info about SQuAD-nl in the documentation. This was contributed by
- [@Rijgersberg](https://github.com/Rijgersberg)
+ @Rijgersberg ✨

  ### Fixed
  - The "E" option for the Norwegian NorCommonSenseQA dataset was not included in the
@@ -326,7 +399,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Uniformised the prompt templates used for each task, so that they are more
  consistent across tasks. Evaluation tests across different model types and sizes show
  no significant performance difference between the new and old templates. This was
- contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+ contributed by @viggo-gascou ✨

  ### Fixed
  - Avoid duplicate error messages when a rate limit occurs.
@@ -355,7 +428,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
  has been resolved.
  - Added overall progress indicator during evaluation. This was contributed by
- [@mathiasesn](https://github.com/mathiasesn)
+ @mathiasesn ✨

  ### Changed
  - Now does not use logprobs in text classification tasks with Google VertexAI models, as
@@ -394,9 +467,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ### Fixed
  - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
  compatibility < 8.0. This was contributed by
- [@marksverdhei](https://github.com/marksverdhei)
+ @marksverdhei ✨
  - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
- [@Alkarex](https://github.com/Alkarex)
+ @Alkarex ✨
  - Evaluating a specific model revision did not work for adapter models, as there was a
  confusion between the revision of the adapter and the revision of the base model. We
  now use the revision for the adapter and use the latest revision for the base model.
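Background for the fp16/bf16 fix above: native bfloat16 support starts at CUDA compute capability 8.0 (Ampere), and PyTorch can query the capability directly. A minimal sketch of that dtype selection, not necessarily EuroEval's exact logic:

```python
# Hedged sketch: fall back to float16 on pre-Ampere GPUs (capability < 8.0),
# since those lack native bfloat16 support.
import torch

if torch.cuda.is_available():
    dtype = (
        torch.bfloat16
        if torch.cuda.get_device_capability() >= (8, 0)
        else torch.float16
    )
else:
    dtype = torch.float32  # CPU fallback
print(dtype)
```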
@@ -422,7 +495,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  `HuggingFaceHubDown` exception.
  - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
  compatibility < 8.0. This was contributed by
- [@marksverdhei](https://github.com/marksverdhei)
+ @marksverdhei ✨
  - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
  the splits were made by considering the original train/validation/test splits.

@@ -447,7 +520,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  [MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
  dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
  named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
- was contributed by [@oliverkinch](https://github.com/oliverkinch)
+ was contributed by @oliverkinch ✨
  - Now extracts number of parameters and context length for Ollama models, using the
  `ollama` package. Vocabulary size is currently not available in the `ollama`
  package, so this is not extracted for Ollama models. For this reason, the `ollama`
@@ -500,7 +573,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
  recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
  unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
- contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+ contributed by @viggo-gascou ✨
  - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
  Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
  into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces
@@ -561,7 +634,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
  Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
  security on the system running the evaluations, if this is necessary. This was
- contributed by [@Mikeriess](https://github.com/Mikeriess)
+ contributed by @Mikeriess ✨


  ### Fixed
@@ -590,19 +663,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  [personal-sum](https://github.com/SmartmediaAI/PersonalSum). It has been split into
  121 / 64 / 256 samples for train / validation / test, respectively, and is set to
  `unofficial` for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨
  - Added the Jentoft dataset - a linguistic acceptability dataset which was published in
  [this Master's thesis](https://www.duo.uio.no/handle/10852/103885) by Matias Jentoft.
  The original dataset consists of 85,771 / 10,827 / 10,487 samples for training,
  validation and test, respectively. We use a split of 1,024 / 256 / 2,048 samples for
  training, validation and test, respectively. In each split, the distribution of
  `correct` and `incorrect` is 50/50. This dataset has been set to `unofficial` for now.
- This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+ This was contributed by @oliverkinch ✨
  - Added the dataset icelandic-knowledge, which is derived from the IcelandicQA dataset,
  reformatted as a knowledge dataset with GPT-4o generated candidate answers. The split
  is given by 845 / 128 / 1,024 for train, val, and test, respectively. It is marked as
  `unofficial` for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨

  ### Changed
  - Changed the instruction prompts to all text classification tasks by specifying
@@ -640,8 +713,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  dataset [OrangeSum](https://hf.co/datasets/EdinburghNLP/orange_sum).
  - Added support for evaluating local models again, which supports models stored in the
  Hugging Face format with a Hugging Face model configuration file (`config.json`) in
- the model directory. This was contributed by [@rlrs](https://github.com/rlrs) and
- [@peter-sk](https://github.com/peter-sk)
+ the model directory. This was contributed by @rlrs and
+ @peter-sk ✨

  ### Changed
  - Changed the Belebele splits, as there were too few training splits for evaluation on
@@ -861,7 +934,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  dataset NO-Multi-QA-Sum (norglm-multi-qa). This dataset is part of the NLEBench
  Norwegian benchmarks. The answers from the original dataset have been rephrased with
  gpt-4o to contain the answer from the context. It has been marked as `unofficial` for
- now. This was contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+ now. This was contributed by @viggo-gascou ✨
  - Added the sentiment classification part of the Icelandic dataset Hotter and Colder,
  being a gold standard dataset. As no Icelandic sentiment classification dataset was
  included in the benchmark previously, this is now the official Icelandic sentiment
@@ -880,18 +953,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Added the summarisation part of the Norwegian NorGLM multi-task human annotated
  dataset NO-Multi-QA-Sum (`norglm-multi-sum`). This dataset is part of the NLEBench
  Norwegian benchmarks. It has been marked as `unofficial` for now. This was contributed
- by [@viggo-gascou](https://github.com/viggo-gascou)
+ by @viggo-gascou ✨
  - Added `ice-linguistic`, a linguistic acceptability dataset which is a subset of the
  Icelandic Linguistic Benchmarks dataset. It is a small dataset with 94 train
  samples, 32 validation samples, and 256 test samples, and has been marked as
  `unofficial` for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨
  - Added `icelandic-qa`, an Icelandic question answering dataset about Icelandic culture
  and history. The original dataset has 2000 samples, but only 375 of the samples have
  answers that are found in the context (exact match). An LLM has therefore been used to
  rephrase the answers and we now have 1683 samples where the answers are found in the
  context (531 train, 128 val, 1024 test). It has been set to `unofficial` for now. This
- was contributed by [@oliverkinch](http://github.com/oliverkinch)
+ was contributed by @oliverkinch ✨

  ### Fixed
  - Small typo in prefix prompt used for few-shot evaluation of the English sentiment
@@ -903,21 +976,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ## [v13.1.0] - 2024-10-31
  - Added `ice-ec` (a subset of the dataset) and `ice-ec-full` (the full dataset), an
  Icelandic linguistic acceptability dataset. It has been set to `unofficial` for now.
- This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+ This was contributed by @oliverkinch ✨
  - Added the Schibsted summarisation dataset, which contains summaries of published
  articles from Schibsted Media's Norwegian and Swedish newsrooms. The dataset has been
  split into two separate small datasets, `schibsted-sv` for Swedish and `schibsted-no`
  for Norwegian. Note that both of these datasets are really small (89 and 374 test
  samples in `schibsted-sv` and `schibsted-no`, respectively), and have been set to
  `unofficial` for now. This was contributed by
- [@oliverkinch](https://github.com/oliverkinch)
+ @oliverkinch ✨
  - Added the Icelandic summarisation dataset IceSum. IceSum is a collection of 1,000
  Icelandic news articles from mbl.is, which have been manually annotated with
  summaries. The dataset has been marked as unofficial, meaning that it will not be
  automatically included when benchmarking models, but can be included by specifying the
  dataset explicitly using the --dataset argument (or dataset argument if using the
  Benchmarker API). This was contributed by
- [@viggo-gascou](https://github.com/viggo-gascou)
+ @viggo-gascou ✨
  - Added the new Faroese reading comprehension dataset FoQA. This is now the default
  Faroese reading comprehension benchmark, as there was none previously.
  - Now supports evaluation of models with adapters. This requires that the model
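As the IceSum entry above notes, unofficial datasets are skipped by default and must be requested explicitly. A hedged sketch of the `Benchmarker`-API route it mentions; whether the `dataset` argument is passed at construction or at call time is an assumption here.

```python
# Hedged sketch: opt in to an unofficial dataset via the `dataset` argument.
from euroeval import Benchmarker

benchmarker = Benchmarker(dataset="icesum")  # call pattern is assumed
```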
@@ -1219,7 +1292,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  ### Fixed
  - Move tensor to the correct device when benchmarking seq-to-seq models (#363). Thanks
- to [@ThomasKluiters](https://github.com/ThomasKluiters) for this contribution! :tada:
+ to @ThomasKluiters for this contribution! :tada:
  - Deals with the case where an instruction tuned model does not use any special token
  at the end of the chat, such as `<|im_end|>`. This holds for, e.g., Qwen models.
  - Better auto-detection of pipeline tag for models on the Hugging Face Hub, in case the
@@ -1233,7 +1306,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` need to have been set, or
  alternatively through the `--azure-openai-api-key`, `--azure-openai-endpoint` and
  `--azure-openai-api-version` arguments. Thanks to
- [@BramVanroy](https://github.com/BramVanroy) for all the help regarding the
+ @BramVanroy for all the help regarding the
  implementation of this :tada:
  - We now use the new JSON mode for newer OpenAI models for the NER task, to ensure
  better JSON generation.
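The Azure OpenAI entry above names the required configuration. A sketch of setting it via environment variables, with placeholder values; the `AZURE_OPENAI_API_KEY` name is inferred from the matching `--azure-openai-api-key` argument rather than stated outright.

```python
# Hedged sketch: the environment variables named in the entry above, with
# placeholder values. A .env file plus python-dotenv (a listed dependency)
# would work just as well.
import os

os.environ["AZURE_OPENAI_API_KEY"] = "<your-key>"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<resource>.openai.azure.com"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-01"  # placeholder version
```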
@@ -1744,7 +1817,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - A `--use-flash-attention` flag has been added, which enables Flash Attention 2.0,
  which is required by some models, such as Mistral-based ones. If `flash-attn` has not
  been installed then an informative error message will be raised. Thanks to
- [@peter-sk](https://github.com/peter-sk) for this contribution! :tada:
+ @peter-sk for this contribution! :tada:

  ### Changed
  - Now uses 8-bit AdamW whenever CUDA is available, as opposed to regular AdamW.
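The 8-bit AdamW line closing this hunk refers to the optimiser from `bitsandbytes`, which this package lists as a Linux-only extra. A generic usage sketch, not the project's actual fine-tuning code:

```python
# Hedged sketch: use the 8-bit AdamW variant when CUDA is available; it keeps
# optimiser state in 8 bits, cutting GPU memory use during fine-tuning.
import torch

model = torch.nn.Linear(768, 2)  # hypothetical tiny model

if torch.cuda.is_available():
    import bitsandbytes as bnb

    optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=2e-5)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
```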
@@ -1764,7 +1837,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  OpenAI models. This currently happens automatically when specifying a generative
  model from the Hugging Face Hub, and with all OpenAI models.
  - Now stores model caches in separate directories, enabling parallel evaluations.
- Thanks to [@KennethEnevoldsen](https://github.com/KennethEnevoldsen) for this
+ Thanks to @KennethEnevoldsen for this
  contribution! :tada:
  - Added `--device` argument to the CLI, which can be used to override the automatic
  detection of device (CPU, CUDA GPU, MPS GPU, TPU) to use.
1833
1906
  - Now added support for benchmarking local models in the Hugging Face format (i.e.,
1834
1907
  saved with the `save_pretrained` method). This automatically detects the framework
1835
1908
  based on the file extension, but can also be set using the new `--model-framework`
1836
- argument. Thanks to [@peter-sk](https://github.com/peter-sk) for implementing this!
1909
+ argument. Thanks to @peter-sk for implementing this!
1837
1910
  :tada:
1838
1911
 
1839
1912
  ### Fixed
@@ -2132,7 +2205,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
  - Specific branches/commits/tags can now be benchmarked, using the `@`
  delimiter. For instance, `scandeval -m model_id@commit_hash` will benchmark
  the model with model ID `model_id`, stored at commit with hash `commit_hash`.
- Thanks to [@versae](https://github.com/versae) for contributing! :tada:
+ Thanks to @versae for contributing! :tada:

  ## [v2.2.0] - 2022-01-18
@@ -2142,8 +2215,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

  ## [v2.1.0] - 2022-01-17
  ### Added
- - Added support for `flax` models. Thanks to
- [@versae](https://github.com/versae) for contributing! :tada:
+ - Added support for `flax` models. Thanks to @versae for contributing! :tada:


  ## [v2.0.0] - 2022-01-07
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.15.0
+ Version: 16.0.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -28,18 +28,19 @@ License: MIT License
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  SOFTWARE.
  License-File: LICENSE
- Requires-Python: <4.0,>=3.10
+ Requires-Python: <4.0,>=3.11
  Requires-Dist: accelerate>=1.9.0
  Requires-Dist: bert-score>=0.3.13
  Requires-Dist: click>=8.1.3
+ Requires-Dist: cloudpickle>=3.1.1
  Requires-Dist: datasets>=3.5.0
  Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
  Requires-Dist: huggingface-hub>=0.30.1
  Requires-Dist: levenshtein>=0.24.0
- Requires-Dist: litellm>=1.72.2
+ Requires-Dist: litellm>=1.75.6
  Requires-Dist: more-itertools>=10.5.0
- Requires-Dist: numpy<2.0.0,>=1.23.0
+ Requires-Dist: numpy>=2.0.0
  Requires-Dist: ollama>=0.5.1
  Requires-Dist: pandas>=2.2.0
  Requires-Dist: peft>=0.15.0
@@ -49,27 +50,22 @@ Requires-Dist: pyinfer>=0.0.3
  Requires-Dist: python-dotenv>=1.0.1
  Requires-Dist: rouge-score>=0.1.2
  Requires-Dist: sacremoses>=0.1.1
- Requires-Dist: scikit-learn<1.6.0
+ Requires-Dist: scikit-learn==1.6.1
  Requires-Dist: sentencepiece>=0.1.96
  Requires-Dist: seqeval>=1.2.2
  Requires-Dist: setuptools>=75.8.2
  Requires-Dist: tenacity>=9.0.0
  Requires-Dist: termcolor>=2.0.0
  Requires-Dist: torch>=2.6.0
- Requires-Dist: transformers>=4.51.0
+ Requires-Dist: transformers[mistral-common]>=4.56.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
- Provides-Extra: human-evaluation
- Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
- Provides-Extra: test
- Requires-Dist: gradio>=4.26.0; extra == 'test'
+ Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
  Description-Content-Type: text/markdown

  <div align='center'>
@@ -223,16 +219,18 @@ A huge thank you to all the contributors who have helped make this project a suc
  <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
  <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
  <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+ <a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
  <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
  <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
  <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
  <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
  <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
- <a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
  <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
  <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
  <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
  <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+ <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+ <a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


  ### Contribute to EuroEval
@@ -149,16 +149,18 @@ A huge thank you to all the contributors who have helped make this project a suc
  <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
  <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
  <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+ <a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
  <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
  <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
  <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
  <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
  <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
- <a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
  <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
  <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
  <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
  <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+ <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+ <a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>


  ### Contribute to EuroEval