EuroEval 15.16.0__tar.gz → 16.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic.

Files changed (278)
  1. {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
  2. {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +2 -1
  3. {euroeval-15.16.0 → euroeval-16.0.1}/.github/workflows/ci.yaml +22 -14
  4. {euroeval-15.16.0 → euroeval-16.0.1}/.pre-commit-config.yaml +4 -1
  5. {euroeval-15.16.0 → euroeval-16.0.1}/CHANGELOG.md +111 -36
  6. {euroeval-15.16.0 → euroeval-16.0.1}/PKG-INFO +13 -14
  7. {euroeval-15.16.0 → euroeval-16.0.1}/README.md +2 -1
  8. euroeval-16.0.1/docs/datasets/estonian.md +544 -0
  9. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/icelandic.md +11 -11
  10. euroeval-16.0.1/docs/datasets/latvian.md +536 -0
  11. euroeval-16.0.1/docs/leaderboards/Monolingual/portuguese.md +23 -0
  12. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/README.md +6 -16
  13. {euroeval-15.16.0 → euroeval-16.0.1}/makefile +5 -2
  14. {euroeval-15.16.0 → euroeval-16.0.1}/pyproject.toml +13 -17
  15. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/__init__.py +8 -7
  16. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_config_factory.py +3 -7
  17. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/base.py +35 -19
  18. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/fresh.py +24 -19
  19. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/hf.py +136 -154
  20. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/litellm.py +190 -110
  21. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/vllm.py +199 -139
  22. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmarker.py +49 -22
  23. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/cli.py +3 -3
  24. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/constants.py +19 -15
  25. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/data_loading.py +33 -28
  26. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/data_models.py +73 -23
  27. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/__init__.py +2 -0
  28. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/danish.py +35 -1
  29. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/dutch.py +38 -1
  30. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/english.py +38 -1
  31. euroeval-16.0.1/src/euroeval/dataset_configs/estonian.py +95 -0
  32. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/faroese.py +38 -0
  33. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/finnish.py +39 -1
  34. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/french.py +38 -1
  35. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/german.py +38 -1
  36. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/icelandic.py +39 -1
  37. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/italian.py +38 -1
  38. euroeval-16.0.1/src/euroeval/dataset_configs/latvian.py +81 -0
  39. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/norwegian.py +38 -1
  40. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/portuguese.py +38 -1
  41. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/spanish.py +38 -1
  42. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/swedish.py +38 -1
  43. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/enums.py +0 -6
  44. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/finetuning.py +6 -6
  45. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/generation.py +25 -14
  46. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/generation_utils.py +90 -20
  47. euroeval-16.0.1/src/euroeval/languages.py +966 -0
  48. euroeval-16.0.1/src/euroeval/metrics/__init__.py +6 -0
  49. euroeval-16.0.1/src/euroeval/metrics/base.py +76 -0
  50. euroeval-16.0.1/src/euroeval/metrics/huggingface.py +192 -0
  51. euroeval-16.0.1/src/euroeval/metrics/llm_as_a_judge.py +257 -0
  52. euroeval-16.0.1/src/euroeval/metrics/pipeline.py +276 -0
  53. euroeval-16.0.1/src/euroeval/metrics/speed.py +51 -0
  54. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_cache.py +13 -1
  55. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  56. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/multiple_choice.py +23 -2
  57. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/named_entity_recognition.py +65 -2
  58. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/reading_comprehension.py +42 -2
  59. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/sentiment_classification.py +46 -2
  60. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/summarization.py +24 -4
  61. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/scores.py +7 -2
  62. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/speed_benchmark.py +6 -6
  63. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  64. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/question_answering.py +35 -28
  65. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/sequence_classification.py +128 -42
  66. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/text_to_text.py +7 -3
  67. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/token_classification.py +59 -73
  68. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/tasks.py +33 -6
  69. euroeval-16.0.1/src/euroeval/tokenization_utils.py +585 -0
  70. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/utils.py +150 -35
  71. euroeval-16.0.1/src/scripts/create_copa_lv.py +143 -0
  72. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danish_citizen_tests.py +3 -2
  73. euroeval-16.0.1/src/scripts/create_err_news.py +83 -0
  74. euroeval-16.0.1/src/scripts/create_estner.py +115 -0
  75. euroeval-16.0.1/src/scripts/create_estonian_valence.py +86 -0
  76. euroeval-16.0.1/src/scripts/create_european_values.py +289 -0
  77. euroeval-16.0.1/src/scripts/create_exam_et.py +136 -0
  78. euroeval-16.0.1/src/scripts/create_fullstack_ner.py +248 -0
  79. euroeval-16.0.1/src/scripts/create_grammar_et.py +74 -0
  80. euroeval-16.0.1/src/scripts/create_latvian_lsm_summary.py +92 -0
  81. euroeval-16.0.1/src/scripts/create_latvian_twitter_sentiment.py +109 -0
  82. euroeval-16.0.1/src/scripts/create_mmlu_lv.py +263 -0
  83. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_multi_wiki_qa.py +1 -0
  84. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scala.py +4 -0
  85. euroeval-16.0.1/src/scripts/create_wikiann_lv.py +116 -0
  86. euroeval-16.0.1/src/scripts/create_winogrande_et.py +90 -0
  87. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/load_ud_pos.py +36 -0
  88. {euroeval-15.16.0 → euroeval-16.0.1}/tests/conftest.py +2 -19
  89. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_hf.py +10 -13
  90. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmarker.py +0 -44
  91. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_cli.py +2 -2
  92. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_data_loading.py +15 -8
  93. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_data_models.py +2 -2
  94. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_scores.py +1 -1
  95. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_tokenization_utils.py +7 -7
  96. {euroeval-15.16.0 → euroeval-16.0.1}/uv.lock +1389 -2201
  97. euroeval-15.16.0/src/euroeval/human_evaluation.py +0 -738
  98. euroeval-15.16.0/src/euroeval/languages.py +0 -206
  99. euroeval-15.16.0/src/euroeval/metrics.py +0 -470
  100. euroeval-15.16.0/src/euroeval/tokenization_utils.py +0 -498
  101. euroeval-15.16.0/tests/test_human_evaluation.py +0 -8
  102. {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  103. {euroeval-15.16.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  104. {euroeval-15.16.0 → euroeval-16.0.1}/.gitignore +0 -0
  105. {euroeval-15.16.0 → euroeval-16.0.1}/CITATION.cff +0 -0
  106. {euroeval-15.16.0 → euroeval-16.0.1}/CODE_OF_CONDUCT.md +0 -0
  107. {euroeval-15.16.0 → euroeval-16.0.1}/CONTRIBUTING.md +0 -0
  108. {euroeval-15.16.0 → euroeval-16.0.1}/Dockerfile.cuda +0 -0
  109. {euroeval-15.16.0 → euroeval-16.0.1}/LICENSE +0 -0
  110. {euroeval-15.16.0 → euroeval-16.0.1}/NEW_DATASET_GUIDE.md +0 -0
  111. {euroeval-15.16.0 → euroeval-16.0.1}/docs/CNAME +0 -0
  112. {euroeval-15.16.0 → euroeval-16.0.1}/docs/README.md +0 -0
  113. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/README.md +0 -0
  114. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/danish.md +0 -0
  115. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/dutch.md +0 -0
  116. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/english.md +0 -0
  117. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/faroese.md +0 -0
  118. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/finnish.md +0 -0
  119. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/french.md +0 -0
  120. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/german.md +0 -0
  121. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/italian.md +0 -0
  122. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/norwegian.md +0 -0
  123. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/portuguese.md +0 -0
  124. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/spanish.md +0 -0
  125. {euroeval-15.16.0 → euroeval-16.0.1}/docs/datasets/swedish.md +0 -0
  126. {euroeval-15.16.0 → euroeval-16.0.1}/docs/extras/radial_plotter.md +0 -0
  127. {euroeval-15.16.0 → euroeval-16.0.1}/docs/faq.md +0 -0
  128. {euroeval-15.16.0 → euroeval-16.0.1}/docs/gfx/favicon.png +0 -0
  129. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  130. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  131. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/english.md +0 -0
  132. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  133. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
  134. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/french.md +0 -0
  135. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/german.md +0 -0
  136. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  137. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  138. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  139. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  140. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  141. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/european.md +0 -0
  142. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  143. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  144. {euroeval-15.16.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  145. {euroeval-15.16.0 → euroeval-16.0.1}/docs/methodology.md +0 -0
  146. {euroeval-15.16.0 → euroeval-16.0.1}/docs/python-package.md +0 -0
  147. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/README.md +0 -0
  148. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/common-sense-reasoning.md +0 -0
  149. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/knowledge.md +0 -0
  150. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/linguistic-acceptability.md +0 -0
  151. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/named-entity-recognition.md +0 -0
  152. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/reading-comprehension.md +0 -0
  153. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/sentiment-classification.md +0 -0
  154. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/speed.md +0 -0
  155. {euroeval-15.16.0 → euroeval-16.0.1}/docs/tasks/summarization.md +0 -0
  156. {euroeval-15.16.0 → euroeval-16.0.1}/gfx/euroeval.png +0 -0
  157. {euroeval-15.16.0 → euroeval-16.0.1}/gfx/euroeval.xcf +0 -0
  158. {euroeval-15.16.0 → euroeval-16.0.1}/gfx/scandeval.png +0 -0
  159. {euroeval-15.16.0 → euroeval-16.0.1}/mkdocs.yaml +0 -0
  160. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  161. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/callbacks.py +0 -0
  162. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/exceptions.py +0 -0
  163. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_config.py +0 -0
  164. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/model_loading.py +0 -0
  165. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  166. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  167. {euroeval-15.16.0 → euroeval-16.0.1}/src/euroeval/types.py +0 -0
  168. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/constants.py +0 -0
  169. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_allocine.py +0 -0
  170. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_angry_tweets.py +0 -0
  171. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_arc.py +0 -0
  172. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_arc_is.py +0 -0
  173. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_belebele.py +0 -0
  174. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_boolq_pt.py +0 -0
  175. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_cnn_dailymail.py +0 -0
  176. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_en.py +0 -0
  177. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_es.py +0 -0
  178. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_conll_nl.py +0 -0
  179. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dane.py +0 -0
  180. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dansk.py +0 -0
  181. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader.py +0 -0
  182. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  183. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dbrd.py +0 -0
  184. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_dutch_cola.py +0 -0
  185. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_eltec.py +0 -0
  186. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fone.py +0 -0
  187. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_foqa.py +0 -0
  188. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fosent.py +0 -0
  189. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_fquad.py +0 -0
  190. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_germanquad.py +0 -0
  191. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_germeval.py +0 -0
  192. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_goldenswag.py +0 -0
  193. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_harem.py +0 -0
  194. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hellaswag.py +0 -0
  195. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hellaswag_fi.py +0 -0
  196. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  197. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_ice_linguistic.py +0 -0
  198. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  199. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  200. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icelandic_qa.py +0 -0
  201. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_icesum.py +0 -0
  202. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_idioms_no.py +0 -0
  203. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_ilpost_sum.py +0 -0
  204. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_jentoft.py +0 -0
  205. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_life_in_the_uk.py +0 -0
  206. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mim_gold_ner.py +0 -0
  207. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlqa_es.py +0 -0
  208. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlsum_de.py +0 -0
  209. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mlsum_es.py +0 -0
  210. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_mmlu.py +0 -0
  211. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_multinerd-it.py +0 -0
  212. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_no_cola.py +0 -0
  213. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_no_sammendrag.py +0 -0
  214. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  215. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nordjylland_news.py +0 -0
  216. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norec.py +0 -0
  217. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norglm_multiqa.py +0 -0
  218. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norglm_multisum.py +0 -0
  219. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norne.py +0 -0
  220. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_norquad.py +0 -0
  221. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nqii.py +0 -0
  222. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  223. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_orange_sum.py +0 -0
  224. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_personal_sum.py +0 -0
  225. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_publico.py +0 -0
  226. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_rrn.py +0 -0
  227. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sb10k.py +0 -0
  228. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scandiqa.py +0 -0
  229. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_scandisent_fi.py +0 -0
  230. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_schibsted.py +0 -0
  231. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  232. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sentipolc16.py +0 -0
  233. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad.py +0 -0
  234. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_it.py +0 -0
  235. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_nl.py +0 -0
  236. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_squad_nl_old.py +0 -0
  237. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sst2_pt.py +0 -0
  238. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_sst5.py +0 -0
  239. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_suc3.py +0 -0
  240. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_swedn.py +0 -0
  241. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_swerec.py +0 -0
  242. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_turku_ner_fi.py +0 -0
  243. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_tydiqa_fi.py +0 -0
  244. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  245. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wikiann_fo.py +0 -0
  246. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_wikineural-it.py +0 -0
  247. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_winogrande_is.py +0 -0
  248. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_xlsum_fi.py +0 -0
  249. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/create_xquad_es.py +0 -0
  250. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/fix_dot_env_file.py +0 -0
  251. {euroeval-15.16.0 → euroeval-16.0.1}/src/scripts/versioning.py +0 -0
  252. {euroeval-15.16.0 → euroeval-16.0.1}/tests/__init__.py +0 -0
  253. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_config_factory.py +0 -0
  254. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/__init__.py +0 -0
  255. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_base.py +0 -0
  256. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
  257. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
  258. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
  259. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_callbacks.py +0 -0
  260. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_constants.py +0 -0
  261. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_dataset_configs.py +0 -0
  262. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_enums.py +0 -0
  263. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_exceptions.py +0 -0
  264. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_finetuning.py +0 -0
  265. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_generation.py +0 -0
  266. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_languages.py +0 -0
  267. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_cache.py +0 -0
  268. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_config.py +0 -0
  269. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_model_loading.py +0 -0
  270. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_speed_benchmark.py +0 -0
  271. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/__init__.py +0 -0
  272. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_question_answering.py +0 -0
  273. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
  274. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_text_to_text.py +0 -0
  275. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_task_utils/test_token_classification.py +0 -0
  276. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_tasks.py +0 -0
  277. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_types.py +0 -0
  278. {euroeval-15.16.0 → euroeval-16.0.1}/tests/test_utils.py +0 -0
@@ -25,12 +25,14 @@ body:
       - label: Danish
       - label: Dutch
       - label: English
+      - label: Estonian
       - label: Faroese
       - label: Finnish
       - label: French
       - label: German
       - label: Icelandic
       - label: Italian
+      - label: Latvian
       - label: Norwegian (Bokmål or Nynorsk)
       - label: Portuguese
       - label: Spanish
@@ -21,7 +21,8 @@ body:
       - label: Romance languages (French, Italian, Portuguese, Spanish)
       - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
       - label: West Germanic languages (Dutch, English, German)
-      - label: Finnish
+      - label: Finnic languages (Estonian, Finnish)
+      - label: Latvian
     validations:
       required: true
   - type: dropdown
@@ -22,16 +22,19 @@ jobs:
       pull-requests: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
         with:
           persist-credentials: false
-      - uses: actions/setup-python@v5
+          ref: main
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: "3.11"
-      - run: python -m pip install pre-commit
-        shell: bash
-      - run: pre-commit run --show-diff-on-failure --color=always --all-files
-        shell: bash
+
+      - name: Run pre-commit hooks
+        uses: pre-commit/action@v3.0.1
 
   pytest-linux:
     if: github.event.pull_request.draft == false
@@ -40,21 +43,22 @@ jobs:
       pull-requests: write
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.11", "3.12", "3.13"]
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
         with:
          persist-credentials: false
+          ref: main
 
       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@v6
         with:
           enable-cache: false
           python-version: ${{ matrix.python-version }}
 
       - name: Install Dependencies
-        run: uv sync --no-dev --extra test
+        run: uv sync --no-dev
 
       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &
@@ -79,21 +83,25 @@ jobs:
       pull-requests: write
     runs-on: macos-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
+        with:
+          persist-credentials: false
+          ref: main
 
       - name: Install uv and set up Python
-        uses: astral-sh/setup-uv@v4
+        uses: astral-sh/setup-uv@v6
         with:
+          enable-cache: false
           python-version: ${{ matrix.python-version }}
 
       - name: Install Dependencies
-        run: uv sync --no-dev --extra test
+        run: uv sync --no-dev
 
       - name: Start Ollama server
         run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &
 
       - name: Test with pytest
-        run: uv run pytest
+        run: uv run pytest -vvv
         env:
           HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
@@ -10,18 +10,21 @@ repos:
       - id: trailing-whitespace
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.8
+    rev: v0.12.12
     hooks:
       - id: ruff
        args:
          - --fix
          - --unsafe-fixes
          - --exit-non-zero-on-fix
+          - --no-cache
        types_or:
          - python
          - pyi
          - jupyter
       - id: ruff-format
+        args:
+          - --no-cache
        types_or:
          - python
          - pyi
@@ -10,6 +10,82 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+## [v16.0.1] - 2025-09-07
+### Fixed
+- Fixed a bug causing encoders to fail when evaluating on the Exam-et dataset.
+- Previously we would abort an evaluation completely if the model produced a single
+  invalid output on a classification task. As individual samples rarely have a great
+  influence on the overall score, we now simply assign the closest label to the sample
+  and continue the evaluation (see the sketch below). This is logged, so that the user
+  is aware of it. Some tasks are more sensitive to individual samples, such as European
+  values, where we still abort the evaluation if a single sample is invalid.
+- Fixed a bug where logprobs were not used for classification tasks when evaluating
+  generative models, because we raised the number of generated tokens to 10 for such
+  tasks. This did not affect the results, but it meant that some evaluations failed.
+- Now includes FlashInfer as a dependency, as it is required by vLLM.
+- Changed the choices in European values to use letters, like the other multiple-choice
+  tasks, rather than numbers. Aside from ensuring consistency, this avoids the issue
+  where '10' and '1' often share the same first token ('1'), which prevented us from
+  using logprobs to determine the answer.
+
+
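The closest-label fallback and the letters-versus-numbers fix above lend themselves to a short illustration. Below is a minimal sketch, assuming the `levenshtein` package (already a EuroEval dependency) and illustrative label names, not the actual implementation:

```python
import Levenshtein


def closest_label(output: str, labels: list[str]) -> str:
    """Map an invalid model output to the nearest valid label, so that a
    single bad sample no longer aborts a whole classification evaluation."""
    return min(labels, key=lambda label: Levenshtein.distance(output.lower(), label.lower()))


print(closest_label("positiv", ["positive", "neutral", "negative"]))  # -> positive

# Why numeric choices break first-token logprob scoring: '1' and '10'
# share the first character (and typically the first token), so their
# logprobs cannot be told apart. Letters stay pairwise distinct.
numeric = [str(i) for i in range(1, 11)]
letters = list("abcdefghij")
print(len({c[0] for c in numeric}))  # 9  -- '1' and '10' collide
print(len({c[0] for c in letters}))  # 10 -- no collisions
```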
+## [v16.0.0] - 2025-09-05
+### Added
+- Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
+  Latvian Twitter Sentiment, the linguistic acceptability dataset ScaLA-lv, the named
+  entity recognition datasets FullStack-NER-lv and WikiANN-lv, the reading comprehension
+  dataset MultiWikiQA, the knowledge dataset MMLU-lv, the common-sense reasoning
+  dataset COPA-lv, and the summarisation dataset LSM.
+- Added support for Estonian 🇪🇪! It includes the sentiment classification dataset
+  Estonian Valence, the linguistic acceptability datasets Grammar-et and ScaLA-et, the
+  named entity recognition dataset EstNER, the reading comprehension dataset
+  MultiWikiQA-et, the summarisation dataset ERRNews, the knowledge dataset Exam-et,
+  and the common-sense reasoning dataset Winogrande-et. This was contributed by
+  @slowwavesleep ✨
+- It is now possible to evaluate how well a model adheres to European values! 🇪🇺 This
+  probes the model with 53 questions from the European Values Survey, chosen via an
+  optimisation procedure that maximises agreement across the EU. We then measure how
+  well the model's answers align with the distribution of answers across the EU, using
+  a tree-based kernel density estimation (see the sketch below). This can only be used
+  zero-shot, and only with instruction-based decoder models (including reasoning
+  models).
+
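The alignment measure in the European values entry can be pictured with scikit-learn's `KernelDensity`, which is backed by a KD-tree or ball tree (and scikit-learn is already a dependency). The survey data, kernel, and bandwidth below are invented for illustration; the actual questions and procedure are not part of this diff:

```python
import numpy as np
from sklearn.neighbors import KernelDensity

# Hypothetical EU-wide answers to one survey question on a 1-10 scale.
eu_answers = np.array([7, 8, 6, 9, 7, 8, 5, 8], dtype=float).reshape(-1, 1)

# Fit a tree-based kernel density estimate of the EU answer distribution.
kde = KernelDensity(kernel="gaussian", bandwidth=0.75).fit(eu_answers)

# A model answering where the EU distribution is dense scores higher.
model_answer = np.array([[8.0]])
alignment = float(np.exp(kde.score_samples(model_answer))[0])
print(f"Density at the model's answer: {alignment:.3f}")
```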
+### Changed
+- When evaluating classification tasks, we now force the model to output one of the
+  labels. This is done directly with open models, and via a JSON schema for API models.
+  This won't change the results for existing tasks, as logprobs are used, but it was
+  required to measure the European values.
+- Updated the `vllm` dependency to `>=0.10.1`, which includes GPT-OSS support.
+- Updated the `numpy` dependency to `>=2.0.0`, as the previous version clash no longer
+  applies.
+- Updated the `transformers` dependency to `>=4.56.0`, which includes support for more
+  models.
+- Now requires Python >=3.11, as Python 3.10 does not support structured generation
+  with a dynamic set of choices (`Literal[*list_of_choices]` is not supported; see the
+  sketch below).
+
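The forced-label output and the new Python floor are connected: building a dynamic set of choices for structured generation relies on star-unpacking inside a subscript, which only parses on Python 3.11+. A sketch, assuming pydantic (which the changelog does not name) is what turns the choices into a JSON schema:

```python
from typing import Literal

from pydantic import create_model

labels = ["positive", "neutral", "negative"]

# `Literal[*labels]` is a SyntaxError on Python 3.10 but parses on 3.11+.
LabelType = Literal[*labels]  # type: ignore[valid-type]

# A response model whose JSON schema can be sent along with an API
# request, forcing the model to answer with exactly one of the labels.
Answer = create_model("Answer", label=(LabelType, ...))
print(Answer.model_json_schema())
```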
+### Fixed
+- Enabled support for evaluating Mistral models with their custom `mistral-common`
+  tokeniser, which includes all recent Mistral models. Note that we currently assume
+  that all of these models are instruction-tuned decoder models (which _is_ true
+  currently), which can lead to errors in case they publish different types of models
+  in the future.
+- Now disables the `seed` parameter if the API inference model does not support it,
+  which previously prevented evaluating some models.
+- Now correctly detects an API inference model as non-existing, even if LiteLLM *does*
+  see it as existing. We have an additional check during evaluation to ensure this now.
+- Catch an `ImportError` that sometimes happens when finishing the evaluation of a
+  vLLM model, during shutdown.
+- Now uses `litellm>=1.75.6`, which fixes an issue related to evaluation of GPT-5
+  models using Ollama.
+- Now always uses the `multiprocessing` backend when evaluating vLLM models, rather
+  than reverting to `ray` when using multiple GPUs, as `ray` led to evaluations of
+  several models freezing (see the sketch below).
+- Now does not require the user to be logged in to Hugging Face to benchmark public
+  models on the Hugging Face Hub.
+
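The multiprocessing-backend fix presumably amounts to pinning vLLM's distributed executor rather than letting it fall back to `ray` on multi-GPU runs; a sketch under that assumption, with a placeholder model id:

```python
from vllm import LLM

# Pin the multiprocessing executor rather than letting vLLM pick `ray`
# when tensor_parallel_size > 1 (the freezes described above).
llm = LLM(
    model="<model-id>",
    tensor_parallel_size=2,
    distributed_executor_backend="mp",
)
```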
+### Removed
+- Removed support for human evaluation, as it was not actively maintained and not used.
+
+
 ## [v15.16.0] - 2025-08-12
 ### Added
 - Added metadata for GPT-5 models.
@@ -32,7 +108,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the common-sense reasoning dataset GoldenSwag for the following
   languages: Danish, German, Spanish, Finnish, French, Italian, Dutch, Swedish.
   The datasets are unofficial for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 
 ### Changed
 - Now allows metadata to be included in metrics, allowing more flexibility when
@@ -88,7 +164,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   acceptability dataset ScaLA-pt. The machine translated ones include the sentiment
   classification dataset SST-2, the multiple choice reading comprehension dataset BoolQ,
   the knowledge dataset MMLU, and the common-sense reasoning dataset GoldenSwag. This
-  was contributed by [@duarteocarmo](https://github.com/duarteocarmo)
+  was contributed by @duarteocarmo ✨
 - Added `--gpu-memory-utilization` argument (`gpu_memory_utilization` in the
   `Benchmarker` API), which can be lowered in case the user is experiencing OOM errors
   when evaluating models (see the sketch below). The default is 0.9 (same as
   previously), which means that vLLM
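For reference, the `gpu_memory_utilization` knob mentioned above is exposed both on the CLI and on the `Benchmarker` API; a minimal usage sketch with a placeholder model id:

```python
from euroeval import Benchmarker

# Lower vLLM's GPU memory utilisation (default 0.9) when hitting OOM errors.
benchmarker = Benchmarker(gpu_memory_utilization=0.8)
benchmarker.benchmark(model="<model-id>")
```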
@@ -108,11 +184,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the English knowledge dataset Life in the UK, which has been added as an
   official dataset, replacing the existing English knowledge dataset MMLU, which in turn
   has been marked as unofficial now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 - Added the Norwegian knowledge dataset Idioms-no, which is a multiple-choice question
   dataset where the alternative answers have been generated using GPT-4o. This has been
   added as an official dataset, and was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 - Added new `LLMAsAJudgeMetric`, which allows evaluating the performance of a model with
   another judge model. This is useful for evaluating models in a reference-free manner,
   or if the metric is sufficiently complex. It is currently not used in any task, but
@@ -216,11 +292,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
   unofficial for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 
 ### Changed
 - Now uses asynchronous requests when dealing with API models, speeding up the generation
-  immensely. This was contributed by [@mathiasesn](https://github.com/mathiasesn)
+  immensely. This was contributed by @mathiasesn ✨
 
 ### Fixed
 - Add HellaSwag-fi back in, as the issue with the labels in the test split has been
@@ -272,7 +348,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [XL-Sum-fi](https://huggingface.co/datasets/TurkuNLP/xlsum-fi), and the
   common-sense reasoning dataset
   [HellaSwag-fi](https://huggingface.co/datasets/Finnish-NLP/hellaswag-fi-google-translate).
-  This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+  This was contributed by @oliverkinch ✨
 - Added metadata for GPT-4.1 and Grok-3 models.
 - Marked Gemini-2.5-flash and Grok-3-mini as reasoning models, giving them more tokens
   to think.
@@ -315,7 +391,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v15.6.1] - 2025-04-14
 ### Changed
 - Added more info about SQuAD-nl in the documentation. This was contributed by
-  [@Rijgersberg](https://github.com/Rijgersberg)
+  @Rijgersberg ✨
 
 ### Fixed
 - The "E" option for the Norwegian NorCommonSenseQA dataset was not included in the
@@ -343,7 +419,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Uniformised the prompt templates used for each task, so that they are more
   consistent across tasks. Evaluation tests across different model types and sizes show
   no significant performance difference between the new and old templates. This was
-  contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+  contributed by @viggo-gascou ✨
 
 ### Fixed
 - Avoid duplicate error messages when a rate limit occurs.
@@ -372,7 +448,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Allows all vLLM versions from v0.8.0 again, as the issue with the generation output
   has been resolved.
 - Added overall progress indicator during evaluation. This was contributed by
-  [@mathiasesn](https://github.com/mathiasesn)
+  @mathiasesn ✨
 
 ### Changed
 - Now does not use logprobs in text classification tasks with Google VertexAI models, as
@@ -411,9 +487,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-  [@marksverdhei](https://github.com/marksverdhei)
+  @marksverdhei ✨
 - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
-  [@Alkarex](https://github.com/Alkarex)
+  @Alkarex ✨
 - Evaluating a specific model revision did not work for adapter models, as there was a
   confusion between the revision of the adapter and the revision of the base model. We
   now use the revision for the adapter and use the latest revision for the base model.
@@ -439,7 +515,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `HuggingFaceHubDown` exception.
 - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
   compatibility < 8.0. This was contributed by
-  [@marksverdhei](https://github.com/marksverdhei)
+  @marksverdhei ✨
 - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
   the splits were made by considering the original train/validation/test splits.
 
@@ -464,7 +540,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [MMLU-es](https://hf.co/datasets/alexandrainst/m_mmlu), the common-sense reasoning
   dataset [HellaSwag-es](https://hf.co/datasets/alexandrainst/m_hellaswag), and the
   named entity recognition dataset [CoNLL-es](https://aclanthology.org/W02-2024/). This
-  was contributed by [@oliverkinch](https://github.com/oliverkinch)
+  was contributed by @oliverkinch ✨
 - Now extracts number of parameters and context length for Ollama models, using the
   `ollama` package. Vocabulary size is currently not available in the `ollama`
   package, so this is not extracted for Ollama models. For this reason, the `ollama`
@@ -517,7 +593,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [MMLU-it](https://hf.co/datasets/alexandrainst/m_mmlu), and the named entity
   recognition dataset [MultiNERD IT](https://hf.co/datasets/Babelscape/multinerd) (and
   unofficially [WikiNEuRal IT](https://hf.co/datasets/Babelscape/wikineural)). This was
-  contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+  contributed by @viggo-gascou ✨
 - Added the new Norwegian knowledge dataset NRK-Quiz-QA, consisting of quizzes on the
   Norwegian language and culture, in both Bokmål and Nynorsk. The dataset has been split
   into 635 / 256 / 2,048 samples for train, val, and test, respectively. This replaces
@@ -578,7 +654,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added new `--only-allow-safetensors` flag, which disallows evaluating models from the
   Hugging Face Hub if they are not stored as safetensors. This ensures a high level of
   security on the system running the evaluations, if this is necessary. This was
-  contributed by [@Mikeriess](https://github.com/Mikeriess)
+  contributed by @Mikeriess ✨
 
 
 ### Fixed
@@ -607,19 +683,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   [personal-sum](https://github.com/SmartmediaAI/PersonalSum). It has been split into
   121 / 64 / 256 samples for train / validation / test, respectively, and is set to
   `unofficial` for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 - Added the Jentoft dataset - a linguistic acceptability dataset which was published in
   [this Master's thesis](https://www.duo.uio.no/handle/10852/103885) by Matias Jentoft.
   The original dataset consists of 85,771 / 10,827 / 10,487 samples for training,
   validation and test, respectively. We use a split of 1,024 / 256 / 2,048 samples for
   training, validation and test, respectively. In each split, the distribution of
   `correct` and `incorrect` is 50/50. This dataset has been set to `unofficial` for now.
-  This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+  This was contributed by @oliverkinch ✨
 - Added the dataset icelandic-knowledge, which is derived from the IcelandicQA dataset,
   reformatted as a knowledge dataset with GPT-4o generated candidate answers. The split
   is given by 845 / 128 / 1024 for train, val, and test, respectively. It is marked as
   `unofficial` for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 
 ### Changed
 - Changed the instruction prompts to all text classification tasks by specifying
@@ -657,8 +733,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset [OrangeSum](https://hf.co/datasets/EdinburghNLP/orange_sum).
 - Added support for evaluating local models again, which supports models stored in the
   Hugging Face format with a Hugging Face model configuration file (`config.json`) in
-  the model directory. This was contributed by [@rlrs](https://github.com/rlrs) and
-  [@peter-sk](https://github.com/peter-sk)
+  the model directory. This was contributed by @rlrs and
+  @peter-sk ✨
 
 ### Changed
 - Changed the Belebele splits, as there were too few training splits for evaluation on
@@ -878,7 +954,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   dataset NO-Multi-QA-Sum (norglm-multi-qa). This dataset is part of the NLEBench
   Norwegian benchmarks. The answers from the original dataset have been rephrased with
   gpt-4o to contain the answer from the context. It has been marked as `unofficial` for
-  now. This was contributed by [@viggo-gascou](https://github.com/viggo-gascou)
+  now. This was contributed by @viggo-gascou ✨
 - Added the sentiment classification part of the Icelandic dataset Hotter and Colder,
   being a gold standard dataset. As no Icelandic sentiment classification dataset was
   included in the benchmark previously, this is now the official Icelandic sentiment
@@ -897,18 +973,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Added the summarisation part of the Norwegian NorGLM multi-task human annotated
   dataset NO-Multi-QA-Sum (`norglm-multi-sum`). This dataset is part of the NLEBench
   Norwegian benchmarks. It has been marked as `unofficial` for now. This was contributed
-  by [@viggo-gascou](https://github.com/viggo-gascou)
+  by @viggo-gascou ✨
 - Added `ice-linguistic`, a linguistic acceptability dataset which is a subset of the
   Icelandic Linguistic Benchmarks dataset. It is a small dataset with 94 train
   samples, 32 validation samples, and 256 test samples, and has been marked as
   `unofficial` for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 - Added `icelandic-qa`, an Icelandic question answering dataset about Icelandic culture
   and history. The original dataset has 2000 samples, but only 375 of the samples have
   answers that are found in the context (exact match). An LLM has therefore been used to
   rephrase the answers and we now have 1683 samples where the answers are found in the
   context (531 train, 128 val, 1024 test). It has been set to `unofficial` for now. This
-  was contributed by [@oliverkinch](http://github.com/oliverkinch)
+  was contributed by @oliverkinch ✨
 
 ### Fixed
 - Small typo in prefix prompt used for few-shot evaluation of the English sentiment
@@ -920,21 +996,21 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [v13.1.0] - 2024-10-31
 - Added `ice-ec` (a subset of the dataset) and `ice-ec-full` (the full dataset), an
   Icelandic linguistic acceptability dataset. It has been set to `unofficial` for now.
-  This was contributed by [@oliverkinch](https://github.com/oliverkinch)
+  This was contributed by @oliverkinch ✨
 - Added the Schibsted summarisation dataset, which contains summaries of published
   articles from Schibsted Media's Norwegian and Swedish newsrooms. The dataset has been
   split into two separate small datasets, `schibsted-sv` for Swedish and `schibsted-no`
   for Norwegian. Note that both of these datasets are really small (89 and 374 test
   samples in `schibsted-sv` and `schibsted-no`, respectively), and have been set to
   `unofficial` for now. This was contributed by
-  [@oliverkinch](https://github.com/oliverkinch)
+  @oliverkinch ✨
 - Added the Icelandic summarisation dataset IceSum. IceSum is a collection of 1,000
   Icelandic news articles from mbl.is, which have been manually annotated with
   summaries. The dataset has been marked as unofficial, meaning that it will not be
   automatically included when benchmarking models, but can be included by specifying the
   dataset explicitly using the `--dataset` argument (or `dataset` argument if using the
   `Benchmarker` API). This was contributed by
-  [@viggo-gascou](https://github.com/viggo-gascou)
+  @viggo-gascou ✨
 - Added the new Faroese reading comprehension dataset FoQA. This is now the default
   Faroese reading comprehension benchmark, as there was none previously.
 - Now supports evaluation of models with adapters. This requires that the model
@@ -1236,7 +1312,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Fixed
 - Move tensor to the correct device when benchmarking seq-to-seq models (#363). Thanks
-  to [@ThomasKluiters](https://github.com/ThomasKluiters) for this contribution! :tada:
+  to @ThomasKluiters for this contribution! :tada:
 - Deals with the case where an instruction tuned model does not use any special token
   at the end of the chat, such as `<|im_end|>`. This holds for, e.g., Qwen models.
 - Better auto-detection of pipeline tag for models on the Hugging Face Hub, in case the
@@ -1250,7 +1326,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   `AZURE_OPENAI_ENDPOINT` and `AZURE_OPENAI_API_VERSION` need to have been set, or
   alternatively through the `--azure-openai-api-key`, `--azure-openai-endpoint` and
   `--azure-openai-api-version` arguments. Thanks to
-  [@BramVanroy](https://github.com/BramVanroy) for all the help regarding the
+  @BramVanroy for all the help regarding the
   implementation of this :tada:
 - We now use the new JSON mode for newer OpenAI models for the NER task, to ensure
   better JSON generation.
@@ -1761,7 +1837,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - A `--use-flash-attention` flag has been added, which enables Flash Attention 2.0,
   which is required by some models, such as Mistral-based ones. If `flash-attn` has not
   been installed then an informative error message will be raised. Thanks to
-  [@peter-sk](https://github.com/peter-sk) for this contribution! :tada:
+  @peter-sk for this contribution! :tada:
 
 ### Changed
 - Now uses 8-bit AdamW whenever CUDA is available, as opposed to regular AdamW.
@@ -1781,7 +1857,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
   OpenAI models. This currently happens automatically when specifying a generative
   model from the Hugging Face Hub, and with all OpenAI models.
 - Now stores model caches in separate directories, enabling parallel evaluations.
-  Thanks to [@KennethEnevoldsen](https://github.com/KennethEnevoldsen) for this
+  Thanks to @KennethEnevoldsen for this
   contribution! :tada:
 - Added `--device` argument to the CLI, which can be used to overwrite the automatic
   detection of device (CPU, CUDA GPU, MPS GPU, TPU) to use.
@@ -1850,7 +1926,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Now added support for benchmarking local models in the Hugging Face format (i.e.,
   saved with the `save_pretrained` method). This automatically detects the framework
   based on the file extension, but can also be set using the new `--model-framework`
-  argument. Thanks to [@peter-sk](https://github.com/peter-sk) for implementing this!
+  argument. Thanks to @peter-sk for implementing this!
   :tada:
 
 ### Fixed
@@ -2149,7 +2225,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Specific branches/commits/tags can now be benchmarked, using the `@`
   delimiter. For instance, `scandeval -m model_id@commit_hash` will benchmark
   the model with model ID `model_id`, stored at commit with hash `commit_hash`.
-  Thanks to [@versae](https://github.com/versae) for contributing! :tada:
+  Thanks to @versae for contributing! :tada:
 
 
 ## [v2.2.0] - 2022-01-18
@@ -2159,8 +2235,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 ## [v2.1.0] - 2022-01-17
 ### Added
-- Added support for `flax` models. Thanks to
-  [@versae](https://github.com/versae) for contributing! :tada:
+- Added support for `flax` models. Thanks to @versae for contributing! :tada:
 
 
 ## [v2.0.0] - 2022-01-07
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.16.0
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -28,18 +28,19 @@ License: MIT License
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
 License-File: LICENSE
-Requires-Python: <4.0,>=3.10
+Requires-Python: <4.0,>=3.11
 Requires-Dist: accelerate>=1.9.0
 Requires-Dist: bert-score>=0.3.13
 Requires-Dist: click>=8.1.3
+Requires-Dist: cloudpickle>=3.1.1
 Requires-Dist: datasets>=3.5.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.72.2
+Requires-Dist: litellm>=1.75.6
 Requires-Dist: more-itertools>=10.5.0
-Requires-Dist: numpy<2.0.0,>=1.23.0
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: ollama>=0.5.1
 Requires-Dist: pandas>=2.2.0
 Requires-Dist: peft>=0.15.0
@@ -49,27 +50,24 @@ Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
 Requires-Dist: sacremoses>=0.1.1
-Requires-Dist: scikit-learn<1.6.0
+Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
 Requires-Dist: seqeval>=1.2.2
 Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.55.0
+Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist: gradio>=4.26.0; extra == 'all'
-Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
-Provides-Extra: human-evaluation
-Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
-Provides-Extra: test
-Requires-Dist: gradio>=4.26.0; extra == 'test'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
 <div align='center'>
@@ -223,17 +221,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>
 
 
 ### Contribute to EuroEval
@@ -149,17 +149,18 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
 <a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
 <a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
 <a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
 <a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
 <a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
 <a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
 <a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
 <a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
 <a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
+<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>
 
 
 ### Contribute to EuroEval