EuroEval 15.7.0__tar.gz → 15.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

Files changed (241)
  1. {euroeval-15.7.0 → euroeval-15.7.2}/.pre-commit-config.yaml +2 -2
  2. {euroeval-15.7.0 → euroeval-15.7.2}/CHANGELOG.md +28 -0
  3. {euroeval-15.7.0 → euroeval-15.7.2}/PKG-INFO +1 -1
  4. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/dutch.md +1 -62
  5. {euroeval-15.7.0 → euroeval-15.7.2}/pyproject.toml +1 -1
  6. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py +1 -1
  7. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py +27 -258
  8. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/vllm.py +14 -304
  9. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmarker.py +14 -11
  10. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/data_models.py +3 -1
  11. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/__init__.py +1 -0
  12. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/dutch.py +5 -16
  13. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/finnish.py +11 -9
  14. euroeval-15.7.2/src/euroeval/generation_utils.py +346 -0
  15. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/languages.py +1 -1
  16. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/scores.py +7 -1
  17. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/sequence_classification.py +46 -11
  18. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/tokenization_utils.py +50 -14
  19. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_dbrd.py +22 -22
  20. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_hellaswag_fi.py +18 -4
  21. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_scala.py +0 -6
  22. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_scandisent_fi.py +11 -1
  23. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_turku_ner_fi.py +10 -0
  24. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_tydiqa_fi.py +10 -0
  25. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_xlsum_fi.py +11 -1
  26. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_data_loading.py +33 -20
  27. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_scores.py +1 -0
  28. {euroeval-15.7.0 → euroeval-15.7.2}/uv.lock +2724 -2724
  29. euroeval-15.7.0/src/scripts/create_dutch_social.py +0 -114
  30. {euroeval-15.7.0 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  31. {euroeval-15.7.0 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  32. {euroeval-15.7.0 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  33. {euroeval-15.7.0 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  34. {euroeval-15.7.0 → euroeval-15.7.2}/.github/workflows/ci.yaml +0 -0
  35. {euroeval-15.7.0 → euroeval-15.7.2}/.gitignore +0 -0
  36. {euroeval-15.7.0 → euroeval-15.7.2}/CITATION.cff +0 -0
  37. {euroeval-15.7.0 → euroeval-15.7.2}/CODE_OF_CONDUCT.md +0 -0
  38. {euroeval-15.7.0 → euroeval-15.7.2}/CONTRIBUTING.md +0 -0
  39. {euroeval-15.7.0 → euroeval-15.7.2}/Dockerfile.cuda +0 -0
  40. {euroeval-15.7.0 → euroeval-15.7.2}/LICENSE +0 -0
  41. {euroeval-15.7.0 → euroeval-15.7.2}/NEW_DATASET_GUIDE.md +0 -0
  42. {euroeval-15.7.0 → euroeval-15.7.2}/README.md +0 -0
  43. {euroeval-15.7.0 → euroeval-15.7.2}/docs/CNAME +0 -0
  44. {euroeval-15.7.0 → euroeval-15.7.2}/docs/README.md +0 -0
  45. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/README.md +0 -0
  46. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/danish.md +0 -0
  47. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/english.md +0 -0
  48. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/faroese.md +0 -0
  49. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/finnish.md +0 -0
  50. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/french.md +0 -0
  51. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/german.md +0 -0
  52. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/icelandic.md +0 -0
  53. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/italian.md +0 -0
  54. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/norwegian.md +0 -0
  55. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/spanish.md +0 -0
  56. {euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/swedish.md +0 -0
  57. {euroeval-15.7.0 → euroeval-15.7.2}/docs/extras/radial_plotter.md +0 -0
  58. {euroeval-15.7.0 → euroeval-15.7.2}/docs/faq.md +0 -0
  59. {euroeval-15.7.0 → euroeval-15.7.2}/docs/gfx/favicon.png +0 -0
  60. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  61. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  62. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/english.md +0 -0
  63. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  64. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/french.md +0 -0
  65. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/german.md +0 -0
  66. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  67. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  68. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  69. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  70. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  71. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Multilingual/european.md +0 -0
  72. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  73. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  74. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  75. {euroeval-15.7.0 → euroeval-15.7.2}/docs/leaderboards/README.md +0 -0
  76. {euroeval-15.7.0 → euroeval-15.7.2}/docs/methodology.md +0 -0
  77. {euroeval-15.7.0 → euroeval-15.7.2}/docs/python-package.md +0 -0
  78. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/README.md +0 -0
  79. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/common-sense-reasoning.md +0 -0
  80. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/knowledge.md +0 -0
  81. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/linguistic-acceptability.md +0 -0
  82. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/named-entity-recognition.md +0 -0
  83. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/reading-comprehension.md +0 -0
  84. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/sentiment-classification.md +0 -0
  85. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/speed.md +0 -0
  86. {euroeval-15.7.0 → euroeval-15.7.2}/docs/tasks/summarization.md +0 -0
  87. {euroeval-15.7.0 → euroeval-15.7.2}/gfx/euroeval.png +0 -0
  88. {euroeval-15.7.0 → euroeval-15.7.2}/gfx/euroeval.xcf +0 -0
  89. {euroeval-15.7.0 → euroeval-15.7.2}/gfx/scandeval.png +0 -0
  90. {euroeval-15.7.0 → euroeval-15.7.2}/makefile +0 -0
  91. {euroeval-15.7.0 → euroeval-15.7.2}/mkdocs.yaml +0 -0
  92. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/__init__.py +0 -0
  93. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  94. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/base.py +0 -0
  95. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  96. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  97. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/callbacks.py +0 -0
  98. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/cli.py +0 -0
  99. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/constants.py +0 -0
  100. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/data_loading.py +0 -0
  101. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/danish.py +0 -0
  102. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/english.py +0 -0
  103. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  104. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/french.py +0 -0
  105. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/german.py +0 -0
  106. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  107. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/italian.py +0 -0
  108. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  109. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  110. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  111. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/enums.py +0 -0
  112. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/exceptions.py +0 -0
  113. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/finetuning.py +0 -0
  114. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/generation.py +0 -0
  115. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/human_evaluation.py +0 -0
  116. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/model_cache.py +0 -0
  117. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/model_config.py +0 -0
  118. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/model_loading.py +0 -0
  119. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  120. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  121. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  122. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  123. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  124. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  125. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  126. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/speed_benchmark.py +0 -0
  127. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  128. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  129. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  130. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  131. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  132. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/tasks.py +0 -0
  133. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/types.py +0 -0
  134. {euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/utils.py +0 -0
  135. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/constants.py +0 -0
  136. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_allocine.py +0 -0
  137. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_angry_tweets.py +0 -0
  138. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_arc.py +0 -0
  139. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_arc_is.py +0 -0
  140. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_belebele.py +0 -0
  141. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_cnn_dailymail.py +0 -0
  142. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_conll_en.py +0 -0
  143. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_conll_es.py +0 -0
  144. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_conll_nl.py +0 -0
  145. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_dane.py +0 -0
  146. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  147. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_dansk.py +0 -0
  148. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_danske_talemaader.py +0 -0
  149. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  150. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_dutch_cola.py +0 -0
  151. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_eltec.py +0 -0
  152. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_fone.py +0 -0
  153. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_foqa.py +0 -0
  154. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_fosent.py +0 -0
  155. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_fquad.py +0 -0
  156. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_germanquad.py +0 -0
  157. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_germeval.py +0 -0
  158. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_hellaswag.py +0 -0
  159. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  160. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_ice_linguistic.py +0 -0
  161. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  162. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  163. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_icelandic_qa.py +0 -0
  164. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_icesum.py +0 -0
  165. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_ilpost_sum.py +0 -0
  166. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_jentoft.py +0 -0
  167. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_mim_gold_ner.py +0 -0
  168. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_mlqa_es.py +0 -0
  169. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_mlsum_de.py +0 -0
  170. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_mlsum_es.py +0 -0
  171. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_mmlu.py +0 -0
  172. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_multinerd-it.py +0 -0
  173. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_no_cola.py +0 -0
  174. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_no_sammendrag.py +0 -0
  175. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  176. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_nordjylland_news.py +0 -0
  177. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_norec.py +0 -0
  178. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_norglm_multiqa.py +0 -0
  179. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_norglm_multisum.py +0 -0
  180. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_norne.py +0 -0
  181. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_norquad.py +0 -0
  182. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_nqii.py +0 -0
  183. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  184. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_orange_sum.py +0 -0
  185. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_personal_sum.py +0 -0
  186. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_rrn.py +0 -0
  187. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_sb10k.py +0 -0
  188. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_scandiqa.py +0 -0
  189. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_schibsted.py +0 -0
  190. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  191. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_sentipolc16.py +0 -0
  192. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_squad.py +0 -0
  193. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_squad_it.py +0 -0
  194. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_squad_nl.py +0 -0
  195. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_squad_nl_old.py +0 -0
  196. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_sst5.py +0 -0
  197. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_suc3.py +0 -0
  198. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_swedn.py +0 -0
  199. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_swerec.py +0 -0
  200. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  201. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_wikiann_fo.py +0 -0
  202. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_wikineural-it.py +0 -0
  203. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_winogrande_is.py +0 -0
  204. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/create_xquad_es.py +0 -0
  205. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/fix_dot_env_file.py +0 -0
  206. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/load_ud_pos.py +0 -0
  207. {euroeval-15.7.0 → euroeval-15.7.2}/src/scripts/versioning.py +0 -0
  208. {euroeval-15.7.0 → euroeval-15.7.2}/tests/__init__.py +0 -0
  209. {euroeval-15.7.0 → euroeval-15.7.2}/tests/conftest.py +0 -0
  210. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_config_factory.py +0 -0
  211. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/__init__.py +0 -0
  212. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/test_base.py +0 -0
  213. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  214. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  215. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  216. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  217. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_benchmarker.py +0 -0
  218. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_callbacks.py +0 -0
  219. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_cli.py +0 -0
  220. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_constants.py +0 -0
  221. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_data_models.py +0 -0
  222. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_dataset_configs.py +0 -0
  223. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_enums.py +0 -0
  224. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_exceptions.py +0 -0
  225. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_finetuning.py +0 -0
  226. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_generation.py +0 -0
  227. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_human_evaluation.py +0 -0
  228. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_languages.py +0 -0
  229. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_model_cache.py +0 -0
  230. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_model_config.py +0 -0
  231. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_model_loading.py +0 -0
  232. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_speed_benchmark.py +0 -0
  233. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_task_utils/__init__.py +0 -0
  234. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_task_utils/test_question_answering.py +0 -0
  235. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  236. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  237. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_task_utils/test_token_classification.py +0 -0
  238. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_tasks.py +0 -0
  239. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_tokenization_utils.py +0 -0
  240. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_types.py +0 -0
  241. {euroeval-15.7.0 → euroeval-15.7.2}/tests/test_utils.py +0 -0

{euroeval-15.7.0 → euroeval-15.7.2}/.pre-commit-config.yaml
@@ -8,9 +8,9 @@ repos:
      hooks:
        - id: end-of-file-fixer
        - id: trailing-whitespace
-       # - id: debug-statements
+       - id: debug-statements
    - repo: https://github.com/astral-sh/ruff-pre-commit
-     rev: v0.11.7
+     rev: v0.11.8
      hooks:
        - id: ruff
          args:
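
The first change above enables the `debug-statements` hook from `pre-commit-hooks`, which rejects commits that still contain debugger entry points. A small illustrative snippet of the kind of leftover it now catches (hypothetical code, not from EuroEval):

```python
# Hypothetical module: with `debug-statements` enabled, committing a file that
# contains lines like these is rejected by pre-commit before it reaches CI.
def buggy(x: int) -> int:
    breakpoint()  # leftover debugger call, flagged by the hook
    return x * 2
```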

{euroeval-15.7.0 → euroeval-15.7.2}/CHANGELOG.md
@@ -10,6 +10,34 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+ ## [v15.7.2] - 2025-05-02
+ ### Fixed
+ - Now does not check if a model exists if it has already been evaluated. This is an
+   issue when evaluating Ollama models, if the Ollama server is not running.
+ - When evaluating instruction-tuned models on text classification tasks, the chat
+   template sometimes ends with special symbols, such as a newline, which can change the
+   tokenisation of the generated label. When we are evaluating the model using logprobs
+   we are thus looking for the wrong label in these cases. We now take this into account,
+   and log it to the user if the labels are not found, to avoid confusion.
+ - Finnish datasets were not included in the default "all" dataset list, which is the
+   default used when no datasets are specified. This has been fixed now.
+ - Temporarily disabled HellaSwag-fi, as there is an issue with the labels in the test
+   split, causing errors during evaluation. We will re-enable in a future release, when
+   this has been fixed.
+
+
+ ## [v15.7.1] - 2025-04-29
+ ### Changed
+ - Marked the DBRD Dutch sentiment classification as official, as the quality is
+   substantially better than the previous Dutch Social.
+
+ ### Fixed
+ - Fixed an issue with NER evaluation of instruction-tuned models, which was caused by
+   the "O" label mistakenly being included in the prompt template, causing an error
+   during evaluation. No evaluations were affected by this, only that some evaluations
+   could not be run.
+
+
  ## [v15.7.0] - 2025-04-28
  ### Added
  - Added support for Finnish 🇫🇮! This includes the Finnish part of the reading
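
To illustrate the second fix above (an editorial sketch, not EuroEval's implementation): BPE-style tokenizers generally encode a label differently depending on whether the preceding text ends in a space or a newline, so a chat template that appends a trailing newline changes which token ids a logprob-based label check should look for. A minimal demonstration, assuming the `transformers` package is installed and using `gpt2` purely as a stand-in tokenizer:

```python
from transformers import AutoTokenizer  # assumption: transformers is installed

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any BPE tokenizer illustrates the point
label = "positief"

# If the prompt ends with "Sentiment:", the model tends to generate " positief";
# if the chat template appends a newline, it generates "positief" (or "\npositief").
# The token sequences, and hence the first token to score via logprobs, differ.
print(tokenizer.tokenize(" " + label))
print(tokenizer.tokenize(label))
print(tokenizer.tokenize("\n" + label))
```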

{euroeval-15.7.0 → euroeval-15.7.2}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.7.0
+ Version: 15.7.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues

{euroeval-15.7.0 → euroeval-15.7.2}/docs/datasets/dutch.md
@@ -7,68 +7,7 @@ information about what these constitute.
 
  ## Sentiment Classification
 
- ### Dutch Social
-
- This dataset consists of Dutch tweets annotated with sentiment labels. It is not sure
- how the sentiment labels were assigned, this information is pending from the authors.
-
- The original full dataset consists of 162,805 / 54,269 / 54,268 samples for training,
- validation and testing, respectively (so 271,342 samples used in total). We use a 1,024
- / 256 / 1,024 split for training, validation and testing, respectively. All the new
- splits are subsets of the original splits.
-
- Here are a few examples from the training split:
-
- ```json
- {
-   "text": 'Novak Djokovic positief getest op coronavirus na eigen tennistoernooi\n\nhttps://t.co/U7VOcjANh9',
-   "label": 'positive'
- }
- ```
- ```json
- {
-   "text": "via @NYTimes https://t.co/IjbCWIwYvR",
-   "label": "neutral"
- }
- ```
- ```json
- {
-   "text": "@backinflow 30 min Corona tijd....",
-   "label": "negative"
- }
- ```
-
- When evaluating generative models, we use the following setup (see the
- [methodology](/methodology) for more information on how these are used):
-
- - Number of few-shot examples: 12
- - Prefix prompt:
-   ```
-   Hieronder staan tweets en hun sentiment, dat 'positief', 'neutraal' of 'negatief' kan zijn.
-   ```
- - Base prompt template:
-   ```
-   Tweet: {text}
-   Sentiment: {label}
-   ```
- - Instruction-tuned prompt template:
-   ```
-   Tweet: {text}
-
-   Classificeer het sentiment in de tweet. Antwoord met 'positief', 'neutraal' of 'negatief'.
-   ```
- - Label mapping:
-   - `positive` ➡️ `positief`
-   - `neutral` ➡️ `neutraal`
-   - `negative` ➡️ `negatief`
-
- You can evaluate this dataset directly as follows:
-
- ```bash
- $ euroeval --model <model-id> --dataset dutch-social
- ```
-
- ### Unofficial: DBRD
+ ### DBRD
 
  This dataset was published in [this paper](https://doi.org/10.48550/arXiv.1910.00896)
  and features Dutch book reviews from [Hebban.nl](https://www.hebban.nl), annotated with

{euroeval-15.7.0 → euroeval-15.7.2}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "15.7.0"
+ version = "15.7.2"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [

{euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py
@@ -238,7 +238,7 @@ def prepare_languages(
              The default language codes of the languages to include.
 
      Returns:
-         The prepared model or dataset languages.
+         The prepared dataset languages.
      """
      # Create a dictionary that maps languages to their associated language objects
      language_mapping = get_all_languages()

{euroeval-15.7.0 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py
@@ -1,11 +1,8 @@
  """Generative models from an inference API, using the LiteLLM framework."""
 
  import collections.abc as c
- import itertools as it
- import json
  import logging
  import os
- import random
  import re
  import typing as t
  from functools import cached_property, partial
@@ -60,6 +57,7 @@ from ..exceptions import (
      NeedsEnvironmentVariable,
      NeedsExtraInstalled,
  )
+ from ..generation_utils import apply_prompt, extract_few_shot_examples
  from ..task_group_utils import (
      question_answering,
      sequence_classification,
@@ -943,14 +941,22 @@ class LiteLLMModel(BenchmarkModule):
          )
 
          if self.benchmark_config.few_shot:
-             few_shot_examples = self._extract_few_shot_examples(
-                 dataset=dataset, task=task, itr_idx=itr_idx
+             few_shot_examples = extract_few_shot_examples(
+                 dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
              )
          else:
              few_shot_examples = list()
 
          dataset["test"] = dataset["test"].map(
-             partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+             partial(
+                 apply_prompt,
+                 few_shot_examples=few_shot_examples,
+                 model_config=self.model_config,
+                 dataset_config=self.dataset_config,
+                 instruction_model=True,
+                 always_populate_text_field=False,
+                 tokenizer=None,
+             ),
              batched=True,
              load_from_cache_file=False,
              keep_in_memory=True,
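
The hunk above swaps a bound method for the shared `apply_prompt` helper, bound to its configuration with `functools.partial` and applied to the test split in batched mode. A minimal, self-contained sketch of that pattern (the helper and column names here are illustrative, not EuroEval's):

```python
from functools import partial

from datasets import Dataset


def build_messages(examples: dict, prefix: str) -> dict:
    # In batched mode, `examples` maps each column name to a list of values.
    examples["messages"] = [
        [{"role": "user", "content": f"{prefix}{text}"}] for text in examples["text"]
    ]
    return examples


dataset = Dataset.from_dict({"text": ["Geweldige film!", "Saai en voorspelbaar."]})
dataset = dataset.map(partial(build_messages, prefix="Review: "), batched=True)
print(dataset[0]["messages"])
```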
@@ -958,253 +964,6 @@ class LiteLLMModel(BenchmarkModule):
 
          return dataset
 
-     def _extract_few_shot_examples(
-         self, dataset: DatasetDict, task: Task, itr_idx: int
-     ) -> list[dict[str, t.Any]]:
-         """Extract few-shot examples from a dataset.
-
-         This will always extract the examples from the training split.
-
-         We ensure that the few-shot examples are unique by picking them one at a time.
-
-         Args:
-             dataset:
-                 The dataset to extract the few-shot examples from.
-             task:
-                 The task that is being benchmarked.
-             itr_idx:
-                 The index of the dataset in the iterator.
-
-         Returns:
-             The few-shot examples.
-         """
-         random_seed = 4242 + itr_idx
-         num_few_shots = self.dataset_config.num_few_shot_examples
-         few_shot_examples: list[dict[str, t.Any]] = list()
-         shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-         match task.task_group:
-             case (
-                 TaskGroup.SEQUENCE_CLASSIFICATION
-                 | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-             ):
-                 labels = it.cycle(self.dataset_config.labels)
-                 while (
-                     len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                 ):
-                     label = next(labels)
-                     possible_examples = shuffled_train.filter(
-                         lambda x: x["label"].lower() == label.lower()
-                     )
-                     if len(possible_examples) == 0:
-                         continue
-                     example = possible_examples.select(range(1))[0]
-                     few_shot_examples.append(example)
-                     shuffled_train = shuffled_train.filter(
-                         lambda x: x["text"] != example["text"]
-                     )
-
-             case TaskGroup.TEXT_TO_TEXT:
-                 while (
-                     len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                 ):
-                     example = shuffled_train.select(range(1))[0]
-                     few_shot_examples.append(example)
-                     shuffled_train = shuffled_train.filter(
-                         lambda x: x["text"] != example["text"]
-                     )
-
-             case TaskGroup.TOKEN_CLASSIFICATION:
-                 labels = it.cycle(
-                     [
-                         label.lower()
-                         for label in self.dataset_config.labels
-                         if label.lower().startswith("b-")
-                     ]
-                 )
-                 while (
-                     len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                 ):
-                     label = next(labels)
-                     possible_examples = shuffled_train.filter(
-                         lambda x: label in [tag.lower() for tag in x["labels"]]
-                     )
-                     if len(possible_examples) == 0:
-                         continue
-                     example = possible_examples.select(range(1))[0]
-                     few_shot_examples.append(example)
-                     shuffled_train = shuffled_train.filter(
-                         lambda x: x["tokens"] != example["tokens"]
-                     )
-
-             case TaskGroup.QUESTION_ANSWERING:
-                 # Locate the maximum number of tokens that constitutes a short example
-                 for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                     train_with_short_examples = dataset["train"].filter(
-                         lambda example: len(example["context"]) < max_num_tokens
-                     )
-                     num_short_examples = len(train_with_short_examples)
-                     if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                         break
-                 else:
-                     raise InvalidBenchmark(
-                         "Could not find enough short examples for few-shot learning."
-                     )
-
-                 shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                 while (
-                     len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                 ):
-                     example = shuffled_train.select(range(1))[0]
-                     few_shot_examples.append(example)
-                     shuffled_train = shuffled_train.filter(
-                         lambda x: x["context"] != example["context"]
-                     )
-
-             case _:
-                 raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-         random.seed(random_seed)
-         random.shuffle(few_shot_examples)
-         return few_shot_examples
-
-     def _apply_prompt(
-         self,
-         examples: dict[str, t.Any],
-         few_shot_examples: list[dict[str, t.Any]],
-         task: Task,
-     ) -> dict[str, t.Any]:
-         """Apply prompt template to an example, potentially with few-shot examples.
-
-         Args:
-             examples:
-                 The examples to apply the few-shot examples to.
-             few_shot_examples:
-                 The few-shot examples to apply.
-             task:
-                 The task that is being benchmarked.
-
-         Returns:
-             The example with the few-shot examples applied.
-         """
-
-         def create_prompt(**kwargs: str) -> tuple[str, str]:
-             """Create a prompt from the given keyword arguments.
-
-             Args:
-                 kwargs:
-                     The keyword arguments to use in the prompt.
-
-             Returns:
-                 A pair (prompt, label), where "label" is an empty string if the model is
-                 not instruction tuned (as in this case it is included in the prompt).
-             """
-             label_key = "label" if "label" in kwargs else "target_text"
-             label = kwargs.pop(label_key)
-             label_mapping = self.dataset_config.prompt_label_mapping
-             label = label_mapping.get(label, label)
-             prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-             return prompt, label
-
-         match task.task_group:
-             case (
-                 TaskGroup.SEQUENCE_CLASSIFICATION
-                 | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-             ):
-                 few_shot_sections = [
-                     create_prompt(
-                         text=example["text"].replace("\n", " ").strip(),
-                         label=example["label"].replace("\n", " ").strip(),
-                     )
-                     for example in few_shot_examples
-                 ]
-                 new_sections = [
-                     create_prompt(text=text.replace("\n", " ").strip(), label="")
-                     for text in examples["text"]
-                 ]
-
-             case TaskGroup.TEXT_TO_TEXT:
-                 few_shot_sections = [
-                     create_prompt(
-                         text=example["text"].replace("\n", " ").strip(),
-                         target_text=example["target_text"].replace("\n", " ").strip(),
-                     )
-                     for example in few_shot_examples
-                 ]
-                 new_sections = [
-                     create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                     for text in examples["text"]
-                 ]
-
-             case TaskGroup.TOKEN_CLASSIFICATION:
-
-                 def create_label(example: dict) -> str:
-                     prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                     labels: dict[str, list[str]] = {
-                         prompt_label: list() for prompt_label in prompt_labels
-                     }
-                     for token, label in zip(example["tokens"], example["labels"]):
-                         label = label.lower()
-                         if label == "o":
-                             continue
-                         prompt_label = self.dataset_config.prompt_label_mapping[label]
-                         if label.startswith("b-"):
-                             labels[prompt_label].append(token)
-                         elif label.startswith("i-"):
-                             labels[prompt_label][-1] += " " + token
-                     return json.dumps(labels, ensure_ascii=False)
-
-                 few_shot_sections = [
-                     create_prompt(
-                         text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                         label=create_label(example=example),
-                     )
-                     for example in few_shot_examples
-                 ]
-                 new_sections = [
-                     create_prompt(
-                         text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                     )
-                     for tokens in examples["tokens"]
-                 ]
-
-             case TaskGroup.QUESTION_ANSWERING:
-                 few_shot_sections = [
-                     create_prompt(
-                         text=example["context"].replace("\n", " ").strip(),
-                         question=example["question"].replace("\n", " ").strip(),
-                         label=example["answers"]["text"][0].replace("\n", " "),
-                     )
-                     for example in few_shot_examples
-                 ]
-                 new_sections = [
-                     create_prompt(
-                         text=context.replace("\n", " ").strip(),
-                         question=question.replace("\n", " ").strip(),
-                         label="",
-                     )
-                     for context, question in zip(
-                         examples["context"], examples["question"]
-                     )
-                 ]
-
-             case _:
-                 raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-         few_shot_messages = [
-             dict(role=role, content=content)
-             for prompt, label in few_shot_sections
-             for role, content in [("user", prompt), ("assistant", label)]
-         ]
-
-         messages_list = [
-             few_shot_messages + [dict(role="user", content=prompt)]
-             for prompt, _ in new_sections
-         ]
-
-         examples["messages"] = messages_list
-         return examples
-
 
  def raise_if_wrong_params(
      model_config: ModelConfig, allowed_params: dict[str, list[str]]
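
The two methods removed above presumably now live, in generalised form, in the new `src/euroeval/generation_utils.py` module (file 14 in the list), given the `extract_few_shot_examples` and `apply_prompt` imports added earlier. For reference, a condensed, dependency-free sketch of the label-balancing idea behind the removed few-shot sampler (names and details are illustrative, not EuroEval's code):

```python
import itertools as it
import random


def sample_balanced_few_shots(
    examples: list[dict], labels: list[str], num_shots: int, seed: int = 4242
) -> list[dict]:
    """Round-robin over the labels, picking one unused example per label."""
    rng = random.Random(seed)
    pool = list(examples)
    rng.shuffle(pool)
    chosen: list[dict] = []
    label_cycle = it.cycle(labels)
    # Bounded loop so that a label with no remaining examples cannot stall it.
    for _ in range(2 * num_shots * max(len(labels), 1)):
        if len(chosen) >= num_shots or not pool:
            break
        label = next(label_cycle)
        match = next((ex for ex in pool if ex["label"].lower() == label.lower()), None)
        if match is None:
            continue
        chosen.append(match)
        pool.remove(match)
    rng.shuffle(chosen)
    return chosen


examples = [
    {"text": "Geweldige film!", "label": "positive"},
    {"text": "Saai en voorspelbaar.", "label": "negative"},
    {"text": "Prima, niets bijzonders.", "label": "neutral"},
    {"text": "Echt een aanrader.", "label": "positive"},
]
print(sample_balanced_few_shots(examples, ["positive", "neutral", "negative"], num_shots=3))
```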
@@ -1248,6 +1007,10 @@ def try_download_ollama_model(model_id: str) -> bool:
 
      Returns:
          Whether the model was downloaded successfully.
+
+     Raises:
+         InvalidModel:
+             If Ollama is not running or the model cannot be downloaded.
      """
      if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
          return False
@@ -1262,11 +1025,17 @@ def try_download_ollama_model(model_id: str) -> bool:
          level=logging.WARNING,
      )
 
-     downloaded_ollama_models: list[str] = [
-         model_obj.model
-         for model_obj in ollama.list().models
-         if model_obj.model is not None
-     ]
+     try:
+         downloaded_ollama_models: list[str] = [
+             model_obj.model
+             for model_obj in ollama.list().models
+             if model_obj.model is not None
+         ]
+     except ConnectionError:
+         raise InvalidModel(
+             "Ollama does not seem to be running, so we cannot evaluate the model "
+             f"{model_id!r}. Please make sure that Ollama is running and try again."
+         )
 
      ollama_model_id = "/".join(model_id.split("/")[1:])
      if ollama_model_id not in downloaded_ollama_models:
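
The last hunk above wraps the `ollama.list()` call so that an unreachable Ollama server raises `InvalidModel` instead of an unhandled `ConnectionError`. A hedged sketch of the same pre-flight check a caller could run before starting an evaluation (assuming the `ollama` Python client is installed; the helper name is illustrative):

```python
import ollama  # assumption: the `ollama` Python client is installed


def ollama_is_running() -> bool:
    """Return True if a local Ollama server answers an API call."""
    try:
        ollama.list()
    except ConnectionError:
        return False
    return True


if not ollama_is_running():
    print("Start the Ollama server before evaluating ollama/ or ollama_chat/ models.")
```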