EuroEval 15.8.0__tar.gz → 15.8.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (240)
  1. {euroeval-15.8.0 → euroeval-15.8.2}/.github/workflows/ci.yaml +4 -0
  2. {euroeval-15.8.0 → euroeval-15.8.2}/.pre-commit-config.yaml +1 -1
  3. {euroeval-15.8.0 → euroeval-15.8.2}/CHANGELOG.md +17 -0
  4. {euroeval-15.8.0 → euroeval-15.8.2}/PKG-INFO +3 -3
  5. {euroeval-15.8.0 → euroeval-15.8.2}/pyproject.toml +3 -3
  6. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/litellm.py +7 -2
  7. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/data_models.py +9 -5
  8. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/model_cache.py +9 -0
  9. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/sequence_classification.py +21 -32
  10. {euroeval-15.8.0 → euroeval-15.8.2}/uv.lock +3 -3
  11. {euroeval-15.8.0 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  12. {euroeval-15.8.0 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  13. {euroeval-15.8.0 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  14. {euroeval-15.8.0 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  15. {euroeval-15.8.0 → euroeval-15.8.2}/.gitignore +0 -0
  16. {euroeval-15.8.0 → euroeval-15.8.2}/CITATION.cff +0 -0
  17. {euroeval-15.8.0 → euroeval-15.8.2}/CODE_OF_CONDUCT.md +0 -0
  18. {euroeval-15.8.0 → euroeval-15.8.2}/CONTRIBUTING.md +0 -0
  19. {euroeval-15.8.0 → euroeval-15.8.2}/Dockerfile.cuda +0 -0
  20. {euroeval-15.8.0 → euroeval-15.8.2}/LICENSE +0 -0
  21. {euroeval-15.8.0 → euroeval-15.8.2}/NEW_DATASET_GUIDE.md +0 -0
  22. {euroeval-15.8.0 → euroeval-15.8.2}/README.md +0 -0
  23. {euroeval-15.8.0 → euroeval-15.8.2}/docs/CNAME +0 -0
  24. {euroeval-15.8.0 → euroeval-15.8.2}/docs/README.md +0 -0
  25. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/README.md +0 -0
  26. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/danish.md +0 -0
  27. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/dutch.md +0 -0
  28. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/english.md +0 -0
  29. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/faroese.md +0 -0
  30. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/finnish.md +0 -0
  31. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/french.md +0 -0
  32. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/german.md +0 -0
  33. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/icelandic.md +0 -0
  34. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/italian.md +0 -0
  35. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/norwegian.md +0 -0
  36. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/spanish.md +0 -0
  37. {euroeval-15.8.0 → euroeval-15.8.2}/docs/datasets/swedish.md +0 -0
  38. {euroeval-15.8.0 → euroeval-15.8.2}/docs/extras/radial_plotter.md +0 -0
  39. {euroeval-15.8.0 → euroeval-15.8.2}/docs/faq.md +0 -0
  40. {euroeval-15.8.0 → euroeval-15.8.2}/docs/gfx/favicon.png +0 -0
  41. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  42. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  43. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/english.md +0 -0
  44. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  45. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/french.md +0 -0
  46. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/german.md +0 -0
  47. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  48. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  49. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  50. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  51. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  52. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Multilingual/european.md +0 -0
  53. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  54. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  55. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  56. {euroeval-15.8.0 → euroeval-15.8.2}/docs/leaderboards/README.md +0 -0
  57. {euroeval-15.8.0 → euroeval-15.8.2}/docs/methodology.md +0 -0
  58. {euroeval-15.8.0 → euroeval-15.8.2}/docs/python-package.md +0 -0
  59. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/README.md +0 -0
  60. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/common-sense-reasoning.md +0 -0
  61. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/knowledge.md +0 -0
  62. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/linguistic-acceptability.md +0 -0
  63. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/named-entity-recognition.md +0 -0
  64. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/reading-comprehension.md +0 -0
  65. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/sentiment-classification.md +0 -0
  66. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/speed.md +0 -0
  67. {euroeval-15.8.0 → euroeval-15.8.2}/docs/tasks/summarization.md +0 -0
  68. {euroeval-15.8.0 → euroeval-15.8.2}/gfx/euroeval.png +0 -0
  69. {euroeval-15.8.0 → euroeval-15.8.2}/gfx/euroeval.xcf +0 -0
  70. {euroeval-15.8.0 → euroeval-15.8.2}/gfx/scandeval.png +0 -0
  71. {euroeval-15.8.0 → euroeval-15.8.2}/makefile +0 -0
  72. {euroeval-15.8.0 → euroeval-15.8.2}/mkdocs.yaml +0 -0
  73. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/__init__.py +0 -0
  74. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_config_factory.py +0 -0
  75. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  76. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/base.py +0 -0
  77. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  78. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  79. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmark_modules/vllm.py +0 -0
  80. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/benchmarker.py +0 -0
  81. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/callbacks.py +0 -0
  82. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/cli.py +0 -0
  83. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/constants.py +0 -0
  84. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/data_loading.py +0 -0
  85. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/__init__.py +0 -0
  86. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/danish.py +0 -0
  87. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/dutch.py +0 -0
  88. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/english.py +0 -0
  89. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  90. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/finnish.py +0 -0
  91. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/french.py +0 -0
  92. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/german.py +0 -0
  93. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  94. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/italian.py +0 -0
  95. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  96. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  97. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  98. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/enums.py +0 -0
  99. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/exceptions.py +0 -0
  100. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/finetuning.py +0 -0
  101. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/generation.py +0 -0
  102. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/generation_utils.py +0 -0
  103. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/human_evaluation.py +0 -0
  104. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/languages.py +0 -0
  105. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/model_config.py +0 -0
  106. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/model_loading.py +0 -0
  107. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  108. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  109. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  110. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  111. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  112. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  113. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  114. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/scores.py +0 -0
  115. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/speed_benchmark.py +0 -0
  116. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  117. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  118. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  119. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  120. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  121. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/tasks.py +0 -0
  122. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/tokenization_utils.py +0 -0
  123. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/types.py +0 -0
  124. {euroeval-15.8.0 → euroeval-15.8.2}/src/euroeval/utils.py +0 -0
  125. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/constants.py +0 -0
  126. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_allocine.py +0 -0
  127. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_angry_tweets.py +0 -0
  128. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_arc.py +0 -0
  129. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_arc_is.py +0 -0
  130. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_belebele.py +0 -0
  131. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_cnn_dailymail.py +0 -0
  132. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_conll_en.py +0 -0
  133. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_conll_es.py +0 -0
  134. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_conll_nl.py +0 -0
  135. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_dane.py +0 -0
  136. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  137. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_dansk.py +0 -0
  138. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_danske_talemaader.py +0 -0
  139. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  140. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_dbrd.py +0 -0
  141. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_dutch_cola.py +0 -0
  142. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_eltec.py +0 -0
  143. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_fone.py +0 -0
  144. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_foqa.py +0 -0
  145. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_fosent.py +0 -0
  146. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_fquad.py +0 -0
  147. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_germanquad.py +0 -0
  148. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_germeval.py +0 -0
  149. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_hellaswag.py +0 -0
  150. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_hellaswag_fi.py +0 -0
  151. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  152. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_ice_linguistic.py +0 -0
  153. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  154. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  155. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_icelandic_qa.py +0 -0
  156. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_icesum.py +0 -0
  157. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_ilpost_sum.py +0 -0
  158. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_jentoft.py +0 -0
  159. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_mim_gold_ner.py +0 -0
  160. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_mlqa_es.py +0 -0
  161. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_mlsum_de.py +0 -0
  162. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_mlsum_es.py +0 -0
  163. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_mmlu.py +0 -0
  164. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_multinerd-it.py +0 -0
  165. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_no_cola.py +0 -0
  166. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_no_sammendrag.py +0 -0
  167. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  168. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_nordjylland_news.py +0 -0
  169. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_norec.py +0 -0
  170. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_norglm_multiqa.py +0 -0
  171. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_norglm_multisum.py +0 -0
  172. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_norne.py +0 -0
  173. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_norquad.py +0 -0
  174. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_nqii.py +0 -0
  175. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  176. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_orange_sum.py +0 -0
  177. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_personal_sum.py +0 -0
  178. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_rrn.py +0 -0
  179. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_sb10k.py +0 -0
  180. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_scala.py +0 -0
  181. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_scandiqa.py +0 -0
  182. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_scandisent_fi.py +0 -0
  183. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_schibsted.py +0 -0
  184. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  185. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_sentipolc16.py +0 -0
  186. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_squad.py +0 -0
  187. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_squad_it.py +0 -0
  188. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_squad_nl.py +0 -0
  189. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_squad_nl_old.py +0 -0
  190. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_sst5.py +0 -0
  191. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_suc3.py +0 -0
  192. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_swedn.py +0 -0
  193. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_swerec.py +0 -0
  194. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_turku_ner_fi.py +0 -0
  195. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_tydiqa_fi.py +0 -0
  196. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  197. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_wikiann_fo.py +0 -0
  198. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_wikineural-it.py +0 -0
  199. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_winogrande_is.py +0 -0
  200. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_xlsum_fi.py +0 -0
  201. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.8.0 → euroeval-15.8.2}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.8.0 → euroeval-15.8.2}/tests/__init__.py +0 -0
  206. {euroeval-15.8.0 → euroeval-15.8.2}/tests/conftest.py +0 -0
  207. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_benchmarker.py +0 -0
  215. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_callbacks.py +0 -0
  216. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_cli.py +0 -0
  217. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_constants.py +0 -0
  218. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_data_models.py +0 -0
  220. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_enums.py +0 -0
  222. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_generation.py +0 -0
  225. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_languages.py +0 -0
  227. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_model_config.py +0 -0
  229. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_scores.py +0 -0
  231. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_tasks.py +0 -0
  238. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_types.py +0 -0
  240. {euroeval-15.8.0 → euroeval-15.8.2}/tests/test_utils.py +0 -0
@@ -10,6 +10,10 @@ on:
10
10
  branches:
11
11
  - main
12
12
 
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref }}
15
+ cancel-in-progress: true
16
+
13
17
  jobs:
14
18
  code-check:
15
19
  if: github.event.pull_request.draft == false
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.11.8
13
+ rev: v0.11.9
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -10,6 +10,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
10
10
 
11
11
 
12
12
 
13
+ ## [v15.8.2] - 2025-05-12
14
+ ### Fixed
15
+ - Catch error when caching generative model outputs, when the number of model inputs and
16
+ outputs do not match.
17
+ - Disallow vLLM >=0.8.5, as it breaks generation output for several models.
18
+
19
+
20
+ ## [v15.8.1] - 2025-05-08
21
+ ### Fixed
22
+ - NER labels were included twice in the prompt templates (which was due to there being
23
+ both, e.g., `B-ORG` and `I-ORG`). This caused models not using structured generation,
24
+ such as reasoning models, to sometimes output the wrong labels. This has been fixed
25
+ now.
26
+ - If a model outputs a `\boxed{}` answer, we now extract and use that, rather than the
27
+ full generated answer.
28
+
29
+
13
30
  ## [v15.8.0] - 2025-05-07
14
31
  ### Added
15
32
  - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.8.0
3
+ Version: 15.8.2
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
62
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
64
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
65
+ Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
69
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
70
+ Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
71
71
  Provides-Extra: human-evaluation
72
72
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
73
  Provides-Extra: test
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "EuroEval"
3
- version = "15.8.0"
3
+ version = "15.8.2"
4
4
  description = "The robust European language model benchmark."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -46,7 +46,7 @@ dependencies = [
46
46
  generative = [
47
47
  "outlines>=0.1.11",
48
48
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
49
- "vllm>=0.8.3; platform_system == 'Linux'",
49
+ "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
50
50
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
51
51
  ]
52
52
  human_evaluation = [
@@ -55,7 +55,7 @@ human_evaluation = [
55
55
  all = [
56
56
  "outlines>=0.1.11",
57
57
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
58
- "vllm>=0.8.3; platform_system == 'Linux'",
58
+ "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
59
59
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
60
60
  "gradio>=4.26.0",
61
61
  ]
@@ -401,6 +401,12 @@ class LiteLLMModel(BenchmarkModule):
401
401
  model_responses=ordered_responses, model_id=self.model_config.model_id
402
402
  )
403
403
 
404
+ if len(messages) != len(model_output.sequences):
405
+ raise InvalidBenchmark(
406
+ f"Number of model inputs ({len(messages):,}) does not match the "
407
+ f"number of model outputs ({len(model_output.sequences):,})."
408
+ )
409
+
404
410
  return model_output
405
411
 
406
412
  def _handle_exception(
@@ -616,8 +622,7 @@ class LiteLLMModel(BenchmarkModule):
616
622
  scores = []
617
623
  for model_response in model_responses:
618
624
  if not model_response.choices:
619
- # This happens for reasoning models, when they don't finish thinking
620
- # and run out of tokens. Happens quite rarely, but we need to handle it.
625
+ sequences.append("")
621
626
  logger.warning(
622
627
  f"The model {model_id!r} did not end up "
623
628
  "generating any text. This is likely because the model ran "
@@ -529,12 +529,16 @@ class DatasetConfig:
529
529
  else:
530
530
  sep_word = main_language.or_separator
531
531
 
532
+ local_labels: list[str] = []
533
+ for label in self.labels:
534
+ if label not in self.prompt_label_mapping:
535
+ continue
536
+ local_label = self.prompt_label_mapping[label]
537
+ if local_label not in local_labels:
538
+ local_labels.append(local_label)
539
+
532
540
  # Convert labels to single-quoted labels - and remove duplicates
533
- quoted_labels = [
534
- f"'{self.prompt_label_mapping[label]}'"
535
- for label in set(self.labels)
536
- if label in self.prompt_label_mapping
537
- ]
541
+ quoted_labels = [f"'{label}'" for label in local_labels]
538
542
 
539
543
  if not quoted_labels:
540
544
  return ""
@@ -168,6 +168,15 @@ class ModelCache:
168
168
  input_column = "messages" if "messages" in model_inputs else "text"
169
169
  model_inputs = model_inputs[input_column]
170
170
 
171
+ # Double check that the number of inputs and outputs match
172
+ if not len(model_inputs) == len(model_output.sequences):
173
+ logger.warning(
174
+ f"Number of model inputs ({len(model_inputs)}) does not match the "
175
+ f"number of model outputs ({len(model_output.sequences)}). We will not "
176
+ f"cache the model outputs."
177
+ )
178
+ return
179
+
171
180
  # Store the generated sequences in the cache, one by one
172
181
  with tqdm(
173
182
  iterable=model_inputs,
@@ -144,9 +144,27 @@ def extract_labels_from_generation(
144
144
  )
145
145
  if labels is not None:
146
146
  return labels
147
- return get_closest_word_edit_labels(
148
- generated_sequences=model_output.sequences, dataset_config=dataset_config
149
- )
147
+
148
+ candidate_labels = [
149
+ dataset_config.prompt_label_mapping[lbl]
150
+ for lbl in dataset_config.id2label.values()
151
+ ]
152
+ new_predicted_labels: list[str] = list()
153
+ for predicted_label in model_output.sequences:
154
+ # If the prediction includes a boxed answer, use that instead of the full
155
+ # generation
156
+ if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
157
+ predicted_label = m.group(1)
158
+
159
+ # Pick the label with the smallest word edit distance to the predicted label
160
+ edit_distances = [
161
+ Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
162
+ for candidate_label in candidate_labels
163
+ ]
164
+ predicted_label = candidate_labels[np.argmin(edit_distances).item()]
165
+ new_predicted_labels.append(predicted_label)
166
+
167
+ return new_predicted_labels
150
168
 
151
169
 
152
170
  def get_closest_logprobs_labels(
@@ -305,32 +323,3 @@ def get_closest_logprobs_labels(
305
323
 
306
324
  assert len(output_labels) == len(generation_logprobs)
307
325
  return output_labels
308
-
309
-
310
- def get_closest_word_edit_labels(
311
- generated_sequences: list[str], dataset_config: "DatasetConfig"
312
- ) -> list[str]:
313
- """Get the labels with the smallest edit distance to the predicted labels.
314
-
315
- Args:
316
- generated_sequences:
317
- The generated sequences from the model.
318
- dataset_config:
319
- The configuration of the dataset.
320
-
321
- Returns:
322
- The candidate labels with the smallest edit distance to the predicted labels.
323
- """
324
- candidate_labels = [
325
- dataset_config.prompt_label_mapping[lbl]
326
- for lbl in dataset_config.id2label.values()
327
- ]
328
- new_predicted_labels: list[str] = list()
329
- for predicted_label in generated_sequences:
330
- edit_distances = [
331
- Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
332
- for candidate_label in candidate_labels
333
- ]
334
- closest_label = candidate_labels[np.argmin(edit_distances).item()]
335
- new_predicted_labels.append(closest_label)
336
- return new_predicted_labels
@@ -906,7 +906,7 @@ wheels = [
906
906
 
907
907
  [[package]]
908
908
  name = "euroeval"
909
- version = "15.8.0"
909
+ version = "15.8.2"
910
910
  source = { editable = "." }
911
911
  dependencies = [
912
912
  { name = "accelerate" },
@@ -1034,8 +1034,8 @@ requires-dist = [
1034
1034
  { name = "termcolor", specifier = ">=2.0.0" },
1035
1035
  { name = "torch", specifier = ">=2.6.0" },
1036
1036
  { name = "transformers", specifier = ">=4.51.0" },
1037
- { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.8.3" },
1038
- { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.8.3" },
1037
+ { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.8.3,<0.8.5" },
1038
+ { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.8.3,<0.8.5" },
1039
1039
  ]
1040
1040
  provides-extras = ["generative", "human-evaluation", "all", "test"]
1041
1041
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes