EuroEval 15.4.1.tar.gz → 15.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (211)
  1. {euroeval-15.4.1 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +2 -0
  2. {euroeval-15.4.1 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/bug.yaml +17 -2
  3. {euroeval-15.4.1 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +1 -11
  4. {euroeval-15.4.1 → euroeval-15.4.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +21 -10
  5. {euroeval-15.4.1 → euroeval-15.4.2}/.github/workflows/ci.yaml +0 -2
  6. {euroeval-15.4.1 → euroeval-15.4.2}/CHANGELOG.md +46 -1
  7. {euroeval-15.4.1 → euroeval-15.4.2}/PKG-INFO +4 -3
  8. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/danish.md +4 -5
  9. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/french.md +2 -2
  10. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/spanish.md +1 -1
  11. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/swedish.md +4 -5
  12. {euroeval-15.4.1 → euroeval-15.4.2}/pyproject.toml +4 -4
  13. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/hf.py +68 -37
  14. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/vllm.py +47 -8
  15. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/constants.py +3 -0
  16. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/data_models.py +7 -2
  17. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/dataset_configs.py +5 -5
  18. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/sequence_classification.py +32 -27
  19. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/types.py +3 -3
  20. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/utils.py +32 -29
  21. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_mlsum_de.py +1 -1
  22. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_mlsum_es.py +1 -1
  23. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_constants.py +1 -1
  24. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_utils.py +0 -11
  25. {euroeval-15.4.1 → euroeval-15.4.2}/uv.lock +408 -372
  26. {euroeval-15.4.1 → euroeval-15.4.2}/.gitignore +0 -0
  27. {euroeval-15.4.1 → euroeval-15.4.2}/.pre-commit-config.yaml +0 -0
  28. {euroeval-15.4.1 → euroeval-15.4.2}/CITATION.cff +0 -0
  29. {euroeval-15.4.1 → euroeval-15.4.2}/CODE_OF_CONDUCT.md +0 -0
  30. {euroeval-15.4.1 → euroeval-15.4.2}/CONTRIBUTING.md +0 -0
  31. {euroeval-15.4.1 → euroeval-15.4.2}/Dockerfile.cuda +0 -0
  32. {euroeval-15.4.1 → euroeval-15.4.2}/LICENSE +0 -0
  33. {euroeval-15.4.1 → euroeval-15.4.2}/README.md +0 -0
  34. {euroeval-15.4.1 → euroeval-15.4.2}/docs/CNAME +0 -0
  35. {euroeval-15.4.1 → euroeval-15.4.2}/docs/README.md +0 -0
  36. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/README.md +0 -0
  37. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/dutch.md +0 -0
  38. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/english.md +0 -0
  39. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/faroese.md +0 -0
  40. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/german.md +0 -0
  41. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/icelandic.md +0 -0
  42. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/italian.md +0 -0
  43. {euroeval-15.4.1 → euroeval-15.4.2}/docs/datasets/norwegian.md +0 -0
  44. {euroeval-15.4.1 → euroeval-15.4.2}/docs/extras/radial_plotter.md +0 -0
  45. {euroeval-15.4.1 → euroeval-15.4.2}/docs/faq.md +0 -0
  46. {euroeval-15.4.1 → euroeval-15.4.2}/docs/gfx/favicon.png +0 -0
  47. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  48. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  49. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/english.md +0 -0
  50. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  51. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/french.md +0 -0
  52. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/german.md +0 -0
  53. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  54. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  55. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  56. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  57. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Multilingual/european.md +0 -0
  58. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  59. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  60. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  61. {euroeval-15.4.1 → euroeval-15.4.2}/docs/leaderboards/README.md +0 -0
  62. {euroeval-15.4.1 → euroeval-15.4.2}/docs/methodology.md +0 -0
  63. {euroeval-15.4.1 → euroeval-15.4.2}/docs/python-package.md +0 -0
  64. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/README.md +0 -0
  65. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/common-sense-reasoning.md +0 -0
  66. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/knowledge.md +0 -0
  67. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/linguistic-acceptability.md +0 -0
  68. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/named-entity-recognition.md +0 -0
  69. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/reading-comprehension.md +0 -0
  70. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/sentiment-classification.md +0 -0
  71. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/speed.md +0 -0
  72. {euroeval-15.4.1 → euroeval-15.4.2}/docs/tasks/summarization.md +0 -0
  73. {euroeval-15.4.1 → euroeval-15.4.2}/gfx/euroeval.png +0 -0
  74. {euroeval-15.4.1 → euroeval-15.4.2}/gfx/euroeval.xcf +0 -0
  75. {euroeval-15.4.1 → euroeval-15.4.2}/gfx/scandeval.png +0 -0
  76. {euroeval-15.4.1 → euroeval-15.4.2}/makefile +0 -0
  77. {euroeval-15.4.1 → euroeval-15.4.2}/mkdocs.yaml +0 -0
  78. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/__init__.py +0 -0
  79. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_config_factory.py +0 -0
  80. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  81. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/base.py +0 -0
  82. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  83. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmark_modules/litellm.py +0 -0
  84. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/benchmarker.py +0 -0
  85. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/callbacks.py +0 -0
  86. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/cli.py +0 -0
  87. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/data_loading.py +0 -0
  88. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/enums.py +0 -0
  89. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/exceptions.py +0 -0
  90. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/finetuning.py +0 -0
  91. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/generation.py +0 -0
  92. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/human_evaluation.py +0 -0
  93. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/languages.py +0 -0
  94. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/model_cache.py +0 -0
  95. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/model_config.py +0 -0
  96. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/model_loading.py +0 -0
  97. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/scores.py +0 -0
  98. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/speed_benchmark.py +0 -0
  99. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/__init__.py +0 -0
  100. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/multiple_choice_classification.py +0 -0
  101. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/question_answering.py +0 -0
  102. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/text_to_text.py +0 -0
  103. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/task_utils/token_classification.py +0 -0
  104. {euroeval-15.4.1 → euroeval-15.4.2}/src/euroeval/tasks.py +0 -0
  105. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/constants.py +0 -0
  106. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_allocine.py +0 -0
  107. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_angry_tweets.py +0 -0
  108. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_arc.py +0 -0
  109. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_arc_is.py +0 -0
  110. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_belebele.py +0 -0
  111. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_cnn_dailymail.py +0 -0
  112. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_conll_en.py +0 -0
  113. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_conll_es.py +0 -0
  114. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_conll_nl.py +0 -0
  115. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_dane.py +0 -0
  116. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  117. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_dansk.py +0 -0
  118. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_danske_talemaader.py +0 -0
  119. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  120. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_dbrd.py +0 -0
  121. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_dutch_cola.py +0 -0
  122. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_dutch_social.py +0 -0
  123. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_eltec.py +0 -0
  124. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_fone.py +0 -0
  125. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_foqa.py +0 -0
  126. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_fosent.py +0 -0
  127. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_fquad.py +0 -0
  128. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_germanquad.py +0 -0
  129. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_germeval.py +0 -0
  130. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_hellaswag.py +0 -0
  131. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  132. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_ice_linguistic.py +0 -0
  133. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  134. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  135. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_icelandic_qa.py +0 -0
  136. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_icesum.py +0 -0
  137. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_ilpost_sum.py +0 -0
  138. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_jentoft.py +0 -0
  139. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_mim_gold_ner.py +0 -0
  140. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_mlqa_es.py +0 -0
  141. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_mmlu.py +0 -0
  142. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_multinerd-it.py +0 -0
  143. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_no_cola.py +0 -0
  144. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_no_sammendrag.py +0 -0
  145. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  146. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_nordjylland_news.py +0 -0
  147. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_norec.py +0 -0
  148. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_norglm_multiqa.py +0 -0
  149. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_norglm_multisum.py +0 -0
  150. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_norne.py +0 -0
  151. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_norquad.py +0 -0
  152. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_nqii.py +0 -0
  153. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  154. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_orange_sum.py +0 -0
  155. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_personal_sum.py +0 -0
  156. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_rrn.py +0 -0
  157. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_sb10k.py +0 -0
  158. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_scala.py +0 -0
  159. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_scandiqa.py +0 -0
  160. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_schibsted.py +0 -0
  161. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  162. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_sentipolc16.py +0 -0
  163. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_squad.py +0 -0
  164. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_squad_it.py +0 -0
  165. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_squad_nl.py +0 -0
  166. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_squad_nl_old.py +0 -0
  167. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_sst5.py +0 -0
  168. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_suc3.py +0 -0
  169. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_swedn.py +0 -0
  170. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_swerec.py +0 -0
  171. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  172. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_wikiann_fo.py +0 -0
  173. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_wikineural-it.py +0 -0
  174. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_winogrande_is.py +0 -0
  175. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/create_xquad_es.py +0 -0
  176. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/fix_dot_env_file.py +0 -0
  177. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/load_ud_pos.py +0 -0
  178. {euroeval-15.4.1 → euroeval-15.4.2}/src/scripts/versioning.py +0 -0
  179. {euroeval-15.4.1 → euroeval-15.4.2}/tests/__init__.py +0 -0
  180. {euroeval-15.4.1 → euroeval-15.4.2}/tests/conftest.py +0 -0
  181. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_config_factory.py +0 -0
  182. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/__init__.py +0 -0
  183. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/test_base.py +0 -0
  184. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  185. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  186. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  187. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  188. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_benchmarker.py +0 -0
  189. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_callbacks.py +0 -0
  190. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_cli.py +0 -0
  191. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_data_loading.py +0 -0
  192. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_data_models.py +0 -0
  193. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_dataset_configs.py +0 -0
  194. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_enums.py +0 -0
  195. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_exceptions.py +0 -0
  196. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_finetuning.py +0 -0
  197. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_generation.py +0 -0
  198. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_human_evaluation.py +0 -0
  199. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_languages.py +0 -0
  200. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_model_cache.py +0 -0
  201. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_model_config.py +0 -0
  202. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_model_loading.py +0 -0
  203. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_scores.py +0 -0
  204. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_speed_benchmark.py +0 -0
  205. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_task_utils/__init__.py +0 -0
  206. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_task_utils/test_question_answering.py +0 -0
  207. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  208. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  209. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_task_utils/test_token_classification.py +0 -0
  210. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_tasks.py +0 -0
  211. {euroeval-15.4.1 → euroeval-15.4.2}/tests/test_types.py +0 -0

.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml
@@ -2,6 +2,7 @@ name: 📚 Benchmark Dataset Request
  description: Do you think a particular benchmark dataset is missing in EuroEval?
  title: "[BENCHMARK DATASET REQUEST] <dataset-name>"
  labels: "benchmark dataset request"
+ type: task

  body:
  - type: input
@@ -30,6 +31,7 @@ body:
  - label: Icelandic
  - label: Italian
  - label: Norwegian (Bokmål or Nynorsk)
+ - label: Spanish
  - label: Swedish
  validations:
  required: true

.github/ISSUE_TEMPLATE/bug.yaml
@@ -1,7 +1,7 @@
  name: 🐛 Bug Report
  description: Have you experienced a bug using the `euroeval` package?
  title: "[BUG] <name-of-bug>"
- labels: bug
+ type: bug

  body:
  - type: markdown
@@ -46,8 +46,9 @@ body:
  - 3.10.x
  - 3.11.x
  - 3.12.x
+ - 3.13.x
  - Older than 3.10.x
- - Newer than 3.12.x
+ - Newer than 3.13.x
  validations:
  required: true
  - type: input
@@ -57,6 +58,20 @@ body:
  placeholder: Output of `pip list | grep EuroEval`
  validations:
  required: true
+ - type: input
+ attributes:
+ label: Transformers version
+ description: What version of 🤗 transformers are you using?
+ placeholder: Output of `pip list | grep transformers`
+ validations:
+ required: true
+ - type: input
+ attributes:
+ label: vLLM version
+ description: What version of vLLM are you using?
+ placeholder: Output of `pip list | grep vllm`
+ validations:
+ required: true
  - type: markdown
  attributes:
  value: >

.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -1,7 +1,7 @@
  name: 🚀 Feature Request
  description: Is the EuroEval benchmark missing a feature?
  title: "[FEATURE REQUEST] <name-of-feature>"
- labels: enhancement
+ type: feature

  body:
  - type: textarea
@@ -11,16 +11,6 @@ body:
  A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*.
  validations:
  required: true
- - type: textarea
- attributes:
- label: Alternatives
- description: >
- A description of any alternative solutions or features you've considered, if any.
- - type: textarea
- attributes:
- label: Additional context
- description: >
- Add any other context or screenshots about the feature request.
  - type: markdown
  attributes:
  value: >

.github/ISSUE_TEMPLATE/model_evaluation_request.yaml
@@ -2,6 +2,7 @@ name: 📊 Model Evaluation Request
  description: Would you like to have a particular model included in the leaderboards?
  title: "[MODEL EVALUATION REQUEST] <model-name>"
  labels: "model evaluation request"
+ type: task

  body:
  - type: input
@@ -10,16 +11,6 @@ body:
  description: What is the Hugging Face model ID?
  validations:
  required: true
- - type: dropdown
- attributes:
- label: Model type
- description: What is the architecture of the model?
- options:
- - Decoder model (e.g., GPT)
- - Encoder model (e.g., BERT)
- - Sequence-to-sequence model (e.g., T5)
- validations:
- required: true
  - type: checkboxes
  attributes:
  label: Evaluation languages
@@ -36,9 +27,29 @@ body:
  - label: Icelandic
  - label: Italian
  - label: Norwegian (Bokmål or Nynorsk)
+ - label: Spanish
  - label: Swedish
  validations:
  required: true
+ - type: dropdown
+ attributes:
+ label: Model type
+ description: What is the architecture of the model?
+ options:
+ - Decoder model (e.g., GPT)
+ - Encoder model (e.g., BERT)
+ - Sequence-to-sequence model (e.g., T5)
+ validations:
+ required: true
+ - type: dropdown
+ attributes:
+ label: Model size
+ description: What is the size of the model?
+ options:
+ - Small (<=8B parameters)
+ - Large (>8B parameters)
+ validations:
+ required: true
  - type: dropdown
  attributes:
  label: Merged model

.github/workflows/ci.yaml
@@ -43,7 +43,6 @@ jobs:
  - name: Install uv and set up Python
  uses: astral-sh/setup-uv@v4
  with:
- enable-cache: true
  python-version: ${{ matrix.python-version }}

  - name: Install Dependencies
@@ -75,7 +74,6 @@ jobs:
  - name: Install uv and set up Python
  uses: astral-sh/setup-uv@v4
  with:
- enable-cache: true
  python-version: ${{ matrix.python-version }}

  - name: Install Dependencies

CHANGELOG.md
@@ -10,6 +10,51 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v15.4.2] - 2025-03-31
+ ### Added
+ - Now added version metadata to results, to more easily track which versions of the various
+ dependencies were used when evaluating a model. This currently includes
+ `transformers`, `torch`, `vllm` and `outlines`.
+
+ ### Changed
+ - Changed the name of the German 'mlsum' summarisation dataset to 'mlsum-de', to reflect
+ that it is the German version of the dataset, and to avoid confusion with the Spanish
+ 'mlsum-es' dataset.
+
+ ### Fixed
+ - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
+ compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+ - Corrected the name of the French sentiment dataset AlloCiné. This was contributed by
+ [@Alkarex](https://github.com/Alkarex) ✨
+ - Evaluating a specific model revision did not work for adapter models, as there was a
+ confusion between the revision of the adapter and the revision of the base model. We
+ now use the revision for the adapter and use the latest revision for the base model.
+ - In the (very unlikely) scenario that the model's tokeniser has the same first token
+ for two different labels in a text classification task, we now also use the second
+ token to ensure that we determine the correct label. If this is not possible, then we
+ warn the user.
+ - Now catches `TypeError` when trying to generate with vLLM, and retries 3 times before
+ giving up on evaluating the dataset.
+ - A bug in `transformers` caused models with the `image-text-to-text` pipeline tag to
+ not be detected as generative models. This has been patched now, and will be fixed
+ properly when [this transformers
+ PR](https://github.com/huggingface/transformers/pull/37107) has been merged.
+ - Force `vllm` v0.8.0 for now, as the severe degradation in generation output of some
+ models has not been resolved in versions v0.8.2 and v0.8.3.
+ - Only accepts the local labels for text classification tasks when evaluating decoder
+ models now, where we before accepted both the local and English labels. The reason is
+ that this caused confusion at times when there was a unique local label starting
+ with a particular letter, but a different English label starting with the same letter,
+ causing some models to be evaluated on the wrong label.
+ - When fetching the model information from the Hugging Face API we now attempt 3 times,
+ as the API sometimes fails. If it still fails after 3 attempts, we raise the
+ `HuggingFaceHubDown` exception.
+ - Now uses `fp16` instead of `bf16` when evaluating decoder models on GPUs with CUDA
+ compatibility < 8.0. This was contributed by [@marksverdhei](https://github.com/marksverdhei) ✨
+ - Fixed docs for ScandiQA-da and ScandiQA-sv, where it was incorrectly stated that
+ the splits were made by considering the original train/validation/test splits.
+
+
  ## [v15.4.1] - 2025-03-25
  ### Fixed
  - Disallow `vllm` v0.8.1, as it causes severe degradation in generation output of
@@ -211,7 +256,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

  ### Added
  - Added support for French! 🇫🇷This includes the sentiment classification dataset
- [Allocine](https://hf.co/datasets/tblard/allocine), the linguistic acceptability
+ [AlloCiné](https://hf.co/datasets/tblard/allocine), the linguistic acceptability
  dataset ScaLA with the [French Universal
  Dependencies](https://github.com/UniversalDependencies/UD_French-GSD), the reading
  comprehension dataset [FQuAD](https://hf.co/datasets/illuin/fquad) (and unofficially
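
The "version metadata" entry in the changelog above is backed by a new `get_package_version` helper in `euroeval.utils`, which is used in the `src/euroeval/data_models.py` hunk near the end of this diff but is not itself shown here. A minimal sketch, assuming the helper simply wraps `importlib.metadata` and returns `None` for packages that are not installed:

```python
import importlib.metadata


def get_package_version(package_name: str) -> str | None:
    """Return the installed version of `package_name`, or None if it is not installed."""
    try:
        return importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return None


# Values like this end up as e.g. `vllm_version` in each benchmark result.
print(get_package_version("vllm"))  # e.g. "0.8.0" with this release's pin, or None if absent
```

Recording these versions alongside `euroeval_version` makes it possible to reconstruct the evaluation environment for a given result later on.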

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.4.1
+ Version: 15.4.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: numpy<2.0.0,>=1.23.0
  Requires-Dist: ollama>=0.4.7
  Requires-Dist: pandas>=2.2.0
+ Requires-Dist: peft>=0.15.0
  Requires-Dist: protobuf~=3.20.0
  Requires-Dist: pydantic>=2.6.0
  Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
  Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test

docs/datasets/danish.md
@@ -285,11 +285,10 @@ the translated contexts still contained the answer to the question, potentially
  changing the answers slightly.

  The original full dataset consists of 6,810 / 500 / 500 samples for training,
- validation and testing, respectively. We use a 1,024 / 256 / 2,048 split for training,
- validation and testing, respectively (so 3,328 samples used in total). All validation
- samples in our version also belong to the original validation set, and all original test
- samples are included in our test set. The remaining 1,548 test samples in our version
- was sampled from the original training set.
+ validation and testing, respectively (so 3,328 samples used in total).
+ We use a 1,024 / 256 / 2,048 split for training, validation and testing, respectively,
+ where the splits are made by randomly sampling from the full dataset without considering
+ the original train/validation/test splits.

  Here are a few examples from the training split:


docs/datasets/french.md
@@ -7,11 +7,11 @@ information about what these constitute.

  ## Sentiment Classification

- ### Allocine
+ ### AlloCiné

  This dataset was published in [this Github
  repository](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) and
- features reviews from the French movie review website Allocine. The reviews range from
+ features reviews from the French movie review website [AlloCiné](https://www.allocine.fr/). The reviews range from
  0.5 to 5 (inclusive), with steps of 0.5. The negative samples are reviews with a rating
  of at most 2, and the positive ones are reviews with a rating of at least 4. The reviews
  in between were discarded.

docs/datasets/spanish.md
@@ -475,7 +475,7 @@ $ euroeval --model <model-id> --dataset hellaswag-es

  ## Summarization

- ### MLSum-es-mini
+ ### MLSum-es

  The dataset was published in [this paper](https://aclanthology.org/2020.emnlp-main.647/) and is obtained from online newspapers.


docs/datasets/swedish.md
@@ -231,11 +231,10 @@ the translated contexts still contained the answer to the question, potentially
  changing the answers slightly.

  The original full dataset consists of 6,810 / 500 / 500 samples for training,
- validation and testing, respectively. We use a 1,024 / 256 / 2,048 split for training,
- validation and testing, respectively (so 3,328 samples used in total). All validation
- samples in our version also belong to the original validation set, and all original test
- samples are included in our test set. The remaining 1,548 test samples in our version
- was sampled from the original training set.
+ validation and testing, respectively (so 3,328 samples used in total).
+ We use a 1,024 / 256 / 2,048 split for training, validation and testing, respectively,
+ where the splits are made by randomly sampling from the full dataset without considering
+ the original train/validation/test splits.

  Here are a few examples from the training split:


pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "15.4.1"
+ version = "15.4.2"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [
@@ -39,13 +39,14 @@ dependencies = [
  "setuptools>=75.8.2",
  "demjson3>=3.0.6",
  "ollama>=0.4.7",
+ "peft>=0.15.0",
  ]

  [project.optional-dependencies]
  generative = [
  "outlines>=0.1.11",
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.8.0,!=0.8.1; platform_system == 'Linux'",
+ "vllm==0.8.0; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
  ]
  human_evaluation = [
@@ -54,7 +55,7 @@ human_evaluation = [
  all = [
  "outlines>=0.1.11",
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.8.0,!=0.8.1; platform_system == 'Linux'",
+ "vllm==0.8.0; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
  "gradio>=4.26.0",
  ]
@@ -86,7 +87,6 @@ dev-dependencies = [
  "nbstripout>=0.7.1",
  "coverage>=5.5",
  "lxml>=5.1.0",
- "peft>=0.13.2",
  "mkdocs-material>=9.5.45",
  "mkdocs-include-markdown-plugin>=7.0.1",
  "mkdocs-include-dir-to-nav>=1.2.0",

src/euroeval/benchmark_modules/hf.py
@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
  HFValidationError,
  LocalTokenNotFoundError,
  )
+ from peft import PeftConfig
  from requests.exceptions import RequestException
  from torch import nn
  from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
  Trainer,
  )
  from transformers.modelcard import TASK_MAPPING
+ from transformers.models.auto.modeling_auto import (
+ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+ )
  from urllib3.exceptions import RequestError

  from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
  get_class_by_name,
  get_eos_token,
  internet_connection_available,
+ log_once,
  )
  from .base import BenchmarkModule

@@ -727,53 +732,54 @@ def get_model_repo_info(
  # If the model does not exist locally, then we get the model info from the Hugging
  # Face Hub
  if model_info is None:
- try:
- model_info = hf_api.model_info(
- repo_id=model_id, revision=revision, token=token
- )
- except (GatedRepoError, LocalTokenNotFoundError) as e:
+ num_attempts = 3
+ for _ in range(num_attempts):
  try:
- hf_whoami(token=token)
- logger.warning(
- f"Could not access the model {model_id} with the revision "
- f"{revision}. The error was {str(e)!r}."
+ model_info = hf_api.model_info(
+ repo_id=model_id, revision=revision, token=token
  )
+ break
+ except (GatedRepoError, LocalTokenNotFoundError) as e:
+ try:
+ hf_whoami(token=token)
+ logger.warning(
+ f"Could not access the model {model_id} with the revision "
+ f"{revision}. The error was {str(e)!r}."
+ )
+ return None
+ except LocalTokenNotFoundError:
+ raise NeedsAdditionalArgument(
+ cli_argument="--api-key",
+ script_argument="api_key=<your-api-key>",
+ run_with_cli=benchmark_config.run_with_cli,
+ )
+ except (RepositoryNotFoundError, HFValidationError):
  return None
- except LocalTokenNotFoundError:
- raise NeedsAdditionalArgument(
- cli_argument="--api-key",
- script_argument="api_key=<your-api-key>",
- run_with_cli=benchmark_config.run_with_cli,
- )
- except (RepositoryNotFoundError, HFValidationError):
- return None
- except (OSError, RequestException):
- if internet_connection_available():
- raise HuggingFaceHubDown()
- else:
+ except (OSError, RequestException):
+ if internet_connection_available():
+ continue
  raise NoInternetConnection()
+ else:
+ raise HuggingFaceHubDown()

  # Get all the Hugging Face repository tags for the model. If the model is an adapter
  # model, then we also get the tags for the base model
  tags = model_info.tags or list()
- has_base_model_tag = any(
- tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
- )
  base_model_id: str | None = None
- if has_base_model_tag:
- has_adapter_config = model_info.siblings is not None and any(
- sibling.rfilename == "adapter_config.json"
- for sibling in model_info.siblings
+ has_adapter_config = model_info.siblings is not None and any(
+ sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+ )
+ if has_adapter_config:
+ adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+ base_model_id = adapter_config.base_model_name_or_path
+ log_once(
+ f"Model {model_id!r} identified as an adapter model, with base model "
+ f"{base_model_id!r}.",
+ level=logging.DEBUG,
  )
- if has_adapter_config:
- base_model_id = [
- tag.split(":")[1]
- for tag in tags
- if tag.startswith("base_model:") and tag.count(":") == 1
- ][0]
+ if base_model_id is not None:
  base_model_info = hf_api.model_info(
  repo_id=base_model_id,
- revision=revision,
  token=benchmark_config.api_key
  or os.getenv("HUGGINGFACE_API_KEY")
  or True,
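
The rewritten `get_model_repo_info` above retries the Hub call up to three times using Python's `for`/`else` idiom: the `else` branch of a `for` loop runs only when the loop finishes without hitting `break`, i.e. when every attempt failed, which is where `HuggingFaceHubDown` is raised. A minimal self-contained sketch of the pattern, where `flaky_call` is a hypothetical stand-in for `hf_api.model_info`:

```python
import random


def flaky_call() -> int:
    """Hypothetical stand-in for hf_api.model_info: fails randomly to simulate API errors."""
    if random.random() < 0.5:
        raise OSError("transient failure")
    return 42


num_attempts = 3
for _ in range(num_attempts):
    try:
        result = flaky_call()
        break  # success: the loop's else branch is skipped
    except OSError:
        continue  # transient error: try the next attempt
else:
    # Reached only when no attempt succeeded (no break), mirroring the
    # HuggingFaceHubDown branch in the hunk above.
    raise RuntimeError(f"Giving up after {num_attempts} attempts")

print(result)
```

The real code additionally distinguishes a missing internet connection (raising `NoInternetConnection` immediately) from transient Hub errors (which are retried).
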
@@ -781,12 +787,18 @@
  tags += base_model_info.tags or list()
  tags = list(set(tags))

+ # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+ # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+ # when this PR has been merged in and published:
+ # https://github.com/huggingface/transformers/pull/37107
+ TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
  # Get the pipeline tag for the model. If it is not specified, then we determine it
  # by checking the model's architecture as written in the model's Hugging Face config
  pipeline_tag = model_info.pipeline_tag
  if pipeline_tag is None:
  hf_config = load_hf_model_config(
- model_id=model_id,
+ model_id=base_model_id or model_id,
  num_labels=0,
  id2label=dict(),
  label2id=dict(),
@@ -812,7 +824,6 @@
  pipeline_tag = "fill-mask"

  if benchmark_config.only_allow_safetensors:
- # Check if any file ends with .safetensors
  repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
  has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
  if not has_safetensors:
@@ -826,6 +837,26 @@
  )
  raise InvalidModel(msg)

+ # Also check base model if we are evaluating an adapter
+ if base_model_id is not None:
+ base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+ base_has_safetensors = any(
+ f.endswith(".safetensors") for f in base_repo_files
+ )
+ if not base_has_safetensors:
+ msg = (
+ f"Base model {base_model_id} does not have safetensors weights "
+ "available."
+ )
+ if benchmark_config.run_with_cli:
+ msg += " Skipping since the `--only-allow-safetensors` flag is set."
+ else:
+ msg += (
+ " Skipping since the `only_allow_safetensors` argument is set "
+ "to `True`."
+ )
+ raise InvalidModel(msg)
+
  return HFModelInfo(
  pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
  )

src/euroeval/benchmark_modules/vllm.py
@@ -30,6 +30,7 @@ from ..constants import (
  REASONING_MAX_TOKENS,
  TASK_GROUPS_USING_LOGPROBS,
  TASKS_USING_JSON,
+ VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
  )
  from ..data_models import (
  BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
  get_bos_token,
  get_end_of_chat_token_ids,
  get_eos_token,
+ get_min_cuda_compute_capability,
  log_once,
  should_prompts_be_stripped,
  )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if self.model_config.adapter_base_model_id is not None:
  adapter_path = snapshot_download(
  repo_id=self.model_config.model_id,
+ revision=self.model_config.revision,
  cache_dir=Path(self.model_config.model_cache_dir),
  )
  self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@

  # Generate sequences using vLLM
  input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
- raw_outputs = self._model.generate(
- prompts=prompts,
- sampling_params=sampling_params,
- use_tqdm=(not input_is_a_test),
- lora_request=self.buffer.get("lora_request"),
- )
+ num_attempts = 3
+ for _ in range(num_attempts):
+ try:
+ raw_outputs = self._model.generate(
+ prompts=prompts,
+ sampling_params=sampling_params,
+ use_tqdm=(not input_is_a_test),
+ lora_request=self.buffer.get("lora_request"),
+ )
+ break
+ except TypeError as e:
+ logger.debug(
+ f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+ )
+ sleep(1)
+ else:
+ raise InvalidBenchmark(
+ f"Could not generate sequences after {num_attempts} attempts."
+ )
+
+ # Parse the raw model outputs
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
@@ -846,13 +864,16 @@ def load_model_and_tokenizer(
  # Prefer base model ID if the model is an adapter - the adapter will be added on
  # during inference in this case
  model_id = model_config.adapter_base_model_id or model_config.model_id
+ revision = (
+ model_config.revision if model_config.adapter_base_model_id is None else "main"
+ )

  hf_model_config = load_hf_model_config(
  model_id=model_id,
  num_labels=0,
  id2label=dict(),
  label2id=dict(),
- revision=model_config.revision,
+ revision=revision,
  model_cache_dir=model_config.model_cache_dir,
  api_key=benchmark_config.api_key,
  trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@
  )
  dtype = torch.float16

+ if hf_model_config.torch_dtype == torch.bfloat16:
+ min_cuda_compute_capability = get_min_cuda_compute_capability()
+ required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+ if min_cuda_compute_capability is not None:
+ if min_cuda_compute_capability < required_capability:
+ logger.info(
+ "You are loading a model with "
+ f"dtype {hf_model_config.torch_dtype}, "
+ "which vLLM only supports for CUDA devices with"
+ f"CUDA compute capability >={required_capability}. "
+ "You are using one or more devices with "
+ f"compute capability {min_cuda_compute_capability}. "
+ "Setting dtype to float16 instead."
+ )
+ dtype = torch.float16
+
  if model_config.adapter_base_model_id is not None:
  download_dir = str(Path(model_config.model_cache_dir) / "base_model")
  else:
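
The new `bfloat16` fallback above relies on `get_min_cuda_compute_capability`, which is imported from `..utils` but not part of this diff. A plausible sketch, assuming it returns the lowest compute capability across all visible CUDA devices and `None` when CUDA is unavailable:

```python
import torch


def get_min_cuda_compute_capability() -> float | None:
    """Return the lowest CUDA compute capability among visible GPUs, or None without CUDA."""
    if not torch.cuda.is_available():
        return None
    capabilities = [
        # get_device_capability returns e.g. (8, 0) for an A100 or (7, 5) for a T4
        float("{}.{}".format(*torch.cuda.get_device_capability(device)))
        for device in range(torch.cuda.device_count())
    ]
    return min(capabilities)
```

Combined with the `VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0` constant added in `src/euroeval/constants.py` below, this means pre-Ampere GPUs such as V100s (7.0) and T4s (7.5) fall back to `float16`.
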
@@ -916,7 +954,7 @@
  max_model_len=min(true_max_model_len, 5_000),
  download_dir=download_dir,
  trust_remote_code=benchmark_config.trust_remote_code,
- revision=model_config.revision,
+ revision=revision,
  seed=4242,
  distributed_executor_backend=executor_backend,
  tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@ def load_tokenizer(
  Returns:
  The loaded tokenizer.
  """
+ revision = revision if adapter_base_model_id is None else "main"
  config = AutoConfig.from_pretrained(
  adapter_base_model_id or model_id,
  revision=revision,

src/euroeval/constants.py
@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]

  # Hugging Face Hub tags used to classify models as merge models
  MERGE_TAGS = ["merge", "mergekit"]
+
+ # The minimum required CUDA compute capability for using bfloat16 in vLLM
+ VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0

src/euroeval/data_models.py
@@ -1,7 +1,6 @@
  """Data models used in EuroEval."""

  import collections.abc as c
- import importlib.metadata
  import json
  import pathlib
  import re
@@ -11,6 +10,8 @@ from dataclasses import dataclass, field
  import pydantic
  import torch

+ from euroeval.utils import get_package_version
+
  from .enums import Device, InferenceBackend, ModelType, TaskGroup
  from .types import ScoreDict

@@ -228,7 +229,11 @@ class BenchmarkResult(pydantic.BaseModel):
  generative_type: str | None
  few_shot: bool
  validation_split: bool
- euroeval_version: str = importlib.metadata.version("euroeval")
+ euroeval_version: str | None = get_package_version("euroeval")
+ transformers_version: str | None = get_package_version("transformers")
+ torch_version: str | None = get_package_version("torch")
+ vllm_version: str | None = get_package_version("vllm")
+ outlines_version: str | None = get_package_version("outlines")

  @classmethod
  def from_dict(cls, config: dict) -> "BenchmarkResult":