EuroEval 15.8.2.tar.gz → 15.9.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (240)
  1. {euroeval-15.8.2 → euroeval-15.9.0}/.github/workflows/ci.yaml +1 -1
  2. {euroeval-15.8.2 → euroeval-15.9.0}/.pre-commit-config.yaml +2 -2
  3. {euroeval-15.8.2 → euroeval-15.9.0}/CHANGELOG.md +18 -0
  4. {euroeval-15.8.2 → euroeval-15.9.0}/Dockerfile.cuda +1 -2
  5. {euroeval-15.8.2 → euroeval-15.9.0}/PKG-INFO +3 -5
  6. {euroeval-15.8.2 → euroeval-15.9.0}/README.md +0 -2
  7. {euroeval-15.8.2 → euroeval-15.9.0}/docs/README.md +2 -3
  8. {euroeval-15.8.2 → euroeval-15.9.0}/makefile +4 -7
  9. {euroeval-15.8.2 → euroeval-15.9.0}/pyproject.toml +3 -3
  10. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_config_factory.py +0 -31
  11. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/hf.py +26 -13
  12. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/vllm.py +70 -2
  13. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmarker.py +0 -21
  14. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/cli.py +0 -10
  15. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/data_models.py +0 -5
  16. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/exceptions.py +0 -22
  17. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/human_evaluation.py +0 -1
  18. {euroeval-15.8.2 → euroeval-15.9.0}/tests/conftest.py +0 -1
  19. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/test_hf.py +31 -27
  20. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_cli.py +0 -2
  21. {euroeval-15.8.2 → euroeval-15.9.0}/uv.lock +540 -162
  22. {euroeval-15.8.2 → euroeval-15.9.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  23. {euroeval-15.8.2 → euroeval-15.9.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  24. {euroeval-15.8.2 → euroeval-15.9.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  25. {euroeval-15.8.2 → euroeval-15.9.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  26. {euroeval-15.8.2 → euroeval-15.9.0}/.gitignore +0 -0
  27. {euroeval-15.8.2 → euroeval-15.9.0}/CITATION.cff +0 -0
  28. {euroeval-15.8.2 → euroeval-15.9.0}/CODE_OF_CONDUCT.md +0 -0
  29. {euroeval-15.8.2 → euroeval-15.9.0}/CONTRIBUTING.md +0 -0
  30. {euroeval-15.8.2 → euroeval-15.9.0}/LICENSE +0 -0
  31. {euroeval-15.8.2 → euroeval-15.9.0}/NEW_DATASET_GUIDE.md +0 -0
  32. {euroeval-15.8.2 → euroeval-15.9.0}/docs/CNAME +0 -0
  33. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/README.md +0 -0
  34. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/danish.md +0 -0
  35. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/dutch.md +0 -0
  36. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/english.md +0 -0
  37. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/faroese.md +0 -0
  38. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/finnish.md +0 -0
  39. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/french.md +0 -0
  40. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/german.md +0 -0
  41. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/icelandic.md +0 -0
  42. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/italian.md +0 -0
  43. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/norwegian.md +0 -0
  44. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/spanish.md +0 -0
  45. {euroeval-15.8.2 → euroeval-15.9.0}/docs/datasets/swedish.md +0 -0
  46. {euroeval-15.8.2 → euroeval-15.9.0}/docs/extras/radial_plotter.md +0 -0
  47. {euroeval-15.8.2 → euroeval-15.9.0}/docs/faq.md +0 -0
  48. {euroeval-15.8.2 → euroeval-15.9.0}/docs/gfx/favicon.png +0 -0
  49. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  50. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  51. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/english.md +0 -0
  52. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  53. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/french.md +0 -0
  54. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/german.md +0 -0
  55. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  56. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  57. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  58. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  59. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  60. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Multilingual/european.md +0 -0
  61. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  62. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  63. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  64. {euroeval-15.8.2 → euroeval-15.9.0}/docs/leaderboards/README.md +0 -0
  65. {euroeval-15.8.2 → euroeval-15.9.0}/docs/methodology.md +0 -0
  66. {euroeval-15.8.2 → euroeval-15.9.0}/docs/python-package.md +0 -0
  67. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/README.md +0 -0
  68. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/common-sense-reasoning.md +0 -0
  69. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/knowledge.md +0 -0
  70. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/linguistic-acceptability.md +0 -0
  71. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/named-entity-recognition.md +0 -0
  72. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/reading-comprehension.md +0 -0
  73. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/sentiment-classification.md +0 -0
  74. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/speed.md +0 -0
  75. {euroeval-15.8.2 → euroeval-15.9.0}/docs/tasks/summarization.md +0 -0
  76. {euroeval-15.8.2 → euroeval-15.9.0}/gfx/euroeval.png +0 -0
  77. {euroeval-15.8.2 → euroeval-15.9.0}/gfx/euroeval.xcf +0 -0
  78. {euroeval-15.8.2 → euroeval-15.9.0}/gfx/scandeval.png +0 -0
  79. {euroeval-15.8.2 → euroeval-15.9.0}/mkdocs.yaml +0 -0
  80. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/__init__.py +0 -0
  81. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  82. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/base.py +0 -0
  83. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
  84. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/benchmark_modules/litellm.py +0 -0
  85. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/callbacks.py +0 -0
  86. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/constants.py +0 -0
  87. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/data_loading.py +0 -0
  88. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/__init__.py +0 -0
  89. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/danish.py +0 -0
  90. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/dutch.py +0 -0
  91. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/english.py +0 -0
  92. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/faroese.py +0 -0
  93. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/finnish.py +0 -0
  94. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/french.py +0 -0
  95. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/german.py +0 -0
  96. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
  97. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/italian.py +0 -0
  98. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
  99. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/spanish.py +0 -0
  100. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/dataset_configs/swedish.py +0 -0
  101. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/enums.py +0 -0
  102. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/finetuning.py +0 -0
  103. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/generation.py +0 -0
  104. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/generation_utils.py +0 -0
  105. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/languages.py +0 -0
  106. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/model_cache.py +0 -0
  107. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/model_config.py +0 -0
  108. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/model_loading.py +0 -0
  109. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/__init__.py +0 -0
  110. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  111. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  112. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  113. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  114. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  115. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/prompt_templates/summarization.py +0 -0
  116. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/scores.py +0 -0
  117. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/speed_benchmark.py +0 -0
  118. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/__init__.py +0 -0
  119. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  120. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
  121. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  122. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  123. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
  124. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/tasks.py +0 -0
  125. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/tokenization_utils.py +0 -0
  126. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/types.py +0 -0
  127. {euroeval-15.8.2 → euroeval-15.9.0}/src/euroeval/utils.py +0 -0
  128. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/constants.py +0 -0
  129. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_allocine.py +0 -0
  130. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_angry_tweets.py +0 -0
  131. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_arc.py +0 -0
  132. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_arc_is.py +0 -0
  133. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_belebele.py +0 -0
  134. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_cnn_dailymail.py +0 -0
  135. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_conll_en.py +0 -0
  136. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_conll_es.py +0 -0
  137. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_conll_nl.py +0 -0
  138. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_dane.py +0 -0
  139. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  140. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_dansk.py +0 -0
  141. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_danske_talemaader.py +0 -0
  142. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  143. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_dbrd.py +0 -0
  144. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_dutch_cola.py +0 -0
  145. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_eltec.py +0 -0
  146. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_fone.py +0 -0
  147. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_foqa.py +0 -0
  148. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_fosent.py +0 -0
  149. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_fquad.py +0 -0
  150. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_germanquad.py +0 -0
  151. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_germeval.py +0 -0
  152. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_hellaswag.py +0 -0
  153. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_hellaswag_fi.py +0 -0
  154. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  155. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_ice_linguistic.py +0 -0
  156. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  157. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  158. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_icelandic_qa.py +0 -0
  159. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_icesum.py +0 -0
  160. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_ilpost_sum.py +0 -0
  161. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_jentoft.py +0 -0
  162. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_mim_gold_ner.py +0 -0
  163. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_mlqa_es.py +0 -0
  164. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_mlsum_de.py +0 -0
  165. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_mlsum_es.py +0 -0
  166. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_mmlu.py +0 -0
  167. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_multinerd-it.py +0 -0
  168. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_no_cola.py +0 -0
  169. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_no_sammendrag.py +0 -0
  170. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  171. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_nordjylland_news.py +0 -0
  172. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_norec.py +0 -0
  173. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_norglm_multiqa.py +0 -0
  174. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_norglm_multisum.py +0 -0
  175. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_norne.py +0 -0
  176. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_norquad.py +0 -0
  177. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_nqii.py +0 -0
  178. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  179. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_orange_sum.py +0 -0
  180. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_personal_sum.py +0 -0
  181. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_rrn.py +0 -0
  182. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_sb10k.py +0 -0
  183. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_scala.py +0 -0
  184. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_scandiqa.py +0 -0
  185. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_scandisent_fi.py +0 -0
  186. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_schibsted.py +0 -0
  187. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  188. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_sentipolc16.py +0 -0
  189. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_squad.py +0 -0
  190. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_squad_it.py +0 -0
  191. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_squad_nl.py +0 -0
  192. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_squad_nl_old.py +0 -0
  193. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_sst5.py +0 -0
  194. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_suc3.py +0 -0
  195. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_swedn.py +0 -0
  196. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_swerec.py +0 -0
  197. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_turku_ner_fi.py +0 -0
  198. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_tydiqa_fi.py +0 -0
  199. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  200. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_wikiann_fo.py +0 -0
  201. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_wikineural-it.py +0 -0
  202. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_winogrande_is.py +0 -0
  203. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_xlsum_fi.py +0 -0
  204. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/create_xquad_es.py +0 -0
  205. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/fix_dot_env_file.py +0 -0
  206. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/load_ud_pos.py +0 -0
  207. {euroeval-15.8.2 → euroeval-15.9.0}/src/scripts/versioning.py +0 -0
  208. {euroeval-15.8.2 → euroeval-15.9.0}/tests/__init__.py +0 -0
  209. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_config_factory.py +0 -0
  210. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/__init__.py +0 -0
  211. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/test_base.py +0 -0
  212. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  213. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  214. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  215. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_benchmarker.py +0 -0
  216. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_callbacks.py +0 -0
  217. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_constants.py +0 -0
  218. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_data_models.py +0 -0
  220. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_enums.py +0 -0
  222. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_generation.py +0 -0
  225. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_languages.py +0 -0
  227. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_model_config.py +0 -0
  229. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_scores.py +0 -0
  231. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_tasks.py +0 -0
  238. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_types.py +0 -0
  240. {euroeval-15.8.2 → euroeval-15.9.0}/tests/test_utils.py +0 -0
@@ -30,7 +30,7 @@ jobs:
  python-version: "3.11"
  - run: python -m pip install pre-commit
  shell: bash
- - run: pre-commit run --show-diff-on-failure --color=always
+ - run: pre-commit run --show-diff-on-failure --color=always --all-files
  shell: bash

  pytest-linux:
@@ -10,7 +10,7 @@ repos:
  - id: trailing-whitespace
  - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.9
+ rev: v0.11.12
  hooks:
  - id: ruff
  args:
@@ -31,7 +31,7 @@ repos:
  hooks:
  - id: nbstripout
  - repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.15.0
+ rev: v1.16.0
  hooks:
  - id: mypy
  args:
@@ -10,6 +10,24 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v15.9.0] - 2025-05-31
+ ### Changed
+ - Updated `vllm` to `>=0.9.0`, as the bug in `v0.8.5` has been fixed.
+ - Removed the `--use-flash-attention` flag as well as the corresponding warning, as
+ flash attention is now built-in to vLLM and is used by default.
+
+ ### Fixed
+ - When truncating prompts with vLLM models, we now correctly truncate them down below
+ the `MAX_CONTEXT_LENGTH` (set to 5,000 tokens). We have already ensured that all
+ prompts have less than 5,000 Gemma-3 tokens, but sometimes tokenizers add a few more
+ tokens.
+ - Fixed an issue regarding model existence check when benchmarking models on custom
+ inference API servers.
+ - Fixed an issue with Phi-4 models, as they output multiple end-of-reasoning tokens, and
+ it was previously cutting off at the first one, yielding faulty final answers. We now
+ cut off at the last end-of-reasoning token, which is the correct one.
+
+
  ## [v15.8.2] - 2025-05-12
  ### Fixed
  - Catch error when caching generative model outputs, when the number of model inputs and
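The Phi-4 fix described in the changelog entry above keeps only the tokens after the final end-of-reasoning marker instead of the first one. A minimal, self-contained sketch of that cutoff logic follows; the token ID `42` and the function name are illustrative only, not EuroEval's actual identifiers.

```python
def strip_reasoning(token_ids: list[int], end_of_reasoning_token_id: int = 42) -> list[int]:
    """Keep only the tokens after the LAST end-of-reasoning token, if one is present."""
    if end_of_reasoning_token_id not in token_ids:
        return token_ids
    # Previously the cutoff used the index of the first occurrence; using the last
    # occurrence is what makes multi-marker outputs (e.g. from Phi-4) parse correctly.
    last_index = max(i for i, t in enumerate(token_ids) if t == end_of_reasoning_token_id)
    return token_ids[last_index + 1 :]


# Example: two end-of-reasoning tokens (42); only the tokens after the second are kept.
assert strip_reasoning([1, 2, 42, 3, 42, 4, 5]) == [4, 5]
```

The corresponding production change is visible in the vllm benchmark module hunk further down.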
@@ -5,8 +5,7 @@ RUN apt-get -y update && \
  apt-get -y upgrade && \
  DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gcc python3.11 python3-pip python3-dev git-all && \
  python3 -m pip install --upgrade pip wheel && \
- python3 -m pip install euroeval[all] && \
- FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python3 -m pip install flash-attn --no-build-isolation
+ python3 -m pip install euroeval[all]

  # Move the existing evaluation results into the container, to avoid re-running the
  # evaluation
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.8.2
+ Version: 15.9.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
  Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -97,8 +97,6 @@ ______________________________________________________________________

  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
  dan.nielsen@alexandra.dk)
- - Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
- kenneth.enevoldsen@cas.au.dk)


  ## Installation
@@ -21,8 +21,6 @@ ______________________________________________________________________

  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
  dan.nielsen@alexandra.dk)
- - Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
- kenneth.enevoldsen@cas.au.dk)


  ## Installation
@@ -32,6 +32,5 @@ models. It started as a hobby project including Danish, Swedish and Norwegian, b
  since grown to include 8+ European languages.

  EuroEval is maintained by [Dan Saattrup Nielsen](https://www.saattrupdan.com/) from the
- [Alexandra Institute](https://alexandra.dk) and [Kenneth
- Enevoldsen](https://www.kennethenevoldsen.com/) from [Aarhus University](https://au.dk),
- and is funded by the EU project [TrustLLM](https://trustllm.eu/).
+ [Alexandra Institute](https://alexandra.dk), and is funded by the EU project
+ [TrustLLM](https://trustllm.eu/).
@@ -44,18 +44,15 @@ install-rust:
  install-uv:
  @if [ "$(shell which uv)" = "" ]; then \
  curl -LsSf https://astral.sh/uv/install.sh | sh; \
- echo "Installed uv."; \
- else \
- echo "Updating uv..."; \
- uv self update; \
+ echo "Installed uv."; \
+ else \
+ echo "Updating uv..."; \
+ uv self update; \
  fi

  install-dependencies:
  @uv python install 3.11
  @uv sync --all-extras --python 3.11
- @if [ "${NO_FLASH_ATTN}" != "1" ] && [ $$(uname) != "Darwin" ]; then \
- uv pip install --no-build-isolation flash-attn>=2.7.0.post2; \
- fi

  setup-environment-variables:
  @uv run python src/scripts/fix_dot_env_file.py
@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "15.8.2"
+ version = "15.9.0"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [
@@ -46,7 +46,7 @@ dependencies = [
  generative = [
  "outlines>=0.1.11",
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
+ "vllm>=0.9.0; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
  ]
  human_evaluation = [
@@ -55,7 +55,7 @@ human_evaluation = [
  all = [
  "outlines>=0.1.11",
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
+ "vllm>=0.9.0; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
  "gradio>=4.26.0",
  ]
@@ -1,6 +1,5 @@
  """Factory class for creating dataset configurations."""

- import importlib.util
  import logging
  import sys
  import typing as t
@@ -13,7 +12,6 @@ from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
  from .tasks import SPEED, get_all_tasks
- from .utils import log_once

  if t.TYPE_CHECKING:
  from .data_models import Language, Task
@@ -38,7 +36,6 @@ def build_benchmark_config(
  force: bool,
  verbose: bool,
  trust_remote_code: bool,
- use_flash_attention: bool | None,
  clear_model_cache: bool,
  evaluate_test_split: bool,
  few_shot: bool,
@@ -92,9 +89,6 @@ def build_benchmark_config(
  automatically set if `debug` is True.
  trust_remote_code:
  Whether to trust remote code when running the benchmark.
- use_flash_attention:
- Whether to use Flash Attention for the models. If None then it will be used
- if it is available.
  clear_model_cache:
  Whether to clear the model cache before running the benchmark.
  evaluate_test_split:
@@ -135,30 +129,6 @@ def build_benchmark_config(

  torch_device = prepare_device(device=device)

- if use_flash_attention is None:
- if torch_device.type != "cuda":
- use_flash_attention = False
- elif (
- importlib.util.find_spec("flash_attn") is None
- and importlib.util.find_spec("vllm_flash_attn") is None
- ):
- use_flash_attention = False
- if first_time and torch_device.type == "cuda":
- message = (
- "Flash attention has not been installed, so this will not be used. "
- "To install it, run `pip install -U wheel && "
- "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
- "--no-build-isolation`. Alternatively, you can disable this "
- "message by setting "
- )
- if run_with_cli:
- message += "the flag `--no-use-flash-attention`."
- else:
- message += (
- "the argument `use_flash_attention=False` in the `Benchmarker`."
- )
- log_once(message=message, level=logging.INFO)
-
  # Set variable with number of iterations
  if hasattr(sys, "_called_from_test"):
  num_iterations = 1
@@ -178,7 +148,6 @@
  verbose=verbose or debug,
  device=torch_device,
  trust_remote_code=trust_remote_code,
- use_flash_attention=use_flash_attention,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
@@ -54,13 +54,11 @@ from ..enums import (
  TaskGroup,
  )
  from ..exceptions import (
- HuggingFaceHubDown,
  InvalidBenchmark,
  InvalidModel,
  NeedsAdditionalArgument,
  NeedsEnvironmentVariable,
  NeedsExtraInstalled,
- NoInternetConnection,
  )
  from ..languages import get_all_languages
  from ..task_group_utils import (
@@ -737,9 +735,10 @@ def get_model_repo_info(
  model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)

  # If the model does not exist locally, then we get the model info from the Hugging
- # Face Hub
+ # Face Hub, if possible
  if model_info is None:
  num_attempts = 3
+ errors: list[Exception] = list()
  for _ in range(num_attempts):
  try:
  model_info = hf_api.model_info(
@@ -749,25 +748,37 @@ def get_model_repo_info(
  except (GatedRepoError, LocalTokenNotFoundError) as e:
  try:
  hf_whoami(token=token)
- logger.warning(
+ logger.debug(
  f"Could not access the model {model_id} with the revision "
  f"{revision}. The error was {str(e)!r}."
  )
  return None
  except LocalTokenNotFoundError:
- raise NeedsAdditionalArgument(
- cli_argument="--api-key",
- script_argument="api_key=<your-api-key>",
- run_with_cli=benchmark_config.run_with_cli,
+ logger.debug(
+ f"Could not access the model {model_id} with the revision "
+ f"{revision}. The error was {str(e)!r}. Please set the "
+ "`HUGGINGFACE_API_KEY` environment variable or use the "
+ "`--api-key` argument."
  )
+ return None
  except (RepositoryNotFoundError, HFValidationError):
  return None
- except (OSError, RequestException):
+ except (OSError, RequestException) as e:
  if internet_connection_available():
+ errors.append(e)
  continue
- raise NoInternetConnection()
+ logger.debug(
+ "Could not access the Hugging Face Hub. Please check your internet "
+ "connection."
+ )
+ return None
  else:
- raise HuggingFaceHubDown()
+ logger.debug(
+ f"Could not access model info for the model {model_id!r} from the "
+ f"Hugging Face Hub, after {num_attempts} attempts. The errors "
+ f"encountered were {errors!r}."
+ )
+ return None

  # Get all the Hugging Face repository tags for the model. If the model is an adapter
  # model, then we also get the tags for the base model
@@ -836,7 +847,8 @@ def get_model_repo_info(
  "Skipping since the `only_allow_safetensors` argument is set "
  "to `True`."
  )
- raise InvalidModel(msg)
+ logger.warning(msg)
+ return None

  # Also check base model if we are evaluating an adapter
  if base_model_id is not None:
@@ -856,7 +868,8 @@ def get_model_repo_info(
  " Skipping since the `only_allow_safetensors` argument is set "
  "to `True`."
  )
- raise InvalidModel(msg)
+ logging.warning(msg)
+ return None

  return HFModelInfo(
  pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
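Note the pattern across the hf module hunks above: failures that used to raise (`NeedsAdditionalArgument`, `NoInternetConnection`, `HuggingFaceHubDown`, `InvalidModel`) now emit a debug or warning log and `return None`, so the caller treats the model as unavailable instead of aborting the run. A rough, self-contained sketch of that retry-and-collect-errors shape, with hypothetical names (`fetch_info` is not an EuroEval function):

```python
import logging

logger = logging.getLogger(__name__)


def get_info_or_none(fetch_info, num_attempts: int = 3):
    """Retry a flaky lookup; log the collected errors and return None instead of raising."""
    errors: list[Exception] = []
    for _ in range(num_attempts):
        try:
            return fetch_info()
        except OSError as error:  # stand-in for transient Hub/network errors
            errors.append(error)
    logger.debug(f"Giving up after {num_attempts} attempts; errors were {errors!r}")
    return None
```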
@@ -84,7 +84,12 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
  destroy_distributed_environment,
  destroy_model_parallel,
  )
+ from vllm.inputs import PromptType
  from vllm.lora.request import LoRARequest
+ from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
+ from vllm.pooling_params import PoolingParams
+ from vllm.prompt_adapter.request import PromptAdapterRequest
+ from vllm.sampling_params import RequestOutputKind

  if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
  from outlines.models.vllm import adapt_tokenizer
@@ -451,7 +456,9 @@ class VLLMModel(HuggingFaceEncoderModel):
  text=prompts,
  truncation=True,
  max_length=max(
- self._tokenizer.model_max_length - max_tokens, 0
+ min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+ - max_tokens,
+ 0,
  ),
  )
  prompts = self._tokenizer.batch_decode(
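The hunk above changes the truncation bound from `model_max_length - max_tokens` to `min(model_max_length, MAX_CONTEXT_LENGTH) - max_tokens`, which is what enforces the 5,000-token cap mentioned in the changelog. A quick numerical illustration with made-up values (the tokenizer limit and generation budget below are not EuroEval's actual numbers):

```python
MAX_CONTEXT_LENGTH = 5_000   # EuroEval's cap, per the changelog above
model_max_length = 131_072   # illustrative tokenizer limit
max_tokens = 256             # illustrative generation budget

old_bound = max(model_max_length - max_tokens, 0)                           # 130816
new_bound = max(min(model_max_length, MAX_CONTEXT_LENGTH) - max_tokens, 0)  # 4744
```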
@@ -491,8 +498,19 @@ class VLLMModel(HuggingFaceEncoderModel):
  output.outputs[0].token_ids for output in raw_outputs
  ]
  if self.end_of_reasoning_token_id in completion_ids[0]:
+ # Find the latest index of the end of reasoning token and slice
+ # the token IDs to only include the tokens after it
  completion_ids = [
- token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
+ token_ids[
+ max(
+ [
+ i
+ for i, x in enumerate(token_ids)
+ if x == self.end_of_reasoning_token_id
+ ]
+ )
+ + 1 :
+ ]
  if self.end_of_reasoning_token_id in token_ids
  else token_ids
  for token_ids in completion_ids
@@ -814,6 +832,9 @@ def load_model_and_tokenizer(
  )

  model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
+ model._validate_and_add_requests = MethodType(
+ _validate_and_add_requests_with_fixed_progress_bars, model
+ )
  model.config = hf_model_config

  return model, tokenizer
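Both helpers are attached to the already-constructed vLLM `LLM` instance with `types.MethodType`, which binds a plain function as a bound method of that one object while leaving the class itself untouched. A generic sketch of the technique (the class and method names below are illustrative, not vLLM's API):

```python
from types import MethodType


class Engine:
    def run(self) -> str:
        return "original"


def patched_run(self: "Engine") -> str:
    # `self` is bound to the specific instance, exactly like a normal method.
    return "patched"


engine = Engine()
engine.run = MethodType(patched_run, engine)  # overrides this instance only
assert engine.run() == "patched"
assert Engine().run() == "original"  # other instances keep the original behaviour
```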
@@ -934,6 +955,53 @@ def _run_engine_with_fixed_progress_bars(
  return outputs


+ def _validate_and_add_requests_with_fixed_progress_bars(
+ self: "LLM",
+ prompts: "PromptType | c.Sequence[PromptType]",
+ params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]", # noqa: E501
+ *,
+ use_tqdm: bool,
+ lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
+ prompt_adapter_request: "PromptAdapterRequest | None",
+ tokenization_kwargs: dict[str, t.Any] | None = None,
+ guided_options: "GuidedDecodingRequest | None" = None,
+ priority: list[int] | None = None,
+ ) -> None:
+ if isinstance(prompts, (str, dict)):
+ # Convert a single prompt to a list.
+ prompts = [prompts]
+
+ num_requests = len(prompts)
+ if isinstance(params, list) and len(params) != num_requests:
+ raise ValueError("The lengths of prompts and params must be the same.")
+ if isinstance(lora_request, list) and len(lora_request) != num_requests:
+ raise ValueError("The lengths of prompts and lora_request must be the same.")
+
+ for sp in params if isinstance(params, list) else (params,):
+ if isinstance(sp, SamplingParams):
+ self._add_guided_params(sp, guided_options)
+
+ # We only care about the final output
+ sp.output_kind = RequestOutputKind.FINAL_ONLY
+
+ # Add requests to the engine.
+ it = prompts
+ if use_tqdm:
+ it = tqdm(it, desc="Adding requests", leave=False)
+
+ for i, prompt in enumerate(it):
+ self._add_request(
+ prompt,
+ params[i] if isinstance(params, c.Sequence) else params,
+ tokenization_kwargs=tokenization_kwargs,
+ lora_request=lora_request[i]
+ if isinstance(lora_request, c.Sequence)
+ else lora_request,
+ prompt_adapter_request=prompt_adapter_request,
+ priority=priority[i] if priority else 0,
+ )
+
+
  def clear_vllm() -> None:
  """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
  with contextlib.suppress(ValueError):
@@ -72,7 +72,6 @@ class Benchmarker:
  force: bool = False,
  verbose: bool = False,
  trust_remote_code: bool = False,
- use_flash_attention: bool | None = None,
  clear_model_cache: bool = False,
  evaluate_test_split: bool = False,
  few_shot: bool = True,
@@ -129,9 +128,6 @@ class Benchmarker:
  `debug` is True. Defaults to False.
  trust_remote_code:
  Whether to trust remote code when loading models. Defaults to False.
- use_flash_attention:
- Whether to use Flash Attention. If None then it will be used if it is
- installed and the model is a decoder model. Defaults to None.
  clear_model_cache:
  Whether to clear the model cache after benchmarking each model.
  Defaults to False.
@@ -190,7 +186,6 @@ class Benchmarker:
  force=force,
  verbose=verbose,
  trust_remote_code=trust_remote_code,
- use_flash_attention=use_flash_attention,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
@@ -243,7 +238,6 @@ class Benchmarker:
  force: bool | None = None,
  verbose: bool | None = None,
  trust_remote_code: bool | None = None,
- use_flash_attention: bool | None = None,
  clear_model_cache: bool | None = None,
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
@@ -311,9 +305,6 @@ class Benchmarker:
  trust_remote_code:
  Whether to trust remote code when loading models. Defaults to the value
  specified when initialising the benchmarker.
- use_flash_attention:
- Whether to use Flash Attention. Defaults to the value specified when
- initialising the benchmarker.
  clear_model_cache:
  Whether to clear the model cache after benchmarking each model. Defaults
  to the value specified when initialising the benchmarker.
@@ -359,7 +350,6 @@ class Benchmarker:
  force=force,
  verbose=verbose,
  trust_remote_code=trust_remote_code,
- use_flash_attention=use_flash_attention,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
@@ -531,7 +521,6 @@ class Benchmarker:
  force: bool | None = None,
  verbose: bool | None = None,
  trust_remote_code: bool | None = None,
- use_flash_attention: bool | None | None = None,
  clear_model_cache: bool | None = None,
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
@@ -590,9 +579,6 @@ class Benchmarker:
  trust_remote_code:
  Whether to trust remote code when loading models. If None, then this
  value will not be updated.
- use_flash_attention:
- Whether to use Flash Attention. If None, then this value will not be
- updated.
  clear_model_cache:
  Whether to clear the model cache after benchmarking each model. If None,
  then this value will not be updated.
@@ -658,8 +644,6 @@ class Benchmarker:
  benchmark_config_params.verbose = verbose
  if trust_remote_code is not None:
  benchmark_config_params.trust_remote_code = trust_remote_code
- if use_flash_attention is not None:
- benchmark_config_params.use_flash_attention = use_flash_attention
  if clear_model_cache is not None:
  benchmark_config_params.clear_model_cache = clear_model_cache
  if evaluate_test_split is not None:
@@ -863,7 +847,6 @@ class Benchmarker:
  force: bool | None = None,
  verbose: bool | None = None,
  trust_remote_code: bool | None = None,
- use_flash_attention: bool | None = None,
  clear_model_cache: bool | None = None,
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
@@ -931,9 +914,6 @@ class Benchmarker:
  trust_remote_code:
  Whether to trust remote code when loading models. Defaults to the value
  specified when initialising the benchmarker.
- use_flash_attention:
- Whether to use Flash Attention. Defaults to the value specified when
- initialising the benchmarker.
  clear_model_cache:
  Whether to clear the model cache after benchmarking each model. Defaults
  to the value specified when initialising the benchmarker.
@@ -981,7 +961,6 @@ class Benchmarker:
  force=force,
  verbose=verbose,
  trust_remote_code=trust_remote_code,
- use_flash_attention=use_flash_attention,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
@@ -141,14 +141,6 @@ from .tasks import get_all_tasks
  help="""Whether to trust remote code. Only set this flag if you trust the supplier
  of the model.""",
  )
- @click.option(
- "--use-flash-attention/--no-use-flash-attention",
- default=None,
- show_default=True,
- help="""Whether to use Flash Attention. If not specified then the model will use
- Flash Attention for generative models if a CUDA GPU is available and `flash-attn`
- or `vllm-flash-attn` are installed.""",
- )
  @click.option(
  "--clear-model-cache/--no-clear-model-cache",
  default=False,
@@ -225,7 +217,6 @@ def benchmark(
  verbose: bool,
  device: str | None,
  trust_remote_code: bool,
- use_flash_attention: bool | None,
  clear_model_cache: bool,
  evaluate_test_split: bool,
  few_shot: bool,
@@ -261,7 +252,6 @@ def benchmark(
  cache_dir=cache_dir,
  device=device,
  trust_remote_code=trust_remote_code,
- use_flash_attention=use_flash_attention,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
@@ -191,9 +191,6 @@ class BenchmarkConfig:
  Whether to print verbose output.
  trust_remote_code:
  Whether to trust remote code when loading models from the Hugging Face Hub.
- use_flash_attention:
- Whether to use Flash Attention. If None then this will be used for
- generative models.
  clear_model_cache:
  Whether to clear the model cache after benchmarking each model.
  evaluate_test_split:
@@ -231,7 +228,6 @@ class BenchmarkConfig:
  device: torch.device
  verbose: bool
  trust_remote_code: bool
- use_flash_attention: bool | None
  clear_model_cache: bool
  evaluate_test_split: bool
  few_shot: bool
@@ -263,7 +259,6 @@ class BenchmarkConfigParams(pydantic.BaseModel):
  force: bool
  verbose: bool
  trust_remote_code: bool
- use_flash_attention: bool | None
  clear_model_cache: bool
  evaluate_test_split: bool
  few_shot: bool
@@ -81,28 +81,6 @@ class NaNValueInModelOutput(Exception):
  super().__init__(self.message)


- class FlashAttentionNotInstalled(Exception):
- """The `flash-attn` package has not been installed."""
-
- def __init__(
- self,
- message: str = (
- "The model you are trying to load requires Flash Attention. To use Flash "
- "Attention, please install the `flash-attn` package, which can be done by "
- "running `pip install -U wheel && FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE "
- "pip install flash-attn --no-build-isolation`."
- ),
- ) -> None:
- """Initialise the exception.
-
- Args:
- message:
- The message to display.
- """
- self.message = message
- super().__init__(self.message)
-
-
  class NeedsExtraInstalled(InvalidModel):
  """The evaluation requires extra to be installed."""

@@ -263,7 +263,6 @@ class HumanEvaluator:
  force=False,
  verbose=False,
  trust_remote_code=False,
- use_flash_attention=None,
  clear_model_cache=False,
  evaluate_test_split=False,
  few_shot=True,
@@ -80,7 +80,6 @@ def benchmark_config(
  device=device,
  verbose=False,
  trust_remote_code=True,
- use_flash_attention=False,
  clear_model_cache=False,
  evaluate_test_split=False,
  few_shot=True,