EuroEval 15.15.0.tar.gz → 15.16.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (252)
  1. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/bug.yaml +1 -1
  2. {euroeval-15.15.0 → euroeval-15.16.0}/.github/workflows/ci.yaml +4 -2
  3. {euroeval-15.15.0 → euroeval-15.16.0}/.pre-commit-config.yaml +2 -2
  4. {euroeval-15.15.0 → euroeval-15.16.0}/CHANGELOG.md +17 -0
  5. {euroeval-15.15.0 → euroeval-15.16.0}/PKG-INFO +3 -2
  6. {euroeval-15.15.0 → euroeval-15.16.0}/README.md +1 -0
  7. {euroeval-15.15.0 → euroeval-15.16.0}/pyproject.toml +2 -2
  8. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/litellm.py +155 -105
  9. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/vllm.py +10 -3
  10. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmarker.py +10 -11
  11. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/finetuning.py +2 -1
  12. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/metrics.py +6 -4
  13. {euroeval-15.15.0 → euroeval-15.16.0}/uv.lock +5 -5
  14. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  15. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  16. {euroeval-15.15.0 → euroeval-15.16.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  17. {euroeval-15.15.0 → euroeval-15.16.0}/.gitignore +0 -0
  18. {euroeval-15.15.0 → euroeval-15.16.0}/CITATION.cff +0 -0
  19. {euroeval-15.15.0 → euroeval-15.16.0}/CODE_OF_CONDUCT.md +0 -0
  20. {euroeval-15.15.0 → euroeval-15.16.0}/CONTRIBUTING.md +0 -0
  21. {euroeval-15.15.0 → euroeval-15.16.0}/Dockerfile.cuda +0 -0
  22. {euroeval-15.15.0 → euroeval-15.16.0}/LICENSE +0 -0
  23. {euroeval-15.15.0 → euroeval-15.16.0}/NEW_DATASET_GUIDE.md +0 -0
  24. {euroeval-15.15.0 → euroeval-15.16.0}/docs/CNAME +0 -0
  25. {euroeval-15.15.0 → euroeval-15.16.0}/docs/README.md +0 -0
  26. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/README.md +0 -0
  27. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/danish.md +0 -0
  28. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/dutch.md +0 -0
  29. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/english.md +0 -0
  30. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/faroese.md +0 -0
  31. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/finnish.md +0 -0
  32. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/french.md +0 -0
  33. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/german.md +0 -0
  34. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/icelandic.md +0 -0
  35. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/italian.md +0 -0
  36. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/norwegian.md +0 -0
  37. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/portuguese.md +0 -0
  38. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/spanish.md +0 -0
  39. {euroeval-15.15.0 → euroeval-15.16.0}/docs/datasets/swedish.md +0 -0
  40. {euroeval-15.15.0 → euroeval-15.16.0}/docs/extras/radial_plotter.md +0 -0
  41. {euroeval-15.15.0 → euroeval-15.16.0}/docs/faq.md +0 -0
  42. {euroeval-15.15.0 → euroeval-15.16.0}/docs/gfx/favicon.png +0 -0
  43. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  44. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  45. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/english.md +0 -0
  46. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  47. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  48. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/french.md +0 -0
  49. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/german.md +0 -0
  50. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  51. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  52. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  53. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  54. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  55. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/european.md +0 -0
  56. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  57. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  58. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  59. {euroeval-15.15.0 → euroeval-15.16.0}/docs/leaderboards/README.md +0 -0
  60. {euroeval-15.15.0 → euroeval-15.16.0}/docs/methodology.md +0 -0
  61. {euroeval-15.15.0 → euroeval-15.16.0}/docs/python-package.md +0 -0
  62. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/README.md +0 -0
  63. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/common-sense-reasoning.md +0 -0
  64. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/knowledge.md +0 -0
  65. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/linguistic-acceptability.md +0 -0
  66. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/named-entity-recognition.md +0 -0
  67. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/reading-comprehension.md +0 -0
  68. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/sentiment-classification.md +0 -0
  69. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/speed.md +0 -0
  70. {euroeval-15.15.0 → euroeval-15.16.0}/docs/tasks/summarization.md +0 -0
  71. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.png +0 -0
  72. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/euroeval.xcf +0 -0
  73. {euroeval-15.15.0 → euroeval-15.16.0}/gfx/scandeval.png +0 -0
  74. {euroeval-15.15.0 → euroeval-15.16.0}/makefile +0 -0
  75. {euroeval-15.15.0 → euroeval-15.16.0}/mkdocs.yaml +0 -0
  76. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/__init__.py +0 -0
  77. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_config_factory.py +0 -0
  78. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  79. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/base.py +0 -0
  80. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
  81. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/benchmark_modules/hf.py +0 -0
  82. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/callbacks.py +0 -0
  83. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/cli.py +0 -0
  84. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/constants.py +0 -0
  85. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_loading.py +0 -0
  86. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/data_models.py +0 -0
  87. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/__init__.py +0 -0
  88. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/danish.py +0 -0
  89. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/dutch.py +0 -0
  90. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/english.py +0 -0
  91. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/faroese.py +0 -0
  92. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/finnish.py +0 -0
  93. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/french.py +0 -0
  94. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/german.py +0 -0
  95. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
  96. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/italian.py +0 -0
  97. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
  98. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/portuguese.py +0 -0
  99. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/spanish.py +0 -0
  100. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/dataset_configs/swedish.py +0 -0
  101. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/enums.py +0 -0
  102. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/exceptions.py +0 -0
  103. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation.py +0 -0
  104. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/generation_utils.py +0 -0
  105. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/human_evaluation.py +0 -0
  106. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/languages.py +0 -0
  107. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_cache.py +0 -0
  108. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_config.py +0 -0
  109. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/model_loading.py +0 -0
  110. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/__init__.py +0 -0
  111. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  112. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  113. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  114. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  115. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  116. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/prompt_templates/summarization.py +0 -0
  117. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/scores.py +0 -0
  118. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/speed_benchmark.py +0 -0
  119. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/__init__.py +0 -0
  120. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  121. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
  122. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  123. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  124. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
  125. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tasks.py +0 -0
  126. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/tokenization_utils.py +0 -0
  127. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/types.py +0 -0
  128. {euroeval-15.15.0 → euroeval-15.16.0}/src/euroeval/utils.py +0 -0
  129. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/constants.py +0 -0
  130. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_allocine.py +0 -0
  131. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_angry_tweets.py +0 -0
  132. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc.py +0 -0
  133. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_arc_is.py +0 -0
  134. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_belebele.py +0 -0
  135. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_boolq_pt.py +0 -0
  136. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_cnn_dailymail.py +0 -0
  137. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_en.py +0 -0
  138. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_es.py +0 -0
  139. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_conll_nl.py +0 -0
  140. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dane.py +0 -0
  141. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  142. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dansk.py +0 -0
  143. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader.py +0 -0
  144. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  145. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dbrd.py +0 -0
  146. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_dutch_cola.py +0 -0
  147. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_eltec.py +0 -0
  148. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fone.py +0 -0
  149. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_foqa.py +0 -0
  150. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fosent.py +0 -0
  151. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_fquad.py +0 -0
  152. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germanquad.py +0 -0
  153. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_germeval.py +0 -0
  154. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_goldenswag.py +0 -0
  155. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_harem.py +0 -0
  156. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag.py +0 -0
  157. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hellaswag_fi.py +0 -0
  158. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  159. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ice_linguistic.py +0 -0
  160. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  161. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  162. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icelandic_qa.py +0 -0
  163. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_icesum.py +0 -0
  164. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_idioms_no.py +0 -0
  165. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_ilpost_sum.py +0 -0
  166. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_jentoft.py +0 -0
  167. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_life_in_the_uk.py +0 -0
  168. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mim_gold_ner.py +0 -0
  169. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlqa_es.py +0 -0
  170. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_de.py +0 -0
  171. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mlsum_es.py +0 -0
  172. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_mmlu.py +0 -0
  173. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multi_wiki_qa.py +0 -0
  174. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_multinerd-it.py +0 -0
  175. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_cola.py +0 -0
  176. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_no_sammendrag.py +0 -0
  177. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  178. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nordjylland_news.py +0 -0
  179. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norec.py +0 -0
  180. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multiqa.py +0 -0
  181. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norglm_multisum.py +0 -0
  182. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norne.py +0 -0
  183. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_norquad.py +0 -0
  184. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nqii.py +0 -0
  185. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  186. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_orange_sum.py +0 -0
  187. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_personal_sum.py +0 -0
  188. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_publico.py +0 -0
  189. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_rrn.py +0 -0
  190. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sb10k.py +0 -0
  191. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scala.py +0 -0
  192. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandiqa.py +0 -0
  193. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_scandisent_fi.py +0 -0
  194. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_schibsted.py +0 -0
  195. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  196. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sentipolc16.py +0 -0
  197. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad.py +0 -0
  198. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_it.py +0 -0
  199. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl.py +0 -0
  200. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_squad_nl_old.py +0 -0
  201. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst2_pt.py +0 -0
  202. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_sst5.py +0 -0
  203. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_suc3.py +0 -0
  204. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swedn.py +0 -0
  205. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_swerec.py +0 -0
  206. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_turku_ner_fi.py +0 -0
  207. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_tydiqa_fi.py +0 -0
  208. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  209. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikiann_fo.py +0 -0
  210. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_wikineural-it.py +0 -0
  211. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_winogrande_is.py +0 -0
  212. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xlsum_fi.py +0 -0
  213. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/create_xquad_es.py +0 -0
  214. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/fix_dot_env_file.py +0 -0
  215. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/load_ud_pos.py +0 -0
  216. {euroeval-15.15.0 → euroeval-15.16.0}/src/scripts/versioning.py +0 -0
  217. {euroeval-15.15.0 → euroeval-15.16.0}/tests/__init__.py +0 -0
  218. {euroeval-15.15.0 → euroeval-15.16.0}/tests/conftest.py +0 -0
  219. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_config_factory.py +0 -0
  220. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/__init__.py +0 -0
  221. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_base.py +0 -0
  222. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  223. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  224. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  225. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  226. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_benchmarker.py +0 -0
  227. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_callbacks.py +0 -0
  228. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_cli.py +0 -0
  229. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_constants.py +0 -0
  230. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_loading.py +0 -0
  231. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_data_models.py +0 -0
  232. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_dataset_configs.py +0 -0
  233. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_enums.py +0 -0
  234. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_exceptions.py +0 -0
  235. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_finetuning.py +0 -0
  236. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_generation.py +0 -0
  237. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_human_evaluation.py +0 -0
  238. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_languages.py +0 -0
  239. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_cache.py +0 -0
  240. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_config.py +0 -0
  241. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_model_loading.py +0 -0
  242. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_scores.py +0 -0
  243. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_speed_benchmark.py +0 -0
  244. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/__init__.py +0 -0
  245. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_question_answering.py +0 -0
  246. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  247. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  248. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_task_utils/test_token_classification.py +0 -0
  249. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tasks.py +0 -0
  250. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_tokenization_utils.py +0 -0
  251. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_types.py +0 -0
  252. {euroeval-15.15.0 → euroeval-15.16.0}/tests/test_utils.py +0 -0
.github/ISSUE_TEMPLATE/bug.yaml
@@ -55,7 +55,7 @@ body:
     attributes:
       label: EuroEval version
       description: What version of EuroEval are you using?
-      placeholder: Output of `pip list | grep EuroEval`
+      placeholder: Output of `pip list | grep euroeval`
     validations:
       required: true
   - type: input
.github/workflows/ci.yaml
@@ -57,7 +57,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
@@ -66,6 +66,8 @@ jobs:
           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}

       - name: Delete EuroEval cache
         run: rm -rf .euroeval_cache
@@ -88,7 +90,7 @@ jobs:
         run: uv sync --no-dev --extra test

       - name: Start Ollama server
-        run: curl -fsSL https://ollama.com/install.sh | sh
+        run: curl -fsSL https://ollama.com/install.sh | sh && ollama serve &

       - name: Test with pytest
         run: uv run pytest
.pre-commit-config.yaml
@@ -4,13 +4,13 @@ repos:
     hooks:
       - id: python-use-type-annotations
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v6.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.7
+    rev: v0.12.8
    hooks:
      - id: ruff
        args:
CHANGELOG.md
@@ -10,6 +10,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+## [v15.16.0] - 2025-08-12
+### Added
+- Added metadata for GPT-5 models.
+
+### Changed
+- Updated `transformers` dependency to `>=4.55.0`.
+
+### Fixed
+- If the model uses 'mxfp4' quantisation then we allow the dtype to be bfloat16, rather
+  than forcing float16. This caused issues with the new GPT-OSS models.
+- Prevent multiple `Model <model-id> does not exist` logs when evaluating a model
+  that does not exist - now only logs this once.
+- Cleaner error message when attempting to benchmark a generative model without having a
+  GPU available.
+- Now raises error if an inference API is used with a parameter that is not supported.
+
+
 ## [v15.15.0] - 2025-08-06
 ### Added
 - Added the common-sense reasoning dataset GoldenSwag for the following
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.15.0
+Version: 15.16.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -56,7 +56,7 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.51.0
+Requires-Dist: transformers>=4.55.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
@@ -233,6 +233,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
README.md
@@ -159,6 +159,7 @@ A huge thank you to all the contributors who have helped make this project a suc
 <a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
 <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
 <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
+<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>


 ### Contribute to EuroEval
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.15.0"
+version = "15.16.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -14,7 +14,7 @@ dependencies = [
     "torch>=2.6.0",
     "pandas>=2.2.0",
     "numpy>=1.23.0,<2.0.0",
-    "transformers>=4.51.0",
+    "transformers>=4.55.0",
     "accelerate>=1.9.0",
     "evaluate>=0.4.1",
     "datasets>=3.5.0",
src/euroeval/benchmark_modules/litellm.py
@@ -6,7 +6,7 @@ import logging
 import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep

 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
     RateLimitError,
     ServiceUnavailableError,
     Timeout,
+    UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -87,6 +88,7 @@ logger = logging.getLogger("euroeval")

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +107,7 @@ VOCAB_SIZE_MAPPING = {

 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +132,7 @@ MODEL_MAX_LENGTH_MAPPING = {

 NUM_PARAMS_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -144,6 +148,7 @@ NUM_PARAMS_MAPPING = {

 ALLOWED_PARAMS = {
     # OpenAI models
+    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
     r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -269,28 +274,9 @@ class LiteLLMModel(BenchmarkModule):
             generative_type=self.generative_type,
         )

-        # Set the core generation arguments
-        generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
-            max_completion_tokens=(
-                REASONING_MAX_TOKENS
-                if self.generative_type == GenerativeType.REASONING
-                else self.dataset_config.max_generated_tokens
-            ),
-            stop=[],
-            temperature=0.0,
-            seed=4242,
-            api_key=self.benchmark_config.api_key,
-            api_base=self.benchmark_config.api_base,
-            api_version=self.benchmark_config.api_version,
-            max_retries=3,
-        )
-
-        # Set up the `response_format` generation argument if we are dealing with a task
-        # using structured generation
+        # Sanity check that "JSON" is included in the prompt, as some models require
+        # this
         if self.dataset_config.task in TASKS_USING_JSON:
-            # Sanity check that "JSON" is included in the prompt, as some models require
-            # this
             for conversation in conversations:
                 if not conversation:
                     raise InvalidBenchmark(
@@ -310,87 +296,6 @@ class LiteLLMModel(BenchmarkModule):
                         "Prompt must contain 'json' for JSON tasks."
                     )

-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-                generation_kwargs["response_format"] = pydantic_class
-                log_once(
-                    "Enabling structured generation for model "
-                    f"{self.model_config.model_id!r} with the JSON schema "
-                    f"{pydantic_class.model_json_schema()}",
-                    level=logging.DEBUG,
-                )
-            else:
-                generation_kwargs["response_format"] = dict(type="json_object")
-                log_once(
-                    "Enabling structured JSON generation for model "
-                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
-                    "the model does not support schemas.",
-                    level=logging.DEBUG,
-                )
-
-        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
-        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-            generation_kwargs["think"] = True
-            log_once(
-                "Enabling thinking mode for Ollama model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Handle manually set parameters
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-        if self.model_config.revision == "thinking":
-            generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-            )
-            log_once(
-                f"Enabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(budget_tokens=0)
-            log_once(
-                f"Disabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision in {"low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
-            log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Drop generation kwargs that are not supported by the model
-        litellm.drop_params = True
-
-        # First attempt is a test run with a single conversation to handle errors
-        # quickly
-        test_conversation = conversations[0]
-        _, failures = safe_run(
-            self._generate_async(
-                model_id=self.model_config.model_id,
-                conversations=[test_conversation],
-                **generation_kwargs,
-            )
-        )
-        for _, error in failures:
-            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -404,7 +309,7 @@ class LiteLLMModel(BenchmarkModule):
                 self._generate_async(
                     model_id=self.model_config.model_id,
                     conversations=list(batch_conversations),
-                    **generation_kwargs,
+                    **self.get_generation_kwargs(dataset_config=self.dataset_config),
                 )
             )

@@ -431,7 +336,12 @@
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+                self._handle_exception(
+                    error=error,
+                    generation_kwargs=self.get_generation_kwargs(
+                        dataset_config=self.dataset_config
+                    ),
+                )

             # Sleep for a second to avoid pinging the API server too quickly
             sleep(1)
@@ -484,6 +394,7 @@
             "`temperature` may only be set to 1",
             "'temperature' does not support 0.0 with this model. Only the default "
             "(1) value is supported",
+            "Only temperature=1 is supported",
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
@@ -593,6 +504,20 @@
             )
             sleep(5)
             return
+        elif isinstance(error, UnsupportedParamsError):
+            unsupported_param_match = re.search(
+                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+                string=error.message,
+            )
+            if unsupported_param_match is None:
+                raise InvalidModel(error.message)
+            else:
+                unsupported_param = unsupported_param_match.group(0)
+                raise InvalidModel(
+                    f"The model {model_id!r} does not support the parameter "
+                    f"{unsupported_param!r}. Try again without this parameter. "
+                    "Skipping this model."
+                )
         elif isinstance(error, (APIConnectionError, OSError)):
             # If there are too many I/O connections, we increase the number of allowed
             # file descriptors
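The new UnsupportedParamsError branch above pulls the offending parameter name out of LiteLLM's error message with a lookbehind/lookahead pattern. A minimal sketch of how that extraction behaves; the sample message below is only shaped like the one the pattern expects, not LiteLLM's exact wording. The litellm.py diff continues below.

import re

# Illustrative error text (an assumption about the message shape, not LiteLLM's exact string).
message = (
    "openai does not support parameters: ['logprobs'], for model=gpt-5-mini. "
    "To drop these, set `litellm.drop_params=True`."
)

# Same pattern as in the diff: anchored between "does not support parameters: ['" and "']".
pattern = r"(?<=does not support parameters\: \[')([^ ']+)(?='\])"

match = re.search(pattern=pattern, string=message)
print(match.group(0) if match else None)  # logprobs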
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -1233,6 +1158,126 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

+    @cache
+    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+        """Get the generation arguments for the model.
+
+        Args:
+            dataset_config:
+                The dataset configuration, which is used to determine the generative
+                type of the model. We use this as an argument here rather than using
+                `self.dataset_config` to ensure that that the cache is updated when the
+                dataset configuration changes.
+
+        Returns:
+            The generation arguments for the model.
+        """
+        # Set the core generation arguments
+        generation_kwargs: dict[str, t.Any] = dict(
+            model=self.model_config.model_id,
+            max_completion_tokens=(
+                REASONING_MAX_TOKENS
+                if self.generative_type == GenerativeType.REASONING
+                else dataset_config.max_generated_tokens
+            ),
+            stop=[],
+            temperature=0.0,
+            seed=4242,
+            api_key=self.benchmark_config.api_key,
+            api_base=self.benchmark_config.api_base,
+            api_version=self.benchmark_config.api_version,
+            max_retries=3,
+        )
+
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
+        if dataset_config.task in TASKS_USING_JSON:
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif supports_response_schema(model=self.model_config.model_id):
+                ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
+
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly. We repeat this multiple times to deal with different types of
+        # errors, and stop if we get a successful response.
+        test_conversation = [
+            litellm.ChatCompletionUserMessage(role="user", content="Test message")
+        ]
+        for _ in range(5):
+            _, failures = safe_run(
+                self._generate_async(
+                    model_id=self.model_config.model_id,
+                    conversations=[test_conversation],
+                    **generation_kwargs,
+                )
+            )
+            if not failures:
+                break
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
+        return generation_kwargs
+

 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
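Worth noting about the new get_generation_kwargs method: functools.cache memoises per argument tuple, so passing the dataset config explicitly (instead of reading self.dataset_config) yields one cache entry per dataset configuration, and the one-off test request only runs once per dataset. A minimal sketch of that caching behaviour with stand-in classes (Config and Model below are illustrative, not EuroEval's):

from dataclasses import dataclass
from functools import cache


@dataclass(frozen=True)  # frozen dataclasses are hashable, so instances can be cache keys
class Config:
    name: str
    max_tokens: int


class Model:
    @cache
    def get_kwargs(self, config: Config) -> dict:
        print(f"building kwargs for {config.name}")  # runs once per (self, config) pair
        return {"max_completion_tokens": config.max_tokens}


model = Model()
cfg = Config(name="sentiment", max_tokens=32)
model.get_kwargs(cfg)                                # prints: building kwargs for sentiment
model.get_kwargs(cfg)                                # cached: no print
model.get_kwargs(Config(name="ner", max_tokens=64))  # different config: rebuilds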
src/euroeval/benchmark_modules/litellm.py (continued)
@@ -1264,6 +1309,11 @@ def raise_if_wrong_params(
                 msg += " No parameters are allowed."
                 raise InvalidModel(msg)
             return
+    else:
+        raise InvalidModel(
+            f"The parameter {param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )


 def try_download_ollama_model(model_id: str) -> bool:
src/euroeval/benchmark_modules/vllm.py
@@ -168,7 +168,8 @@ class VLLMModel(HuggingFaceEncoderModel):

     def __del__(self) -> None:
         """Clean up the model and tokenizer."""
-        clear_vllm()
+        if importlib.util.find_spec("vllm") is not None:
+            clear_vllm()
         if hasattr(self, "_model"):
             del self._model
         if hasattr(self, "_tokenizer"):
@@ -690,8 +691,14 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16

-    # If the model is a quantized model, we need to set the dtype to float16
-    if quantization is not None and hf_model_config.torch_dtype != torch.float16:
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        logger.debug(
+            "You are loading a quantized model where `torch_dtype` has not been set. "
+            f"Setting dtype to {dtype!r}."
+        )
+    elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
src/euroeval/benchmarker.py
@@ -379,7 +379,16 @@

         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            model_config: ModelConfig | None = None
+            # Load the model configuration, or skip the model if it is invalid
+            try:
+                model_config = get_model_config(
+                    model_id=model_id, benchmark_config=benchmark_config
+                )
+            except InvalidModel as e:
+                logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
+                continue
+
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -399,16 +408,6 @@
                     num_finished_benchmarks += 1
                     continue

-                if model_config is None:
-                    try:
-                        model_config = get_model_config(
-                            model_id=model_id, benchmark_config=benchmark_config
-                        )
-                    except InvalidModel as e:
-                        logger.info(e.message)
-                        num_finished_benchmarks += len(dataset_configs)
-                        continue
-
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
src/euroeval/finetuning.py
@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial

 import torch
 from tqdm.auto import tqdm
@@ -198,7 +199,7 @@ def finetune_single_iteration(
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
-        compute_metrics=model.compute_metrics,
+        compute_metrics=partial(model.compute_metrics, dataset=None),
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
         preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
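The partial(model.compute_metrics, dataset=None) change works together with the "Dataset | None" signatures in the metrics.py diff below: the Hugging Face Trainer calls compute_metrics with a single predictions argument, so the extra dataset parameter is pre-bound to None. A minimal sketch of the pattern, using an illustrative metric function rather than EuroEval's:

from functools import partial

def compute_metrics(eval_pred: tuple, dataset: object | None = None) -> dict:
    """Illustrative metric function that can optionally look at the raw dataset."""
    predictions, references = eval_pred
    accuracy = sum(p == r for p, r in zip(predictions, references)) / len(references)
    return {"accuracy": accuracy}

# Pre-bind dataset=None so the resulting callable takes only the predictions argument,
# matching the single-argument interface the trainer expects.
metrics_fn = partial(compute_metrics, dataset=None)
print(metrics_fn(([1, 0, 1], [1, 1, 1])))  # {'accuracy': 0.666...}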
src/euroeval/metrics.py
@@ -51,7 +51,7 @@ class Metric(abc.ABC):

     @abc.abstractmethod
     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -132,7 +132,7 @@ class HuggingFaceMetric(Metric):
         self.metric: "EvaluationModule | None" = None

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score.

@@ -225,7 +225,7 @@ class LLMAsAJudgeMetric(Metric):
         self.system_prompt = system_prompt

     def __call__(
-        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+        self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset | None"
     ) -> float | None:
         """Calculate the metric score using the judge model.

@@ -359,7 +359,9 @@ class SpeedMetric(Metric):
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
         )

-    def __call__(self, _: t.Sequence, __: t.Sequence, ___: "Dataset") -> float | None:
+    def __call__(
+        self, _: t.Sequence, __: t.Sequence, ___: "Dataset | None"
+    ) -> float | None:
         """Not used with the speed metric, but required for consistency."""
         raise NotImplementedError

uv.lock
@@ -1123,7 +1123,7 @@ wheels = [

 [[package]]
 name = "euroeval"
-version = "15.15.0"
+version = "15.16.0"
 source = { editable = "." }
 dependencies = [
     { name = "accelerate" },
@@ -1246,7 +1246,7 @@ requires-dist = [
     { name = "tenacity", specifier = ">=9.0.0" },
     { name = "termcolor", specifier = ">=2.0.0" },
     { name = "torch", specifier = ">=2.6.0" },
-    { name = "transformers", specifier = ">=4.51.0" },
+    { name = "transformers", specifier = ">=4.55.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.10.0" },
     { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.10.0" },
 ]
@@ -5376,7 +5376,7 @@ wheels = [

 [[package]]
 name = "transformers"
-version = "4.54.1"
+version = "4.55.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -5390,9 +5390,9 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/21/6c/4caeb57926f91d943f309b062e22ad1eb24a9f530421c5a65c1d89378a7a/transformers-4.54.1.tar.gz", hash = "sha256:b2551bb97903f13bd90c9467d0a144d41ca4d142defc044a99502bb77c5c1052", size = 9514288, upload-time = "2025-07-29T15:57:22.826Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/27/5d/f7dc746eef83336a6b34197311fe0c1da0d1192f637c726c6a5cf0d83502/transformers-4.55.0.tar.gz", hash = "sha256:15aa138a05d07a15b30d191ea2c45e23061ebf9fcc928a1318e03fe2234f3ae1", size = 9569089, upload-time = "2025-08-05T16:13:48.997Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cf/18/eb7578f84ef5a080d4e5ca9bc4f7c68e7aa9c1e464f1b3d3001e4c642fce/transformers-4.54.1-py3-none-any.whl", hash = "sha256:c89965a4f62a0d07009d45927a9c6372848a02ab9ead9c318c3d082708bab529", size = 11176397, upload-time = "2025-07-29T15:57:19.692Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/93/bcb22fb52ed65084c0199270832aa4cdd4b41296d896f3e7ade188bccb68/transformers-4.55.0-py3-none-any.whl", hash = "sha256:29d9b8800e32a4a831bb16efb5f762f6a9742fef9fce5d693ed018d19b106490", size = 11267905, upload-time = "2025-08-05T16:13:34.814Z" },
 ]

 [[package]]