EuroEval 15.5.0.tar.gz → 15.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (232)
  1. {euroeval-15.5.0 → euroeval-15.6.0}/.pre-commit-config.yaml +1 -1
  2. {euroeval-15.5.0 → euroeval-15.6.0}/CHANGELOG.md +35 -0
  3. {euroeval-15.5.0 → euroeval-15.6.0}/PKG-INFO +30 -9
  4. {euroeval-15.5.0 → euroeval-15.6.0}/README.md +24 -3
  5. {euroeval-15.5.0 → euroeval-15.6.0}/makefile +3 -4
  6. {euroeval-15.5.0 → euroeval-15.6.0}/pyproject.toml +16 -5
  7. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/base.py +3 -2
  8. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/fresh.py +8 -6
  9. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/hf.py +33 -31
  10. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/litellm.py +120 -56
  11. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/vllm.py +41 -26
  12. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmarker.py +23 -21
  13. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/callbacks.py +2 -2
  14. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/constants.py +1 -1
  15. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/data_models.py +257 -42
  16. euroeval-15.6.0/src/euroeval/dataset_configs/__init__.py +61 -0
  17. euroeval-15.6.0/src/euroeval/dataset_configs/danish.py +120 -0
  18. euroeval-15.6.0/src/euroeval/dataset_configs/dutch.py +123 -0
  19. euroeval-15.6.0/src/euroeval/dataset_configs/english.py +88 -0
  20. euroeval-15.6.0/src/euroeval/dataset_configs/faroese.py +53 -0
  21. euroeval-15.6.0/src/euroeval/dataset_configs/french.py +83 -0
  22. euroeval-15.6.0/src/euroeval/dataset_configs/german.py +91 -0
  23. euroeval-15.6.0/src/euroeval/dataset_configs/icelandic.py +148 -0
  24. euroeval-15.6.0/src/euroeval/dataset_configs/italian.py +81 -0
  25. euroeval-15.6.0/src/euroeval/dataset_configs/norwegian.py +178 -0
  26. euroeval-15.6.0/src/euroeval/dataset_configs/spanish.py +78 -0
  27. euroeval-15.6.0/src/euroeval/dataset_configs/swedish.py +100 -0
  28. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/exceptions.py +10 -10
  29. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/finetuning.py +6 -10
  30. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/generation.py +1 -0
  31. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/human_evaluation.py +2 -2
  32. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/languages.py +20 -13
  33. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_cache.py +1 -1
  34. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_loading.py +1 -12
  35. euroeval-15.6.0/src/euroeval/prompt_templates/__init__.py +8 -0
  36. euroeval-15.6.0/src/euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  37. euroeval-15.6.0/src/euroeval/prompt_templates/multiple_choice.py +97 -0
  38. euroeval-15.6.0/src/euroeval/prompt_templates/named_entity_recognition.py +257 -0
  39. euroeval-15.6.0/src/euroeval/prompt_templates/reading_comprehension.py +118 -0
  40. euroeval-15.6.0/src/euroeval/prompt_templates/sentiment_classification.py +137 -0
  41. euroeval-15.6.0/src/euroeval/prompt_templates/summarization.py +97 -0
  42. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/speed_benchmark.py +1 -1
  43. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/multiple_choice_classification.py +19 -11
  44. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/question_answering.py +31 -30
  45. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/sequence_classification.py +1 -1
  46. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/text_to_text.py +1 -1
  47. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/token_classification.py +3 -2
  48. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/tasks.py +54 -0
  49. euroeval-15.5.0/src/euroeval/utils.py → euroeval-15.6.0/src/euroeval/tokenization_utils.py +8 -339
  50. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/types.py +3 -1
  51. euroeval-15.6.0/src/euroeval/utils.py +329 -0
  52. {euroeval-15.5.0 → euroeval-15.6.0}/tests/conftest.py +4 -4
  53. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmarker.py +13 -33
  54. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_callbacks.py +2 -1
  55. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_data_loading.py +2 -2
  56. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_finetuning.py +2 -1
  57. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_loading.py +4 -4
  58. euroeval-15.5.0/tests/test_utils.py → euroeval-15.6.0/tests/test_tokenization_utils.py +3 -68
  59. euroeval-15.6.0/tests/test_utils.py +67 -0
  60. {euroeval-15.5.0 → euroeval-15.6.0}/uv.lock +302 -275
  61. euroeval-15.5.0/src/euroeval/dataset_configs.py +0 -2408
  62. {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  63. {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  64. {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  65. {euroeval-15.5.0 → euroeval-15.6.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  66. {euroeval-15.5.0 → euroeval-15.6.0}/.github/workflows/ci.yaml +0 -0
  67. {euroeval-15.5.0 → euroeval-15.6.0}/.gitignore +0 -0
  68. {euroeval-15.5.0 → euroeval-15.6.0}/CITATION.cff +0 -0
  69. {euroeval-15.5.0 → euroeval-15.6.0}/CODE_OF_CONDUCT.md +0 -0
  70. {euroeval-15.5.0 → euroeval-15.6.0}/CONTRIBUTING.md +0 -0
  71. {euroeval-15.5.0 → euroeval-15.6.0}/Dockerfile.cuda +0 -0
  72. {euroeval-15.5.0 → euroeval-15.6.0}/LICENSE +0 -0
  73. {euroeval-15.5.0 → euroeval-15.6.0}/docs/CNAME +0 -0
  74. {euroeval-15.5.0 → euroeval-15.6.0}/docs/README.md +0 -0
  75. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/README.md +0 -0
  76. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/danish.md +0 -0
  77. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/dutch.md +0 -0
  78. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/english.md +0 -0
  79. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/faroese.md +0 -0
  80. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/french.md +0 -0
  81. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/german.md +0 -0
  82. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/icelandic.md +0 -0
  83. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/italian.md +0 -0
  84. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/norwegian.md +0 -0
  85. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/spanish.md +0 -0
  86. {euroeval-15.5.0 → euroeval-15.6.0}/docs/datasets/swedish.md +0 -0
  87. {euroeval-15.5.0 → euroeval-15.6.0}/docs/extras/radial_plotter.md +0 -0
  88. {euroeval-15.5.0 → euroeval-15.6.0}/docs/faq.md +0 -0
  89. {euroeval-15.5.0 → euroeval-15.6.0}/docs/gfx/favicon.png +0 -0
  90. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  91. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  92. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/english.md +0 -0
  93. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  94. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/french.md +0 -0
  95. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/german.md +0 -0
  96. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  97. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  98. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  99. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  100. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/european.md +0 -0
  101. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  102. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  103. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  104. {euroeval-15.5.0 → euroeval-15.6.0}/docs/leaderboards/README.md +0 -0
  105. {euroeval-15.5.0 → euroeval-15.6.0}/docs/methodology.md +0 -0
  106. {euroeval-15.5.0 → euroeval-15.6.0}/docs/python-package.md +0 -0
  107. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/README.md +0 -0
  108. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/common-sense-reasoning.md +0 -0
  109. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/knowledge.md +0 -0
  110. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/linguistic-acceptability.md +0 -0
  111. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/named-entity-recognition.md +0 -0
  112. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/reading-comprehension.md +0 -0
  113. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/sentiment-classification.md +0 -0
  114. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/speed.md +0 -0
  115. {euroeval-15.5.0 → euroeval-15.6.0}/docs/tasks/summarization.md +0 -0
  116. {euroeval-15.5.0 → euroeval-15.6.0}/gfx/euroeval.png +0 -0
  117. {euroeval-15.5.0 → euroeval-15.6.0}/gfx/euroeval.xcf +0 -0
  118. {euroeval-15.5.0 → euroeval-15.6.0}/gfx/scandeval.png +0 -0
  119. {euroeval-15.5.0 → euroeval-15.6.0}/mkdocs.yaml +0 -0
  120. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/__init__.py +0 -0
  121. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_config_factory.py +0 -0
  122. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  123. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/cli.py +0 -0
  124. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/data_loading.py +0 -0
  125. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/enums.py +0 -0
  126. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/model_config.py +0 -0
  127. {euroeval-15.5.0 → euroeval-15.6.0}/src/euroeval/scores.py +0 -0
  128. {euroeval-15.5.0/src/euroeval/task_utils → euroeval-15.6.0/src/euroeval/task_group_utils}/__init__.py +0 -0
  129. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/constants.py +0 -0
  130. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_allocine.py +0 -0
  131. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_angry_tweets.py +0 -0
  132. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_arc.py +0 -0
  133. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_arc_is.py +0 -0
  134. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_belebele.py +0 -0
  135. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_cnn_dailymail.py +0 -0
  136. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_en.py +0 -0
  137. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_es.py +0 -0
  138. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_conll_nl.py +0 -0
  139. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dane.py +0 -0
  140. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  141. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dansk.py +0 -0
  142. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danske_talemaader.py +0 -0
  143. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  144. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dbrd.py +0 -0
  145. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dutch_cola.py +0 -0
  146. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_dutch_social.py +0 -0
  147. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_eltec.py +0 -0
  148. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fone.py +0 -0
  149. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_foqa.py +0 -0
  150. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fosent.py +0 -0
  151. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_fquad.py +0 -0
  152. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_germanquad.py +0 -0
  153. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_germeval.py +0 -0
  154. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_hellaswag.py +0 -0
  155. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  156. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_ice_linguistic.py +0 -0
  157. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  158. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  159. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icelandic_qa.py +0 -0
  160. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_icesum.py +0 -0
  161. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_ilpost_sum.py +0 -0
  162. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_jentoft.py +0 -0
  163. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mim_gold_ner.py +0 -0
  164. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlqa_es.py +0 -0
  165. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlsum_de.py +0 -0
  166. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mlsum_es.py +0 -0
  167. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_mmlu.py +0 -0
  168. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_multinerd-it.py +0 -0
  169. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_no_cola.py +0 -0
  170. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_no_sammendrag.py +0 -0
  171. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  172. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nordjylland_news.py +0 -0
  173. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norec.py +0 -0
  174. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norglm_multiqa.py +0 -0
  175. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norglm_multisum.py +0 -0
  176. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norne.py +0 -0
  177. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_norquad.py +0 -0
  178. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nqii.py +0 -0
  179. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  180. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_orange_sum.py +0 -0
  181. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_personal_sum.py +0 -0
  182. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_rrn.py +0 -0
  183. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sb10k.py +0 -0
  184. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_scala.py +0 -0
  185. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_scandiqa.py +0 -0
  186. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_schibsted.py +0 -0
  187. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  188. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sentipolc16.py +0 -0
  189. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad.py +0 -0
  190. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_it.py +0 -0
  191. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_nl.py +0 -0
  192. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_squad_nl_old.py +0 -0
  193. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_sst5.py +0 -0
  194. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_suc3.py +0 -0
  195. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_swedn.py +0 -0
  196. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_swerec.py +0 -0
  197. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  198. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wikiann_fo.py +0 -0
  199. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_wikineural-it.py +0 -0
  200. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_winogrande_is.py +0 -0
  201. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.5.0 → euroeval-15.6.0}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.5.0 → euroeval-15.6.0}/tests/__init__.py +0 -0
  206. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_config_factory.py +0 -0
  207. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/__init__.py +0 -0
  208. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_base.py +0 -0
  209. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_fresh.py +0 -0
  210. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  211. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_litellm.py +0 -0
  212. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_benchmark_modules/test_vllm.py +0 -0
  213. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_cli.py +0 -0
  214. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_constants.py +0 -0
  215. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_data_models.py +0 -0
  216. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_dataset_configs.py +0 -0
  217. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_enums.py +0 -0
  218. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_exceptions.py +0 -0
  219. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_generation.py +0 -0
  220. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_human_evaluation.py +0 -0
  221. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_languages.py +0 -0
  222. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_cache.py +0 -0
  223. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_model_config.py +0 -0
  224. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_scores.py +0 -0
  225. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_speed_benchmark.py +0 -0
  226. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/__init__.py +0 -0
  227. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_question_answering.py +0 -0
  228. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_sequence_classification.py +0 -0
  229. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_text_to_text.py +0 -0
  230. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_task_utils/test_token_classification.py +0 -0
  231. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_tasks.py +0 -0
  232. {euroeval-15.5.0 → euroeval-15.6.0}/tests/test_types.py +0 -0
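The file list above also documents three structural refactors in 15.6.0: the monolithic `dataset_configs.py` is split into per-language modules under `dataset_configs/`, `task_utils` is renamed to `task_group_utils`, and most of `utils.py` moves into a new `tokenization_utils.py`. A minimal sketch of the resulting import paths, based only on the moves and imports visible in this diff (module contents beyond that are assumptions):

```python
# Import paths implied by the file moves above (euroeval 15.6.0 layout).
from euroeval.task_group_utils import question_answering  # formerly euroeval.task_utils
from euroeval.tokenization_utils import get_bos_token, get_eos_token  # moved out of utils.py
from euroeval.dataset_configs import danish  # dataset configs now split per language
```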
@@ -10,7 +10,7 @@ repos:
       - id: trailing-whitespace
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.4
+    rev: v0.11.5
     hooks:
       - id: ruff
         args:
@@ -10,6 +10,41 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+## [v15.6.0] - 2025-04-13
+### Added
+- We now support specifying custom inference providers when benchmarking via the Hugging
+  Face inference APIs. This can be done by specifying the model as
+  `huggingface/<inference-provider>/<organisation>/<model>`, as described in [these
+  LiteLLM docs](https://docs.litellm.ai/docs/providers/huggingface).
+
+### Changed
+- Updated `transformers` to `>=4.51.0`, which adds support for Llama-4, Phi-4,
+  Deepseek-v3 and Qwen3. It also includes the `image-text-to-text` pipeline tag
+  properly, so we no longer need a custom fix for it.
+- Updated `vllm` to `>=0.8.3`, which adds support for Llama-4.
+- Set the maximum number of logprobs for generative models to 8, as that is the upper
+  bound for xAI models.
+- When benchmarking Ollama models, if the model is not found, we now also check whether
+  the model exists when prefixed with 'hf.co/'.
+- Unified the prompt templates used for each task, so that they are more consistent
+  across tasks. Evaluation tests across different model types and sizes show no
+  significant performance difference between the new and old templates. This was
+  contributed by [@viggo-gascou](https://github.com/viggo-gascou) ✨
+
+### Fixed
+- Avoid duplicate error messages when a rate limit occurs.
+- ModernBERT models cannot be used on a CPU, which caused an error in our check for
+  maximal context length. In this case we now skip the check and use the reported
+  maximal context length as-is.
+- Fixed an issue with benchmarking multiple generative models in the same evaluation
+  command, caused by vLLM and Ray not releasing GPU memory properly; the memory is now
+  released correctly.
+- We now only log when encoder models are being benchmarked on generative tasks if the
+  `--verbose` flag is set (or `verbose=True` in the `Benchmarker` API).
+- All Spanish NER datasets were mistakenly marked as unofficial. The `conll-es` dataset
+  is now marked as official.
+
+
 ## [v15.5.0] - 2025-04-07
 ### Added
 - Now allows supplying a parameter to API models, which is done by using
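To illustrate the new inference-provider support from the changelog above, here is a minimal sketch using the Python API; the `Benchmarker` call signature follows the package README, while the provider (`together`) and the model name are illustrative placeholders:

```python
from euroeval import Benchmarker

# `verbose=True` mirrors the `--verbose` CLI flag mentioned in the release notes.
benchmarker = Benchmarker(verbose=True)

# Model ids of the form huggingface/<inference-provider>/<organisation>/<model>
# route the evaluation through the given Hugging Face inference provider.
benchmarker(model="huggingface/together/meta-llama/Llama-3.3-70B-Instruct")
```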
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.5.0
+Version: 15.6.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -35,7 +35,7 @@ Requires-Dist: click>=8.1.3
 Requires-Dist: datasets>=2.15.0
 Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
-Requires-Dist: huggingface-hub>=0.24.0
+Requires-Dist: huggingface-hub>=0.30.1
 Requires-Dist: levenshtein>=0.24.0
 Requires-Dist: litellm>=1.63.0
 Requires-Dist: more-itertools>=10.5.0
@@ -56,18 +56,18 @@ Requires-Dist: setuptools>=75.8.2
 Requires-Dist: tenacity>=9.0.0
 Requires-Dist: termcolor>=2.0.0
 Requires-Dist: torch>=2.6.0
-Requires-Dist: transformers>=4.50.0
+Requires-Dist: transformers>=4.51.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -89,7 +89,7 @@ ______________________________________________________________________
 [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
 [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
 
@@ -206,7 +206,9 @@ sentiment-classification`.
 
 
 ### Reproducing the datasets
-All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+All datasets used in this project are generated using the scripts located in the
+[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+with the following command
 
 ```shell
 $ uv run src/scripts/<name-of-script>.py
@@ -218,8 +220,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
-
-## Special Thanks :pray:
+## Contributors :pray:
+
+A huge thank you to all the contributors who have helped make this project a success!
+
+<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+### Special Thanks
+- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
 - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
@@ -13,7 +13,7 @@ ______________________________________________________________________
 [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
 [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
 
@@ -130,7 +130,9 @@ sentiment-classification`.
 
 
 ### Reproducing the datasets
-All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+All datasets used in this project are generated using the scripts located in the
+[src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
+with the following command
 
 ```shell
 $ uv run src/scripts/<name-of-script>.py
@@ -142,8 +144,27 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
+## Contributors :pray:
 
-## Special Thanks :pray:
+A huge thank you to all the contributors who have helped make this project a success!
+
+<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
+<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
+<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
+<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
+<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
+<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
+<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
+<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
+<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
+<a href="https://github.com/pakagronglb"><img src="https://avatars.githubusercontent.com/u/178713124" width=50 alt="Contributor avatar for pakagronglb"/></a>
+<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
+<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
+<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
+
+### Special Thanks
+- Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
+  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
 - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of their
@@ -56,7 +56,6 @@ install-dependencies:
 	@if [ "${NO_FLASH_ATTN}" != "1" ] && [ $$(uname) != "Darwin" ]; then \
 		uv pip install --no-build-isolation flash-attn>=2.7.0.post2; \
 	fi
-	@uv sync -U --only-dev
 
 setup-environment-variables:
 	@uv run python src/scripts/fix_dot_env_file.py
@@ -156,8 +155,8 @@ publish-scandeval:
 	fi
 	@mv src/scandeval src/euroeval
 
-publish-major: bump-major publish ## Publish a major version
+publish-major: install check bump-major publish ## Publish a major version
 
-publish-minor: bump-minor publish ## Publish a minor version
+publish-minor: install check bump-minor publish ## Publish a minor version
 
-publish-patch: bump-patch publish ## Publish a patch version
+publish-patch: install check bump-patch publish ## Publish a patch version
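With this change, a release command such as `make publish-minor` first runs the `install` and `check` targets, so the environment is reinstalled and the `check` target (presumably linting and tests) must pass before the version is bumped and published.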
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "15.5.0"
+version = "15.6.0"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -15,7 +15,7 @@ dependencies = [
     "torch>=2.6.0",
     "pandas>=2.2.0",
     "numpy>=1.23.0,<2.0.0",
-    "transformers>=4.50.0",
+    "transformers>=4.51.0",
     "accelerate>=0.34.2",
     "evaluate>=0.4.1",
     "datasets>=2.15.0",
@@ -24,7 +24,7 @@ dependencies = [
     "termcolor>=2.0.0",
     "seqeval>=1.2.2",
     "python-dotenv>=1.0.1",
-    "huggingface-hub>=0.24.0",
+    "huggingface-hub>=0.30.1",
    "pyinfer>=0.0.3",
     "sentencepiece>=0.1.96",
     "protobuf~=3.20.0",
@@ -46,7 +46,7 @@ dependencies = [
 generative = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.8.0; platform_system == 'Linux'",
+    "vllm>=0.8.3; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 human_evaluation = [
@@ -55,7 +55,7 @@ human_evaluation = [
 all = [
     "outlines>=0.1.11",
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
-    "vllm>=0.8.0; platform_system == 'Linux'",
+    "vllm>=0.8.3; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
     "gradio>=4.26.0",
 ]
@@ -107,6 +107,7 @@ dev-dependencies = [
     "types-setuptools>=75.8.0.20250110",
     "types-ujson>=5.10.0.20240515",
     "types-simplejson>=3.2.0.2025032",
+    "debugpy>=1.8.13",
 ]
 
 [tool.ruff]
@@ -144,6 +145,16 @@ select = [
     # Pyflakes
     "F",
 ]
+ignore = [
+    # Type annotations for "self" arguments
+    "ANN101",
+    # Type annotations for "cls" arguments
+    "ANN102",
+    # Type annotations for **kwargs
+    "ANN003",
+    # Docstrings for **kwargs
+    "D417",
+]
 
 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = [
@@ -10,7 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-from transformers import PreTrainedTokenizer, Trainer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 
 from ..data_models import (
     BenchmarkConfig,
@@ -21,7 +22,7 @@ from ..data_models import (
 )
 from ..enums import BatchingPreference, GenerativeType, TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
@@ -4,19 +4,21 @@ import os
 from functools import cached_property
 from json import JSONDecodeError
 
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.models.electra import (
     ElectraForQuestionAnswering,
     ElectraForSequenceClassification,
     ElectraForTokenClassification,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
+)
+from transformers.models.xlm_roberta import (
     XLMRobertaForQuestionAnswering,
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
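The import rewrites in this and the surrounding hunks replace `transformers`' top-level re-exports with direct submodule paths. Both routes resolve to the same objects, so runtime behaviour is unchanged; a plausible motivation is that the explicit paths are easier for static type checkers to follow. A quick sanity check, assuming `transformers` is installed:

```python
# The top-level name is a lazy re-export of the class defined in the submodule,
# so both import styles yield the identical object.
from transformers import AutoTokenizer as TopLevelAutoTokenizer
from transformers.models.auto.tokenization_auto import AutoTokenizer as SubmoduleAutoTokenizer

assert TopLevelAutoTokenizer is SubmoduleAutoTokenizer
```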
@@ -13,31 +13,29 @@ import torch
 from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
-from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
-from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
-from huggingface_hub.utils import (
+from huggingface_hub.errors import (
     GatedRepoError,
     HFValidationError,
     LocalTokenNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
 )
+from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BatchEncoding,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
-)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -65,18 +63,17 @@ from ..exceptions import (
     NoInternetConnection,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
+from ..tokenization_utils import get_bos_token, get_eos_token
 from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
-    get_bos_token,
     get_class_by_name,
-    get_eos_token,
     internet_connection_available,
     log_once,
 )
@@ -690,7 +687,7 @@ def load_model_and_tokenizer(
     assert model is not None, "The model should not be None."
 
     model.eval()
-    model.to(benchmark_config.device)
+    model.to(benchmark_config.device)  # type: ignore[arg-type]
 
     if (
         isinstance(model, PreTrainedModel)
@@ -797,12 +794,6 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
     tags = list(set(tags))
 
-    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-    # when this PR has been merged in and published:
-    # https://github.com/huggingface/transformers/pull/37107
-    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
-
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
@@ -824,7 +815,7 @@
     generative_class_names = [
         class_name
         for tag in GENERATIVE_PIPELINE_TAGS
-        for class_name in TASK_MAPPING.get(tag, dict()).values()
+        for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
     ]
     if class_names is not None and any(
         class_name in generative_class_names for class_name in class_names
@@ -1083,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
     for attribute in attribute_list:
         token_type_embeddings = getattr(token_type_embeddings, attribute)
 
+    token_type_embedding_tensor = token_type_embeddings.weight.data
+    assert isinstance(token_type_embedding_tensor, torch.Tensor)
+
     # If the token type embeddings has shape (1, ...) then set the shape to
     # (2, ...) by randomly initializing the second token type embedding
-    if token_type_embeddings.weight.data.shape[0] == 1:
+    if token_type_embedding_tensor.shape[0] == 1:
         token_type_embeddings.weight.data = torch.cat(
             (
-                token_type_embeddings.weight.data,
-                torch.rand_like(token_type_embeddings.weight.data),
+                token_type_embedding_tensor,
+                torch.rand_like(token_type_embedding_tensor),
             ),
             dim=0,
         )
-        token_type_embeddings.num_embeddings = 2
+        token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]
 
     # Set the model config to use the new type vocab size
     model.config.type_vocab_size = 2
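To see the tensor surgery in the hunk above in isolation: a model trained with a single token-type (segment) embedding gets a second, randomly initialised row so it can separate question from context in extractive question answering. A self-contained illustration (the 768 dimension is just an example):

```python
import torch
from torch import nn

# An embedding layer with a single token type, as in many single-segment models.
token_type_embeddings = nn.Embedding(num_embeddings=1, embedding_dim=768)

token_type_embedding_tensor = token_type_embeddings.weight.data
if token_type_embedding_tensor.shape[0] == 1:
    # Append a randomly initialised second row, mirroring the diff above.
    token_type_embeddings.weight.data = torch.cat(
        (token_type_embedding_tensor, torch.rand_like(token_type_embedding_tensor)),
        dim=0,
    )
    token_type_embeddings.num_embeddings = 2

print(token_type_embeddings.weight.shape)  # torch.Size([2, 768])
```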
@@ -1160,7 +1154,7 @@ def align_model_and_tokenizer(
     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
     # finding the maximum sequence length of the model
     model_device = model.device
-    model.to(torch.device("cpu"))
+    model.to(torch.device("cpu"))  # type: ignore[arg-type]
 
     # Manually check that this model max length is valid for the model, and adjust
     # otherwise
@@ -1182,8 +1176,16 @@
         except IndexError:
             continue
 
+        except ValueError as e:
+            # This happens when the model is using Triton, such as with ModernBERT,
+            # which doesn't work with CPU tensors at all
+            if "cpu tensor" in str(e):
+                break
+            else:
+                raise e
+
     # Move the model back to the original device
-    model.to(model_device)
+    model.to(model_device)  # type: ignore[arg-type]
 
     # If there is a mismatch between the vocab size according to the tokenizer and
     # the vocab size according to the model, we raise an error
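For context, the loop this final hunk patches probes candidate maximum sequence lengths on the CPU, treating an `IndexError` (position-embedding overflow) as "too long"; the new `ValueError` branch bails out for Triton-backed models such as ModernBERT, which cannot run on CPU tensors at all. A simplified, hypothetical sketch of that probing pattern (function and variable names are illustrative, not EuroEval's actual helpers):

```python
import torch
from torch import nn


def probe_max_length(model: nn.Module, candidates: list[int]) -> int | None:
    """Return the largest candidate length the model accepts on CPU, or None."""
    for length in sorted(candidates, reverse=True):
        dummy = torch.zeros(1, length, dtype=torch.long)
        try:
            model(input_ids=dummy)
            return length
        except IndexError:
            continue  # position embeddings too short for this length
        except ValueError as e:
            if "cpu tensor" in str(e):
                return None  # Triton-backed model: cannot probe on CPU at all
            raise
    return None
```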