EuroEval 15.8.1__tar.gz → 15.8.2__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (240)
  1. {euroeval-15.8.1 → euroeval-15.8.2}/.github/workflows/ci.yaml +4 -0
  2. {euroeval-15.8.1 → euroeval-15.8.2}/.pre-commit-config.yaml +1 -1
  3. {euroeval-15.8.1 → euroeval-15.8.2}/CHANGELOG.md +7 -0
  4. {euroeval-15.8.1 → euroeval-15.8.2}/PKG-INFO +3 -3
  5. {euroeval-15.8.1 → euroeval-15.8.2}/pyproject.toml +3 -3
  6. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/litellm.py +7 -2
  7. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/model_cache.py +9 -0
  8. {euroeval-15.8.1 → euroeval-15.8.2}/uv.lock +3 -3
  9. {euroeval-15.8.1 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  10. {euroeval-15.8.1 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  11. {euroeval-15.8.1 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  12. {euroeval-15.8.1 → euroeval-15.8.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  13. {euroeval-15.8.1 → euroeval-15.8.2}/.gitignore +0 -0
  14. {euroeval-15.8.1 → euroeval-15.8.2}/CITATION.cff +0 -0
  15. {euroeval-15.8.1 → euroeval-15.8.2}/CODE_OF_CONDUCT.md +0 -0
  16. {euroeval-15.8.1 → euroeval-15.8.2}/CONTRIBUTING.md +0 -0
  17. {euroeval-15.8.1 → euroeval-15.8.2}/Dockerfile.cuda +0 -0
  18. {euroeval-15.8.1 → euroeval-15.8.2}/LICENSE +0 -0
  19. {euroeval-15.8.1 → euroeval-15.8.2}/NEW_DATASET_GUIDE.md +0 -0
  20. {euroeval-15.8.1 → euroeval-15.8.2}/README.md +0 -0
  21. {euroeval-15.8.1 → euroeval-15.8.2}/docs/CNAME +0 -0
  22. {euroeval-15.8.1 → euroeval-15.8.2}/docs/README.md +0 -0
  23. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/README.md +0 -0
  24. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/danish.md +0 -0
  25. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/dutch.md +0 -0
  26. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/english.md +0 -0
  27. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/faroese.md +0 -0
  28. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/finnish.md +0 -0
  29. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/french.md +0 -0
  30. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/german.md +0 -0
  31. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/icelandic.md +0 -0
  32. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/italian.md +0 -0
  33. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/norwegian.md +0 -0
  34. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/spanish.md +0 -0
  35. {euroeval-15.8.1 → euroeval-15.8.2}/docs/datasets/swedish.md +0 -0
  36. {euroeval-15.8.1 → euroeval-15.8.2}/docs/extras/radial_plotter.md +0 -0
  37. {euroeval-15.8.1 → euroeval-15.8.2}/docs/faq.md +0 -0
  38. {euroeval-15.8.1 → euroeval-15.8.2}/docs/gfx/favicon.png +0 -0
  39. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  40. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  41. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/english.md +0 -0
  42. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  43. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/french.md +0 -0
  44. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/german.md +0 -0
  45. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  46. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  47. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  48. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  49. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  50. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Multilingual/european.md +0 -0
  51. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  52. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  53. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  54. {euroeval-15.8.1 → euroeval-15.8.2}/docs/leaderboards/README.md +0 -0
  55. {euroeval-15.8.1 → euroeval-15.8.2}/docs/methodology.md +0 -0
  56. {euroeval-15.8.1 → euroeval-15.8.2}/docs/python-package.md +0 -0
  57. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/README.md +0 -0
  58. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/common-sense-reasoning.md +0 -0
  59. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/knowledge.md +0 -0
  60. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/linguistic-acceptability.md +0 -0
  61. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/named-entity-recognition.md +0 -0
  62. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/reading-comprehension.md +0 -0
  63. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/sentiment-classification.md +0 -0
  64. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/speed.md +0 -0
  65. {euroeval-15.8.1 → euroeval-15.8.2}/docs/tasks/summarization.md +0 -0
  66. {euroeval-15.8.1 → euroeval-15.8.2}/gfx/euroeval.png +0 -0
  67. {euroeval-15.8.1 → euroeval-15.8.2}/gfx/euroeval.xcf +0 -0
  68. {euroeval-15.8.1 → euroeval-15.8.2}/gfx/scandeval.png +0 -0
  69. {euroeval-15.8.1 → euroeval-15.8.2}/makefile +0 -0
  70. {euroeval-15.8.1 → euroeval-15.8.2}/mkdocs.yaml +0 -0
  71. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/__init__.py +0 -0
  72. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_config_factory.py +0 -0
  73. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  74. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/base.py +0 -0
  75. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  76. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  77. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmark_modules/vllm.py +0 -0
  78. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/benchmarker.py +0 -0
  79. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/callbacks.py +0 -0
  80. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/cli.py +0 -0
  81. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/constants.py +0 -0
  82. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/data_loading.py +0 -0
  83. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/data_models.py +0 -0
  84. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/__init__.py +0 -0
  85. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/danish.py +0 -0
  86. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/dutch.py +0 -0
  87. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/english.py +0 -0
  88. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  89. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/finnish.py +0 -0
  90. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/french.py +0 -0
  91. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/german.py +0 -0
  92. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  93. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/italian.py +0 -0
  94. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  95. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  96. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  97. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/enums.py +0 -0
  98. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/exceptions.py +0 -0
  99. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/finetuning.py +0 -0
  100. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/generation.py +0 -0
  101. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/generation_utils.py +0 -0
  102. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/human_evaluation.py +0 -0
  103. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/languages.py +0 -0
  104. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/model_config.py +0 -0
  105. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/model_loading.py +0 -0
  106. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  107. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  108. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  109. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  110. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  111. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  112. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  113. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/scores.py +0 -0
  114. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/speed_benchmark.py +0 -0
  115. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  116. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  117. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  118. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  119. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  120. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  121. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/tasks.py +0 -0
  122. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/tokenization_utils.py +0 -0
  123. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/types.py +0 -0
  124. {euroeval-15.8.1 → euroeval-15.8.2}/src/euroeval/utils.py +0 -0
  125. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/constants.py +0 -0
  126. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_allocine.py +0 -0
  127. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_angry_tweets.py +0 -0
  128. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_arc.py +0 -0
  129. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_arc_is.py +0 -0
  130. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_belebele.py +0 -0
  131. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_cnn_dailymail.py +0 -0
  132. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_conll_en.py +0 -0
  133. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_conll_es.py +0 -0
  134. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_conll_nl.py +0 -0
  135. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_dane.py +0 -0
  136. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  137. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_dansk.py +0 -0
  138. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_danske_talemaader.py +0 -0
  139. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  140. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_dbrd.py +0 -0
  141. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_dutch_cola.py +0 -0
  142. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_eltec.py +0 -0
  143. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_fone.py +0 -0
  144. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_foqa.py +0 -0
  145. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_fosent.py +0 -0
  146. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_fquad.py +0 -0
  147. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_germanquad.py +0 -0
  148. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_germeval.py +0 -0
  149. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_hellaswag.py +0 -0
  150. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_hellaswag_fi.py +0 -0
  151. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  152. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_ice_linguistic.py +0 -0
  153. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  154. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  155. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_icelandic_qa.py +0 -0
  156. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_icesum.py +0 -0
  157. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_ilpost_sum.py +0 -0
  158. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_jentoft.py +0 -0
  159. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_mim_gold_ner.py +0 -0
  160. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_mlqa_es.py +0 -0
  161. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_mlsum_de.py +0 -0
  162. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_mlsum_es.py +0 -0
  163. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_mmlu.py +0 -0
  164. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_multinerd-it.py +0 -0
  165. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_no_cola.py +0 -0
  166. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_no_sammendrag.py +0 -0
  167. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  168. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_nordjylland_news.py +0 -0
  169. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_norec.py +0 -0
  170. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_norglm_multiqa.py +0 -0
  171. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_norglm_multisum.py +0 -0
  172. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_norne.py +0 -0
  173. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_norquad.py +0 -0
  174. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_nqii.py +0 -0
  175. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  176. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_orange_sum.py +0 -0
  177. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_personal_sum.py +0 -0
  178. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_rrn.py +0 -0
  179. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_sb10k.py +0 -0
  180. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_scala.py +0 -0
  181. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_scandiqa.py +0 -0
  182. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_scandisent_fi.py +0 -0
  183. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_schibsted.py +0 -0
  184. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  185. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_sentipolc16.py +0 -0
  186. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_squad.py +0 -0
  187. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_squad_it.py +0 -0
  188. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_squad_nl.py +0 -0
  189. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_squad_nl_old.py +0 -0
  190. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_sst5.py +0 -0
  191. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_suc3.py +0 -0
  192. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_swedn.py +0 -0
  193. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_swerec.py +0 -0
  194. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_turku_ner_fi.py +0 -0
  195. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_tydiqa_fi.py +0 -0
  196. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  197. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_wikiann_fo.py +0 -0
  198. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_wikineural-it.py +0 -0
  199. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_winogrande_is.py +0 -0
  200. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_xlsum_fi.py +0 -0
  201. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.8.1 → euroeval-15.8.2}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.8.1 → euroeval-15.8.2}/tests/__init__.py +0 -0
  206. {euroeval-15.8.1 → euroeval-15.8.2}/tests/conftest.py +0 -0
  207. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_benchmarker.py +0 -0
  215. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_callbacks.py +0 -0
  216. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_cli.py +0 -0
  217. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_constants.py +0 -0
  218. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_data_models.py +0 -0
  220. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_enums.py +0 -0
  222. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_generation.py +0 -0
  225. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_languages.py +0 -0
  227. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_model_config.py +0 -0
  229. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_scores.py +0 -0
  231. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_tasks.py +0 -0
  238. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_types.py +0 -0
  240. {euroeval-15.8.1 → euroeval-15.8.2}/tests/test_utils.py +0 -0
@@ -10,6 +10,10 @@ on:
10
10
  branches:
11
11
  - main
12
12
 
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.head_ref }}
15
+ cancel-in-progress: true
16
+
13
17
  jobs:
14
18
  code-check:
15
19
  if: github.event.pull_request.draft == false
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.11.8
13
+ rev: v0.11.9
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -10,6 +10,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
10
10
 
11
11
 
12
12
 
13
+ ## [v15.8.2] - 2025-05-12
14
+ ### Fixed
15
+ - Catch error when caching generative model outputs, when the number of model inputs and
16
+ outputs do not match.
17
+ - Disallow vLLM >=0.8.5, as it breaks generation output for several models.
18
+
19
+
13
20
  ## [v15.8.1] - 2025-05-08
14
21
  ### Fixed
15
22
  - NER labels were included twice in the prompt templates (which was due to there being
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.8.1
3
+ Version: 15.8.2
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
62
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
64
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'all'
65
+ Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
69
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
70
+ Requires-Dist: vllm<0.8.5,>=0.8.3; (platform_system == 'Linux') and extra == 'generative'
71
71
  Provides-Extra: human-evaluation
72
72
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
73
  Provides-Extra: test
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "EuroEval"
3
- version = "15.8.1"
3
+ version = "15.8.2"
4
4
  description = "The robust European language model benchmark."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -46,7 +46,7 @@ dependencies = [
46
46
  generative = [
47
47
  "outlines>=0.1.11",
48
48
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
49
- "vllm>=0.8.3; platform_system == 'Linux'",
49
+ "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
50
50
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
51
51
  ]
52
52
  human_evaluation = [
@@ -55,7 +55,7 @@ human_evaluation = [
55
55
  all = [
56
56
  "outlines>=0.1.11",
57
57
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
58
- "vllm>=0.8.3; platform_system == 'Linux'",
58
+ "vllm>=0.8.3,<0.8.5; platform_system == 'Linux'",
59
59
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
60
60
  "gradio>=4.26.0",
61
61
  ]
@@ -401,6 +401,12 @@ class LiteLLMModel(BenchmarkModule):
401
401
  model_responses=ordered_responses, model_id=self.model_config.model_id
402
402
  )
403
403
 
404
+ if len(messages) != len(model_output.sequences):
405
+ raise InvalidBenchmark(
406
+ f"Number of model inputs ({len(messages):,}) does not match the "
407
+ f"number of model outputs ({len(model_output.sequences):,})."
408
+ )
409
+
404
410
  return model_output
405
411
 
406
412
  def _handle_exception(
@@ -616,8 +622,7 @@ class LiteLLMModel(BenchmarkModule):
616
622
  scores = []
617
623
  for model_response in model_responses:
618
624
  if not model_response.choices:
619
- # This happens for reasoning models, when they don't finish thinking
620
- # and run out of tokens. Happens quite rarely, but we need to handle it.
625
+ sequences.append("")
621
626
  logger.warning(
622
627
  f"The model {model_id!r} did not end up "
623
628
  "generating any text. This is likely because the model ran "
@@ -168,6 +168,15 @@ class ModelCache:
168
168
  input_column = "messages" if "messages" in model_inputs else "text"
169
169
  model_inputs = model_inputs[input_column]
170
170
 
171
+ # Double check that the number of inputs and outputs match
172
+ if not len(model_inputs) == len(model_output.sequences):
173
+ logger.warning(
174
+ f"Number of model inputs ({len(model_inputs)}) does not match the "
175
+ f"number of model outputs ({len(model_output.sequences)}). We will not "
176
+ f"cache the model outputs."
177
+ )
178
+ return
179
+
171
180
  # Store the generated sequences in the cache, one by one
172
181
  with tqdm(
173
182
  iterable=model_inputs,
@@ -906,7 +906,7 @@ wheels = [
906
906
 
907
907
  [[package]]
908
908
  name = "euroeval"
909
- version = "15.8.1"
909
+ version = "15.8.2"
910
910
  source = { editable = "." }
911
911
  dependencies = [
912
912
  { name = "accelerate" },
@@ -1034,8 +1034,8 @@ requires-dist = [
1034
1034
  { name = "termcolor", specifier = ">=2.0.0" },
1035
1035
  { name = "torch", specifier = ">=2.6.0" },
1036
1036
  { name = "transformers", specifier = ">=4.51.0" },
1037
- { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.8.3" },
1038
- { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.8.3" },
1037
+ { name = "vllm", marker = "sys_platform == 'linux' and extra == 'all'", specifier = ">=0.8.3,<0.8.5" },
1038
+ { name = "vllm", marker = "sys_platform == 'linux' and extra == 'generative'", specifier = ">=0.8.3,<0.8.5" },
1039
1039
  ]
1040
1040
  provides-extras = ["generative", "human-evaluation", "all", "test"]
1041
1041
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes