EuroEval 15.9.2__tar.gz → 15.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (241) hide show
  1. {euroeval-15.9.2 → euroeval-15.10.1}/.pre-commit-config.yaml +2 -2
  2. {euroeval-15.9.2 → euroeval-15.10.1}/CHANGELOG.md +32 -0
  3. {euroeval-15.9.2 → euroeval-15.10.1}/PKG-INFO +7 -8
  4. {euroeval-15.9.2 → euroeval-15.10.1}/README.md +1 -1
  5. {euroeval-15.9.2 → euroeval-15.10.1}/docs/README.md +1 -1
  6. euroeval-15.10.1/docs/leaderboards/Monolingual/finnish.md +15 -0
  7. {euroeval-15.9.2 → euroeval-15.10.1}/pyproject.toml +7 -7
  8. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/hf.py +3 -3
  9. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/litellm.py +158 -122
  10. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/vllm.py +47 -143
  11. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/data_loading.py +8 -2
  12. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/finetuning.py +22 -0
  13. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +11 -1
  14. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/question_answering.py +14 -4
  15. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/tokenization_utils.py +103 -9
  16. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/utils.py +13 -8
  17. {euroeval-15.9.2 → euroeval-15.10.1}/uv.lock +1754 -1758
  18. {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  19. {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  20. {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  21. {euroeval-15.9.2 → euroeval-15.10.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  22. {euroeval-15.9.2 → euroeval-15.10.1}/.github/workflows/ci.yaml +0 -0
  23. {euroeval-15.9.2 → euroeval-15.10.1}/.gitignore +0 -0
  24. {euroeval-15.9.2 → euroeval-15.10.1}/CITATION.cff +0 -0
  25. {euroeval-15.9.2 → euroeval-15.10.1}/CODE_OF_CONDUCT.md +0 -0
  26. {euroeval-15.9.2 → euroeval-15.10.1}/CONTRIBUTING.md +0 -0
  27. {euroeval-15.9.2 → euroeval-15.10.1}/Dockerfile.cuda +0 -0
  28. {euroeval-15.9.2 → euroeval-15.10.1}/LICENSE +0 -0
  29. {euroeval-15.9.2 → euroeval-15.10.1}/NEW_DATASET_GUIDE.md +0 -0
  30. {euroeval-15.9.2 → euroeval-15.10.1}/docs/CNAME +0 -0
  31. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/README.md +0 -0
  32. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/danish.md +0 -0
  33. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/dutch.md +0 -0
  34. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/english.md +0 -0
  35. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/faroese.md +0 -0
  36. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/finnish.md +0 -0
  37. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/french.md +0 -0
  38. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/german.md +0 -0
  39. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/icelandic.md +0 -0
  40. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/italian.md +0 -0
  41. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/norwegian.md +0 -0
  42. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/spanish.md +0 -0
  43. {euroeval-15.9.2 → euroeval-15.10.1}/docs/datasets/swedish.md +0 -0
  44. {euroeval-15.9.2 → euroeval-15.10.1}/docs/extras/radial_plotter.md +0 -0
  45. {euroeval-15.9.2 → euroeval-15.10.1}/docs/faq.md +0 -0
  46. {euroeval-15.9.2 → euroeval-15.10.1}/docs/gfx/favicon.png +0 -0
  47. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  48. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  49. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/english.md +0 -0
  50. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  51. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/french.md +0 -0
  52. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/german.md +0 -0
  53. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  54. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  55. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  56. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  57. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  58. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/european.md +0 -0
  59. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  60. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  61. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  62. {euroeval-15.9.2 → euroeval-15.10.1}/docs/leaderboards/README.md +0 -0
  63. {euroeval-15.9.2 → euroeval-15.10.1}/docs/methodology.md +0 -0
  64. {euroeval-15.9.2 → euroeval-15.10.1}/docs/python-package.md +0 -0
  65. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/README.md +0 -0
  66. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/common-sense-reasoning.md +0 -0
  67. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/knowledge.md +0 -0
  68. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/linguistic-acceptability.md +0 -0
  69. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/named-entity-recognition.md +0 -0
  70. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/reading-comprehension.md +0 -0
  71. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/sentiment-classification.md +0 -0
  72. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/speed.md +0 -0
  73. {euroeval-15.9.2 → euroeval-15.10.1}/docs/tasks/summarization.md +0 -0
  74. {euroeval-15.9.2 → euroeval-15.10.1}/gfx/euroeval.png +0 -0
  75. {euroeval-15.9.2 → euroeval-15.10.1}/gfx/euroeval.xcf +0 -0
  76. {euroeval-15.9.2 → euroeval-15.10.1}/gfx/scandeval.png +0 -0
  77. {euroeval-15.9.2 → euroeval-15.10.1}/makefile +0 -0
  78. {euroeval-15.9.2 → euroeval-15.10.1}/mkdocs.yaml +0 -0
  79. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/__init__.py +0 -0
  80. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_config_factory.py +0 -0
  81. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  82. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/base.py +0 -0
  83. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
  84. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/benchmarker.py +0 -0
  85. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/callbacks.py +0 -0
  86. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/cli.py +0 -0
  87. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/constants.py +0 -0
  88. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/data_models.py +0 -0
  89. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/__init__.py +0 -0
  90. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/danish.py +0 -0
  91. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/dutch.py +0 -0
  92. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/english.py +0 -0
  93. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/faroese.py +0 -0
  94. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/finnish.py +0 -0
  95. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/french.py +0 -0
  96. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/german.py +0 -0
  97. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
  98. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/italian.py +0 -0
  99. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
  100. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/spanish.py +0 -0
  101. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/dataset_configs/swedish.py +0 -0
  102. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/enums.py +0 -0
  103. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/exceptions.py +0 -0
  104. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/generation.py +0 -0
  105. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/generation_utils.py +0 -0
  106. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/human_evaluation.py +0 -0
  107. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/languages.py +0 -0
  108. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_cache.py +0 -0
  109. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_config.py +0 -0
  110. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/model_loading.py +0 -0
  111. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  112. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  113. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  114. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  115. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  116. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  117. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/prompt_templates/summarization.py +0 -0
  118. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/scores.py +0 -0
  119. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/speed_benchmark.py +0 -0
  120. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  121. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  122. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  123. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
  124. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/tasks.py +0 -0
  125. {euroeval-15.9.2 → euroeval-15.10.1}/src/euroeval/types.py +0 -0
  126. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/constants.py +0 -0
  127. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_allocine.py +0 -0
  128. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_angry_tweets.py +0 -0
  129. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_arc.py +0 -0
  130. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_arc_is.py +0 -0
  131. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_belebele.py +0 -0
  132. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_cnn_dailymail.py +0 -0
  133. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_en.py +0 -0
  134. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_es.py +0 -0
  135. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_conll_nl.py +0 -0
  136. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dane.py +0 -0
  137. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  138. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dansk.py +0 -0
  139. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danske_talemaader.py +0 -0
  140. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  141. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dbrd.py +0 -0
  142. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_dutch_cola.py +0 -0
  143. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_eltec.py +0 -0
  144. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fone.py +0 -0
  145. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_foqa.py +0 -0
  146. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fosent.py +0 -0
  147. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_fquad.py +0 -0
  148. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_germanquad.py +0 -0
  149. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_germeval.py +0 -0
  150. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hellaswag.py +0 -0
  151. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hellaswag_fi.py +0 -0
  152. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  153. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_ice_linguistic.py +0 -0
  154. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  155. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  156. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icelandic_qa.py +0 -0
  157. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_icesum.py +0 -0
  158. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_ilpost_sum.py +0 -0
  159. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_jentoft.py +0 -0
  160. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mim_gold_ner.py +0 -0
  161. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlqa_es.py +0 -0
  162. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlsum_de.py +0 -0
  163. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mlsum_es.py +0 -0
  164. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_mmlu.py +0 -0
  165. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_multinerd-it.py +0 -0
  166. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_no_cola.py +0 -0
  167. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_no_sammendrag.py +0 -0
  168. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  169. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nordjylland_news.py +0 -0
  170. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norec.py +0 -0
  171. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norglm_multiqa.py +0 -0
  172. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norglm_multisum.py +0 -0
  173. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norne.py +0 -0
  174. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_norquad.py +0 -0
  175. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nqii.py +0 -0
  176. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  177. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_orange_sum.py +0 -0
  178. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_personal_sum.py +0 -0
  179. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_rrn.py +0 -0
  180. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sb10k.py +0 -0
  181. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scala.py +0 -0
  182. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scandiqa.py +0 -0
  183. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_scandisent_fi.py +0 -0
  184. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_schibsted.py +0 -0
  185. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  186. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sentipolc16.py +0 -0
  187. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad.py +0 -0
  188. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_it.py +0 -0
  189. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_nl.py +0 -0
  190. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_squad_nl_old.py +0 -0
  191. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_sst5.py +0 -0
  192. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_suc3.py +0 -0
  193. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_swedn.py +0 -0
  194. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_swerec.py +0 -0
  195. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_turku_ner_fi.py +0 -0
  196. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_tydiqa_fi.py +0 -0
  197. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  198. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wikiann_fo.py +0 -0
  199. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_wikineural-it.py +0 -0
  200. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_winogrande_is.py +0 -0
  201. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_xlsum_fi.py +0 -0
  202. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/create_xquad_es.py +0 -0
  203. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/fix_dot_env_file.py +0 -0
  204. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/load_ud_pos.py +0 -0
  205. {euroeval-15.9.2 → euroeval-15.10.1}/src/scripts/versioning.py +0 -0
  206. {euroeval-15.9.2 → euroeval-15.10.1}/tests/__init__.py +0 -0
  207. {euroeval-15.9.2 → euroeval-15.10.1}/tests/conftest.py +0 -0
  208. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_config_factory.py +0 -0
  209. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/__init__.py +0 -0
  210. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_base.py +0 -0
  211. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
  212. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  213. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
  214. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
  215. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_benchmarker.py +0 -0
  216. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_callbacks.py +0 -0
  217. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_cli.py +0 -0
  218. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_constants.py +0 -0
  219. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_data_loading.py +0 -0
  220. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_data_models.py +0 -0
  221. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_dataset_configs.py +0 -0
  222. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_enums.py +0 -0
  223. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_exceptions.py +0 -0
  224. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_finetuning.py +0 -0
  225. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_generation.py +0 -0
  226. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_human_evaluation.py +0 -0
  227. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_languages.py +0 -0
  228. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_cache.py +0 -0
  229. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_config.py +0 -0
  230. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_model_loading.py +0 -0
  231. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_scores.py +0 -0
  232. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_speed_benchmark.py +0 -0
  233. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/__init__.py +0 -0
  234. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_question_answering.py +0 -0
  235. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
  236. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_text_to_text.py +0 -0
  237. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_task_utils/test_token_classification.py +0 -0
  238. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_tasks.py +0 -0
  239. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_tokenization_utils.py +0 -0
  240. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_types.py +0 -0
  241. {euroeval-15.9.2 → euroeval-15.10.1}/tests/test_utils.py +0 -0
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.11.12
13
+ rev: v0.12.0
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -31,7 +31,7 @@ repos:
31
31
  hooks:
32
32
  - id: nbstripout
33
33
  - repo: https://github.com/pre-commit/mirrors-mypy
34
- rev: v1.16.0
34
+ rev: v1.16.1
35
35
  hooks:
36
36
  - id: mypy
37
37
  args:
@@ -10,6 +10,38 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
10
10
 
11
11
 
12
12
 
13
+ ## [v15.10.1] - 2025-06-20
14
+ ### Fixed
15
+ - Fixed an issue when benchmarking encoder models on reading comprehension tasks, where
16
+ we sometimes would truncate the model outputs when they should not have been.
17
+
18
+
19
+ ## [v15.10.0] - 2025-06-17
20
+ ### Changed
21
+ - Updated `vllm` to `>=0.9.1`.
22
+ - Updated `litellm` to `>=1.72.2`.
23
+ - Updated `ollama` to `>=0.5.1`.
24
+ - Better detection of instruction-tuned models.
25
+
26
+ ### Fixed
27
+ - Fixed an issue where the EOS token would be included in the vLLM generation output,
28
+ leading to incorrect evaluation results. We now manually remove all stop tokens from
29
+ the generation output, which fixes this issue.
30
+ - Now correctly detects reasoning models for Ollama models and enables their new "think"
31
+ parameter whenever a reasoning model is detected.
32
+ - Added a cap on the number of concurrent connections when evaluating API models, to
33
+ avoid running into errors related to too many open file descriptors. In case this
34
+ error _still_ occurs, we now give the user an informative error message on how to
35
+ increase the maximum number of open file descriptors on their system.
36
+ - Catch requests.ConnectionError when loading datasets.
37
+ - When benchmarking encoder models on reading comprehension tasks, we allow the model
38
+ outputs to have more than two elements (start and end position logits), where we
39
+ instead just use the first two elements and ignore the rest.
40
+ - When an encoder model outputs additional tensors aside from the logits, we now remove
41
+ these tensors from the output dictionary via the `preprocess_logits_for_metrics`
42
+ argument to `Trainer`.
43
+
44
+
13
45
  ## [v15.9.2] - 2025-06-04
14
46
  ### Fixed
15
47
  - Allow a model to not have any BOS and EOS tokens.
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.9.2
3
+ Version: 15.10.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
7
7
  Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
8
- Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>
8
+ Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2022-2024 Dan Saattrup Nielsen
@@ -37,13 +37,12 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.30.1
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.63.0
40
+ Requires-Dist: litellm>=1.72.2
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
- Requires-Dist: ollama>=0.4.7
43
+ Requires-Dist: ollama>=0.5.1
44
44
  Requires-Dist: pandas>=2.2.0
45
45
  Requires-Dist: peft>=0.15.0
46
- Requires-Dist: protobuf~=3.20.0
47
46
  Requires-Dist: pydantic>=2.6.0
48
47
  Requires-Dist: pyinfer>=0.0.3
49
48
  Requires-Dist: python-dotenv>=1.0.1
@@ -62,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
61
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
62
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
63
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
66
65
  Provides-Extra: generative
67
66
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
67
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
68
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
71
70
  Provides-Extra: human-evaluation
72
71
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
72
  Provides-Extra: test
@@ -93,7 +92,7 @@ ______________________________________________________________________
93
92
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
94
93
 
95
94
 
96
- ## Maintainers
95
+ ## Maintainer
97
96
 
98
97
  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
99
98
  dan.nielsen@alexandra.dk)
@@ -17,7 +17,7 @@ ______________________________________________________________________
17
17
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
18
18
 
19
19
 
20
- ## Maintainers
20
+ ## Maintainer
21
21
 
22
22
  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
23
23
  dan.nielsen@alexandra.dk)
@@ -29,7 +29,7 @@ or [LM Studio](https://lmstudio.ai/).
29
29
  The idea of EuroEval grew out of the development of Danish language model RøBÆRTa in
30
30
  2021, when we realised that there was no standard way to evaluate Danish language
31
31
  models. It started as a hobby project including Danish, Swedish and Norwegian, but has
32
- since grown to include 8+ European languages.
32
+ since grown to include 12+ European languages.
33
33
 
34
34
  EuroEval is maintained by [Dan Saattrup Nielsen](https://www.saattrupdan.com/) from the
35
35
  [Alexandra Institute](https://alexandra.dk), and is funded by the EU project
@@ -0,0 +1,15 @@
1
+ ---
2
+ hide:
3
+ - toc
4
+ ---
5
+ # 🇫🇮 Finnish
6
+
7
+ See the [leaderboard page](/leaderboards) for more information about all the columns.
8
+
9
+ /// tab | Generative Leaderboard
10
+ <iframe title="" aria-label="Table" id="datawrapper-chart-ubHSy" src="https://datawrapper.dwcdn.net/ubHSy" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="847" data-external="1"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}}))}();</script>
11
+ ///
12
+
13
+ /// tab | NLU Leaderboard
14
+ <iframe title="" aria-label="Table" id="datawrapper-chart-qVbA3" src="https://datawrapper.dwcdn.net/qVbA3/1/" scrolling="no" frameborder="0" style="width: 0; min-width: 100% !important; border: none;" height="818" data-external="1"></iframe><script type="text/javascript">!function(){"use strict";window.addEventListener("message",(function(a){if(void 0!==a.data["datawrapper-height"]){var e=document.querySelectorAll("iframe");for(var t in a.data["datawrapper-height"])for(var r,i=0;r=e[i];i++)if(r.contentWindow===a.source){var d=a.data["datawrapper-height"][t]+"px";r.style.height=d}}}))}();</script>
15
+ ///
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "EuroEval"
3
- version = "15.9.2"
3
+ version = "15.10.1"
4
4
  description = "The robust European language model benchmark."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -8,7 +8,6 @@ authors = [
8
8
  ]
9
9
  maintainers = [
10
10
  {name = "Dan Saattrup Nielsen", email = "dan.nielsen@alexandra.dk"},
11
- {name = "Kenneth Enevoldsen", email = "kenneth.enevoldsen@cas.au.dk"},
12
11
  ]
13
12
  requires-python = ">=3.10,<4.0"
14
13
  dependencies = [
@@ -27,18 +26,17 @@ dependencies = [
27
26
  "huggingface-hub>=0.30.1",
28
27
  "pyinfer>=0.0.3",
29
28
  "sentencepiece>=0.1.96",
30
- "protobuf~=3.20.0",
31
29
  "sacremoses>=0.1.1",
32
30
  "more-itertools>=10.5.0",
33
31
  "tenacity>=9.0.0",
34
- "litellm>=1.63.0",
32
+ "litellm>=1.72.2",
35
33
  "rouge-score>=0.1.2",
36
34
  "bert-score>=0.3.13",
37
35
  "levenshtein>=0.24.0",
38
36
  "scikit-learn<1.6.0",
39
37
  "setuptools>=75.8.2",
40
38
  "demjson3>=3.0.6",
41
- "ollama>=0.4.7",
39
+ "ollama>=0.5.1",
42
40
  "peft>=0.15.0",
43
41
  ]
44
42
 
@@ -46,7 +44,7 @@ dependencies = [
46
44
  generative = [
47
45
  "outlines>=0.1.11",
48
46
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
49
- "vllm>=0.9.0; platform_system == 'Linux'",
47
+ "vllm>=0.9.1; platform_system == 'Linux'",
50
48
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
51
49
  ]
52
50
  human_evaluation = [
@@ -55,7 +53,7 @@ human_evaluation = [
55
53
  all = [
56
54
  "outlines>=0.1.11",
57
55
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
58
- "vllm>=0.9.0; platform_system == 'Linux'",
56
+ "vllm>=0.9.1; platform_system == 'Linux'",
59
57
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
60
58
  "gradio>=4.26.0",
61
59
  ]
@@ -150,6 +148,8 @@ ignore = [
150
148
  "ANN101",
151
149
  # Type annotations for "cls" arguments
152
150
  "ANN102",
151
+ # Type annotations for *args
152
+ "ANN002",
153
153
  # Type annotations for **kwargs
154
154
  "ANN003",
155
155
  # Docstrings for **kwargs
@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
378
378
  tokenizer=self._tokenizer,
379
379
  ),
380
380
  batched=True,
381
- batch_size=1,
381
+ batch_size=10,
382
382
  remove_columns=dataset["train"].column_names,
383
383
  load_from_cache_file=False,
384
384
  keep_in_memory=True,
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
389
389
  tokenizer=self._tokenizer,
390
390
  ),
391
391
  batched=True,
392
- batch_size=1,
392
+ batch_size=10,
393
393
  remove_columns=dataset["val"].column_names,
394
394
  load_from_cache_file=False,
395
395
  keep_in_memory=True,
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
400
400
  tokenizer=self._tokenizer,
401
401
  ),
402
402
  batched=True,
403
- batch_size=1,
403
+ batch_size=10,
404
404
  remove_columns=dataset["test"].column_names,
405
405
  load_from_cache_file=False,
406
406
  keep_in_memory=True,