EuroEval 16.2.1.tar.gz → 16.2.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (275)
  1. {euroeval-16.2.1 → euroeval-16.2.2}/CHANGELOG.md +7 -0
  2. {euroeval-16.2.1 → euroeval-16.2.2}/PKG-INFO +1 -1
  3. {euroeval-16.2.1 → euroeval-16.2.2}/pyproject.toml +1 -1
  4. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/vllm.py +10 -7
  5. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmarker.py +181 -344
  6. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/utils.py +9 -4
  7. {euroeval-16.2.1 → euroeval-16.2.2}/tests/conftest.py +9 -0
  8. euroeval-16.2.2/tests/test_cli.py +69 -0
  9. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_data_models.py +47 -9
  10. {euroeval-16.2.1 → euroeval-16.2.2}/uv.lock +1 -1
  11. euroeval-16.2.1/tests/test_cli.py +0 -81
  12. {euroeval-16.2.1 → euroeval-16.2.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  13. {euroeval-16.2.1 → euroeval-16.2.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  14. {euroeval-16.2.1 → euroeval-16.2.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  15. {euroeval-16.2.1 → euroeval-16.2.2}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
  16. {euroeval-16.2.1 → euroeval-16.2.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  17. {euroeval-16.2.1 → euroeval-16.2.2}/.github/workflows/ci.yaml +0 -0
  18. {euroeval-16.2.1 → euroeval-16.2.2}/.gitignore +0 -0
  19. {euroeval-16.2.1 → euroeval-16.2.2}/.pre-commit-config.yaml +0 -0
  20. {euroeval-16.2.1 → euroeval-16.2.2}/CITATION.cff +0 -0
  21. {euroeval-16.2.1 → euroeval-16.2.2}/CODE_OF_CONDUCT.md +0 -0
  22. {euroeval-16.2.1 → euroeval-16.2.2}/CONTRIBUTING.md +0 -0
  23. {euroeval-16.2.1 → euroeval-16.2.2}/Dockerfile.cuda +0 -0
  24. {euroeval-16.2.1 → euroeval-16.2.2}/LICENSE +0 -0
  25. {euroeval-16.2.1 → euroeval-16.2.2}/NEW_DATASET_GUIDE.md +0 -0
  26. {euroeval-16.2.1 → euroeval-16.2.2}/README.md +0 -0
  27. {euroeval-16.2.1 → euroeval-16.2.2}/docs/CNAME +0 -0
  28. {euroeval-16.2.1 → euroeval-16.2.2}/docs/README.md +0 -0
  29. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/README.md +0 -0
  30. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/danish.md +0 -0
  31. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/dutch.md +0 -0
  32. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/english.md +0 -0
  33. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/estonian.md +0 -0
  34. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/faroese.md +0 -0
  35. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/finnish.md +0 -0
  36. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/french.md +0 -0
  37. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/german.md +0 -0
  38. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/icelandic.md +0 -0
  39. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/italian.md +0 -0
  40. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/latvian.md +0 -0
  41. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/norwegian.md +0 -0
  42. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/polish.md +0 -0
  43. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/portuguese.md +0 -0
  44. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/spanish.md +0 -0
  45. {euroeval-16.2.1 → euroeval-16.2.2}/docs/datasets/swedish.md +0 -0
  46. {euroeval-16.2.1 → euroeval-16.2.2}/docs/extras/radial_plotter.md +0 -0
  47. {euroeval-16.2.1 → euroeval-16.2.2}/docs/faq.md +0 -0
  48. {euroeval-16.2.1 → euroeval-16.2.2}/docs/gfx/favicon.png +0 -0
  49. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  50. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  51. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/english.md +0 -0
  52. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/estonian.md +0 -0
  53. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  54. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/finnish.md +0 -0
  55. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/french.md +0 -0
  56. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/german.md +0 -0
  57. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  58. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  59. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  60. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  61. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  62. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  63. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Multilingual/european.md +0 -0
  64. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Multilingual/finnic.md +0 -0
  65. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  66. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  67. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  68. {euroeval-16.2.1 → euroeval-16.2.2}/docs/leaderboards/README.md +0 -0
  69. {euroeval-16.2.1 → euroeval-16.2.2}/docs/methodology.md +0 -0
  70. {euroeval-16.2.1 → euroeval-16.2.2}/docs/python-package.md +0 -0
  71. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/README.md +0 -0
  72. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/common-sense-reasoning.md +0 -0
  73. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/knowledge.md +0 -0
  74. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/linguistic-acceptability.md +0 -0
  75. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/named-entity-recognition.md +0 -0
  76. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/reading-comprehension.md +0 -0
  77. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/sentiment-classification.md +0 -0
  78. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/speed.md +0 -0
  79. {euroeval-16.2.1 → euroeval-16.2.2}/docs/tasks/summarization.md +0 -0
  80. {euroeval-16.2.1 → euroeval-16.2.2}/gfx/euroeval.png +0 -0
  81. {euroeval-16.2.1 → euroeval-16.2.2}/gfx/euroeval.xcf +0 -0
  82. {euroeval-16.2.1 → euroeval-16.2.2}/gfx/scandeval.png +0 -0
  83. {euroeval-16.2.1 → euroeval-16.2.2}/makefile +0 -0
  84. {euroeval-16.2.1 → euroeval-16.2.2}/mkdocs.yaml +0 -0
  85. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/__init__.py +0 -0
  86. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_config_factory.py +0 -0
  87. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  88. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/base.py +0 -0
  89. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  90. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  91. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/litellm.py +0 -0
  92. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/callbacks.py +0 -0
  93. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/cli.py +0 -0
  94. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/constants.py +0 -0
  95. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/data_loading.py +0 -0
  96. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/data_models.py +35 -35
  97. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/__init__.py +0 -0
  98. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/danish.py +0 -0
  99. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/dutch.py +0 -0
  100. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/english.py +0 -0
  101. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/estonian.py +0 -0
  102. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  103. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/finnish.py +0 -0
  104. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/french.py +0 -0
  105. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/german.py +0 -0
  106. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  107. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/italian.py +0 -0
  108. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/latvian.py +0 -0
  109. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  110. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/polish.py +0 -0
  111. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/portuguese.py +0 -0
  112. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  113. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  114. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/enums.py +0 -0
  115. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/exceptions.py +0 -0
  116. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/finetuning.py +0 -0
  117. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/generation.py +0 -0
  118. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/generation_utils.py +0 -0
  119. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/languages.py +0 -0
  120. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/__init__.py +0 -0
  121. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/base.py +0 -0
  122. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/huggingface.py +0 -0
  123. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
  124. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/pipeline.py +0 -0
  125. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/metrics/speed.py +0 -0
  126. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/model_cache.py +0 -0
  127. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/model_config.py +0 -0
  128. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/model_loading.py +0 -0
  129. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  130. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  131. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  132. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  133. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  134. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  135. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  136. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/scores.py +0 -0
  137. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/speed_benchmark.py +0 -0
  138. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  139. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  140. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  141. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  142. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  143. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  144. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/tasks.py +0 -0
  145. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/tokenisation_utils.py +0 -0
  146. {euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/types.py +0 -0
  147. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/constants.py +0 -0
  148. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_allocine.py +0 -0
  149. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_angry_tweets.py +0 -0
  150. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_arc.py +0 -0
  151. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_arc_is.py +0 -0
  152. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_belebele.py +0 -0
  153. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_boolq_pt.py +0 -0
  154. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_cnn_dailymail.py +0 -0
  155. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_conll_en.py +0 -0
  156. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_conll_es.py +0 -0
  157. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_conll_nl.py +0 -0
  158. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_copa_lv.py +0 -0
  159. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_dane.py +0 -0
  160. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  161. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_dansk.py +0 -0
  162. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_danske_talemaader.py +0 -0
  163. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  164. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_dbrd.py +0 -0
  165. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_dutch_cola.py +0 -0
  166. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_eltec.py +0 -0
  167. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_err_news.py +0 -0
  168. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_estner.py +0 -0
  169. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_estonian_valence.py +0 -0
  170. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_european_values.py +0 -0
  171. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_exam_et.py +0 -0
  172. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_fone.py +0 -0
  173. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_foqa.py +0 -0
  174. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_fosent.py +0 -0
  175. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_fquad.py +0 -0
  176. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_fullstack_ner.py +0 -0
  177. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_germanquad.py +0 -0
  178. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_germeval.py +0 -0
  179. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_goldenswag.py +0 -0
  180. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_grammar_et.py +0 -0
  181. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_harem.py +0 -0
  182. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_hellaswag.py +0 -0
  183. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_hellaswag_fi.py +0 -0
  184. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  185. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_ice_linguistic.py +0 -0
  186. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  187. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  188. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_icelandic_qa.py +0 -0
  189. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_icesum.py +0 -0
  190. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_idioms_no.py +0 -0
  191. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_ilpost_sum.py +0 -0
  192. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_jentoft.py +0 -0
  193. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_kpwr_ner.py +0 -0
  194. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_latvian_lsm_summary.py +0 -0
  195. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  196. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_life_in_the_uk.py +0 -0
  197. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_llmzszl.py +0 -0
  198. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mim_gold_ner.py +0 -0
  199. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mlqa_es.py +0 -0
  200. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mlsum_de.py +0 -0
  201. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mlsum_es.py +0 -0
  202. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mmlu.py +0 -0
  203. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_mmlu_lv.py +0 -0
  204. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_multi_wiki_qa.py +0 -0
  205. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_multinerd-it.py +0 -0
  206. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_no_cola.py +0 -0
  207. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_no_sammendrag.py +0 -0
  208. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  209. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_nordjylland_news.py +0 -0
  210. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_norec.py +0 -0
  211. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_norglm_multiqa.py +0 -0
  212. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_norglm_multisum.py +0 -0
  213. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_norne.py +0 -0
  214. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_norquad.py +0 -0
  215. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_nqii.py +0 -0
  216. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  217. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_orange_sum.py +0 -0
  218. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_personal_sum.py +0 -0
  219. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_polemo2.py +0 -0
  220. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_poquad.py +0 -0
  221. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_psc.py +0 -0
  222. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_publico.py +0 -0
  223. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_rrn.py +0 -0
  224. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_sb10k.py +0 -0
  225. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_scala.py +0 -0
  226. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_scandiqa.py +0 -0
  227. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_scandisent_fi.py +0 -0
  228. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_schibsted.py +0 -0
  229. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  230. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_sentipolc16.py +0 -0
  231. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_squad.py +0 -0
  232. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_squad_it.py +0 -0
  233. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_squad_nl.py +0 -0
  234. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_squad_nl_old.py +0 -0
  235. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_sst2_pt.py +0 -0
  236. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_sst5.py +0 -0
  237. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_suc3.py +0 -0
  238. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_swedish_skolprov.py +0 -0
  239. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_swedn.py +0 -0
  240. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_swerec.py +0 -0
  241. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_trivia_et.py +0 -0
  242. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_turku_ner_fi.py +0 -0
  243. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_tydiqa_fi.py +0 -0
  244. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  245. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_wikiann_lv.py +0 -0
  246. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_wikineural-it.py +0 -0
  247. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_winogrande.py +0 -0
  248. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_winogrande_et.py +0 -0
  249. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_winogrande_is.py +0 -0
  250. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_xlsum_fi.py +0 -0
  251. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/create_xquad.py +0 -0
  252. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/fix_dot_env_file.py +0 -0
  253. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/load_ud_pos.py +0 -0
  254. {euroeval-16.2.1 → euroeval-16.2.2}/src/scripts/versioning.py +0 -0
  255. {euroeval-16.2.1 → euroeval-16.2.2}/tests/__init__.py +0 -0
  256. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_benchmark_config_factory.py +0 -0
  257. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_benchmark_modules/__init__.py +0 -0
  258. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  259. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_benchmarker.py +0 -0
  260. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_callbacks.py +0 -0
  261. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_constants.py +0 -0
  262. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_data_loading.py +0 -0
  263. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_dataset_configs.py +0 -0
  264. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_enums.py +0 -0
  265. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_exceptions.py +0 -0
  266. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_finetuning.py +0 -0
  267. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_languages.py +0 -0
  268. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_model_config.py +0 -0
  269. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_model_loading.py +0 -0
  270. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_scores.py +0 -0
  271. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_speed_benchmark.py +0 -0
  272. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_tasks.py +0 -0
  273. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_tokenisation_utils.py +0 -0
  274. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_types.py +0 -0
  275. {euroeval-16.2.1 → euroeval-16.2.2}/tests/test_utils.py +0 -0

{euroeval-16.2.1 → euroeval-16.2.2}/CHANGELOG.md
@@ -10,6 +10,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+## [v16.2.2] - 2025-09-15
+### Fixed
+- Added missing benchmark arguments to the `Benchmarker.benchmark` method.
+- Fixed another issue related to the `download_only` mode, causing model evaluations to
+  fail, as it could not find the model locally. This has been fixed now.
+
+
 ## [v16.2.1] - 2025-09-15
 ### Fixed
 - Some of the `download_only` arguments were missing in the code, and have now been
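
The two fixes above land together in air-gapped workflows: models are fetched in a first pass with internet access and evaluated in a second pass without it. A minimal sketch of that workflow, assuming the `Benchmarker` API exported at the package root (the model ID is a hypothetical placeholder):

    from euroeval import Benchmarker

    # Pass 1 (online): download the model and datasets only, skipping evaluation.
    Benchmarker(download_only=True).benchmark(model="org/some-model")  # hypothetical ID

    # Pass 2 (offline): the evaluation now resolves the cached weights locally
    # instead of failing to find the model, which was the bug fixed in v16.2.2.
    Benchmarker().benchmark(model="org/some-model")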

{euroeval-16.2.1 → euroeval-16.2.2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.2.1
+Version: 16.2.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues

{euroeval-16.2.1 → euroeval-16.2.2}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "16.2.1"
+version = "16.2.2"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [

{euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmark_modules/vllm.py
@@ -836,15 +836,18 @@ def load_model_and_tokeniser(
 
     clear_vllm()
 
-    # if we do not have an internet connection we need to give the path to the folder
-    # that contains the model weights and config files, otherwise vLLM will try to
-    # download them regardless if they are already present in the download_dir
-    model_path = resolve_model_path(download_dir)
-
     try:
         model = LLM(
-            model=model_id if internet_connection_available() else model_path,
-            tokenizer=model_id if internet_connection_available() else model_path,
+            model=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
+            tokenizer=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
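
The change above is behavioural, not just stylistic: 16.2.1 called `resolve_model_path` eagerly, before constructing the `LLM`, so the local lookup could fail even when an internet connection was available and nothing had been downloaded yet. In 16.2.2 the call sits in the `else` branch of a conditional expression, which Python only evaluates when the condition is false. A self-contained illustration of that semantics (the function here is a hypothetical stand-in, not the EuroEval one):

    def resolve_model_path(download_dir: str) -> str:
        # Hypothetical stand-in: a local lookup that fails when nothing was downloaded.
        raise FileNotFoundError(f"no model weights found under {download_dir!r}")

    online = True

    # Lazy (new behaviour): the else-branch never runs while online, so no error is
    # raised; offline, the lookup runs exactly when it is needed.
    source = "org/some-model" if online else resolve_model_path("/tmp/models")
    print(source)  # -> org/some-model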

{euroeval-16.2.1 → euroeval-16.2.2}/src/euroeval/benchmarker.py
@@ -6,7 +6,6 @@ import logging
 import re
 import sys
 import typing as t
-from copy import deepcopy
 from pathlib import Path
 from shutil import rmtree
 from time import sleep
@@ -200,10 +199,10 @@ class Benchmarker:
         )
 
         self.benchmark_config_default_params = BenchmarkConfigParams(
-            progress_bar=progress_bar,
-            save_results=save_results,
             task=task,
             dataset=dataset,
+            progress_bar=progress_bar,
+            save_results=save_results,
             language=language,
             model_language=model_language,
             dataset_language=dataset_language,
@@ -212,21 +211,21 @@ class Benchmarker:
             raise_errors=raise_errors,
             cache_dir=cache_dir,
             api_key=api_key,
-            force=force,
-            verbose=verbose,
+            api_base=api_base,
+            api_version=api_version,
             trust_remote_code=trust_remote_code,
             clear_model_cache=clear_model_cache,
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-            api_base=api_base,
-            api_version=api_version,
+            requires_safetensors=requires_safetensors,
+            download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
             generative_type=generative_type,
-            download_only=download_only,
+            verbose=verbose,
+            force=force,
             debug=debug,
             run_with_cli=run_with_cli,
-            requires_safetensors=requires_safetensors,
         )
 
         self.benchmark_config = build_benchmark_config(
@@ -332,8 +331,8 @@ class Benchmarker:
         raise_errors: bool | None = None,
         cache_dir: str | None = None,
         api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
+        api_base: str | None = None,
+        api_version: str | None = None,
         trust_remote_code: bool | None = None,
         clear_model_cache: bool | None = None,
         evaluate_test_split: bool | None = None,
@@ -341,6 +340,11 @@ class Benchmarker:
         num_iterations: int | None = None,
         requires_safetensors: bool | None = None,
         download_only: bool | None = None,
+        gpu_memory_utilization: float | None = None,
+        generative_type: GenerativeType | None = None,
+        force: bool | None = None,
+        verbose: bool | None = None,
+        debug: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -393,13 +397,13 @@ class Benchmarker:
             api_key:
                 The API key to use for a given inference server. Defaults to the value
                 specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
+            api_base:
+                The base URL for a given inference API. Only relevant if `model` refers
+                to a model on an inference API. Defaults to the value specified when
+                initialising the benchmarker.
+            api_version:
+                The version of the API to use. Defaults to the value specified when
+                initialising the benchmarker.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to the value
                 specified when initialising the benchmarker.
@@ -424,6 +428,27 @@ class Benchmarker:
             download_only:
                 Whether to only download the models without evaluating them. Defaults
                 to the value specified when initialising the benchmarker.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to the value specified when
+                initialising the benchmarker.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to the value specified when initialising
+                the benchmarker.
+            force:
+                Whether to force evaluations of models, even if they have been
+                benchmarked already. Defaults to the value specified when initialising
+                the benchmarker.
+            verbose:
+                Whether to output additional output. Defaults to the value specified
+                when initialising the benchmarker.
+            debug:
+                Whether to output debug information. Defaults to the value specified
+                when initialising the benchmarker.
 
         Returns:
             A list of benchmark results.
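
With the restored parameters and their docstrings in place, per-call overrides work again; a sketch of the kind of call the changelog entry refers to (model ID and values are illustrative):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    results = benchmarker.benchmark(
        model="org/some-model",      # illustrative model ID
        gpu_memory_utilization=0.8,  # trade evaluation speed for GPU headroom
        force=True,                  # re-evaluate even if cached results exist
        verbose=True,
    )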
@@ -435,28 +460,141 @@ class Benchmarker:
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
-        benchmark_config = self._get_updated_benchmark_config(
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-            download_only=download_only,
+        # Get a new updated benchmark configuration, based on any changes to the
+        # parameters
+        benchmark_config_params = BenchmarkConfigParams(
+            task=(
+                task if task is not None else self.benchmark_config_default_params.task
+            ),
+            dataset=(
+                dataset
+                if dataset is not None
+                else self.benchmark_config_default_params.dataset
+            ),
+            progress_bar=(
+                progress_bar
+                if progress_bar is not None
+                else self.benchmark_config_default_params.progress_bar
+            ),
+            save_results=(
+                save_results
+                if save_results is not None
+                else self.benchmark_config_default_params.save_results
+            ),
+            language=(
+                language
+                if language is not None
+                else self.benchmark_config_default_params.language
+            ),
+            model_language=(
+                model_language
+                if model_language is not None
+                else self.benchmark_config_default_params.model_language
+            ),
+            dataset_language=(
+                dataset_language
+                if dataset_language is not None
+                else self.benchmark_config_default_params.dataset_language
+            ),
+            device=(
+                device
+                if device is not None
+                else self.benchmark_config_default_params.device
+            ),
+            batch_size=(
+                batch_size
+                if batch_size is not None
+                else self.benchmark_config_default_params.batch_size
+            ),
+            raise_errors=(
+                raise_errors
+                if raise_errors is not None
+                else self.benchmark_config_default_params.raise_errors
+            ),
+            cache_dir=(
+                cache_dir
+                if cache_dir is not None
+                else self.benchmark_config_default_params.cache_dir
+            ),
+            api_key=(
+                api_key
+                if api_key is not None
+                else self.benchmark_config_default_params.api_key
+            ),
+            api_base=(
+                api_base
+                if api_base is not None
+                else self.benchmark_config_default_params.api_base
+            ),
+            api_version=(
+                api_version
+                if api_version is not None
+                else self.benchmark_config_default_params.api_version
+            ),
+            trust_remote_code=(
+                trust_remote_code
+                if trust_remote_code is not None
+                else self.benchmark_config_default_params.trust_remote_code
+            ),
+            clear_model_cache=(
+                clear_model_cache
+                if clear_model_cache is not None
+                else self.benchmark_config_default_params.clear_model_cache
+            ),
+            evaluate_test_split=(
+                evaluate_test_split
+                if evaluate_test_split is not None
+                else self.benchmark_config_default_params.evaluate_test_split
+            ),
+            few_shot=(
+                few_shot
+                if few_shot is not None
+                else self.benchmark_config_default_params.few_shot
+            ),
+            num_iterations=(
+                num_iterations
+                if num_iterations is not None
+                else self.benchmark_config_default_params.num_iterations
+            ),
+            requires_safetensors=(
+                requires_safetensors
+                if requires_safetensors is not None
+                else self.benchmark_config_default_params.requires_safetensors
+            ),
+            download_only=(
+                download_only
+                if download_only is not None
+                else self.benchmark_config_default_params.download_only
+            ),
+            gpu_memory_utilization=(
+                gpu_memory_utilization
+                if gpu_memory_utilization is not None
+                else self.benchmark_config_default_params.gpu_memory_utilization
+            ),
+            generative_type=(
+                generative_type
+                if generative_type is not None
+                else self.benchmark_config_default_params.generative_type
+            ),
+            force=(
+                force
+                if force is not None
+                else self.benchmark_config_default_params.force
+            ),
+            verbose=(
+                verbose
+                if verbose is not None
+                else self.benchmark_config_default_params.verbose
+            ),
+            debug=(
+                debug
+                if debug is not None
+                else self.benchmark_config_default_params.debug
+            ),
+            run_with_cli=self.benchmark_config_default_params.run_with_cli,
+        )
+        benchmark_config = build_benchmark_config(
+            benchmark_config_params=benchmark_config_params
         )
 
         adjust_logging_level(verbose=benchmark_config.verbose)
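
The rewritten block spells out, field by field, the rule that a `None` argument keeps the constructor-time default, which is why the `deepcopy` import could be dropped earlier in this diff: nothing is copied and mutated any more. The same merge can be written generically; a sketch using a hypothetical miniature dataclass rather than the actual `BenchmarkConfigParams`:

    from dataclasses import dataclass, replace

    @dataclass
    class Params:  # hypothetical miniature of BenchmarkConfigParams
        batch_size: int = 32
        verbose: bool = False

    def merged(defaults: Params, **overrides: object) -> Params:
        # Keep a field's default whenever the caller passed None for it.
        return replace(defaults, **{k: v for k, v in overrides.items() if v is not None})

    print(merged(Params(), batch_size=8, verbose=None))
    # -> Params(batch_size=8, verbose=False)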
@@ -654,176 +792,6 @@ class Benchmarker:
             destroy_process_group()
         return current_benchmark_results
 
-    def _get_updated_benchmark_config(
-        self,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        task: str | list[str] | None | None = None,
-        dataset: str | list[str] | None | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None | None = None,
-        dataset_language: str | list[str] | None | None = None,
-        device: Device | None | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        api_base: str | None | None = None,
-        api_version: str | None | None = None,
-        debug: bool | None = None,
-        run_with_cli: bool | None = None,
-        requires_safetensors: bool | None = None,
-        download_only: bool | None = None,
-    ) -> "BenchmarkConfig":
-        """Get an updated benchmark configuration.
-
-        Args:
-            progress_bar:
-                Whether progress bars should be shown. If None, then this value will not
-                be updated.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. If None, then this value will not
-                be updated.
-            task:
-                The tasks benchmark the model(s) on. If None, then this value will not
-                be updated.
-            dataset:
-                The datasets to benchmark on. If None, then this value will not be
-                updated.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. If None, then this value will not be updated.
-            model_language:
-                The language codes of the languages to include for models. If None, then
-                this value will not be updated.
-            dataset_language:
-                The language codes of the languages to include for datasets. If None,
-                then this value will not be updated.
-            device:
-                The device to use for benchmarking. If None, then this value will not be
-                updated.
-            batch_size:
-                The batch size to use. If None, then this value will not be updated.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation. If
-                None, then this value will not be updated.
-            cache_dir:
-                Directory to store cached models. If None, then this value will not be
-                updated.
-            api_key:
-                The API key to use for a given inference server. If None, then this
-                value will not be updated.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. If None, then this value will not be updated.
-            verbose:
-                Whether to output additional output. If None, then this value will not
-                be updated.
-            trust_remote_code:
-                Whether to trust remote code when loading models. If None, then this
-                value will not be updated.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. If None,
-                then this value will not be updated.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. If None, then this
-                value will not be updated.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. If None,
-                then this value will not be updated.
-            num_iterations:
-                The number of times each model should be evaluated. If None, then this
-                value will not be updated.
-            api_base:
-                The base URL for a given inference API. If None, then this value will
-                not be updated.
-            api_version:
-                The version of the API to use. If None, then this value will not be
-                updated.
-            debug:
-                Whether to output debug information. If None, then this value will not
-                be updated.
-            run_with_cli:
-                Whether the benchmarker is being run from the command-line interface.
-                If None, then this value will not be updated.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download the models without evaluating them. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download models and datasets without performing any
-                benchmarking. If None, then this value will not be updated.
-
-        Returns:
-            The updated benchmark configuration.
-        """
-        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
-        if progress_bar is not None:
-            benchmark_config_params.progress_bar = progress_bar
-        if save_results is not None:
-            benchmark_config_params.save_results = save_results
-        if task is not None:
-            benchmark_config_params.task = task
-            benchmark_config_params.dataset = None
-        if dataset is not None:
-            benchmark_config_params.dataset = dataset
-            benchmark_config_params.task = None
-        if language is not None:
-            benchmark_config_params.language = language
-        if model_language is not None:
-            benchmark_config_params.model_language = model_language
-        if dataset_language is not None:
-            benchmark_config_params.dataset_language = dataset_language
-        if device is not None:
-            benchmark_config_params.device = device
-        if batch_size is not None:
-            benchmark_config_params.batch_size = batch_size
-        if raise_errors is not None:
-            benchmark_config_params.raise_errors = raise_errors
-        if cache_dir is not None:
-            benchmark_config_params.cache_dir = cache_dir
-        if api_key is not None:
-            benchmark_config_params.api_key = api_key
-        if force is not None:
-            benchmark_config_params.force = force
-        if verbose is not None:
-            benchmark_config_params.verbose = verbose
-        if trust_remote_code is not None:
-            benchmark_config_params.trust_remote_code = trust_remote_code
-        if clear_model_cache is not None:
-            benchmark_config_params.clear_model_cache = clear_model_cache
-        if evaluate_test_split is not None:
-            benchmark_config_params.evaluate_test_split = evaluate_test_split
-        if few_shot is not None:
-            benchmark_config_params.few_shot = few_shot
-        if num_iterations is not None:
-            benchmark_config_params.num_iterations = num_iterations
-        if api_base is not None:
-            benchmark_config_params.api_base = api_base
-        if api_version is not None:
-            benchmark_config_params.api_version = api_version
-        if debug is not None:
-            benchmark_config_params.debug = debug
-        if run_with_cli is not None:
-            benchmark_config_params.run_with_cli = run_with_cli
-        if requires_safetensors is not None:
-            benchmark_config_params.requires_safetensors = requires_safetensors
-        if download_only is not None:
-            benchmark_config_params.download_only = download_only
-
-        return build_benchmark_config(benchmark_config_params=benchmark_config_params)
-
     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
         """Prepare the model ID(s) to be benchmarked.
 
@@ -991,144 +959,13 @@ class Benchmarker:
                 raise e
             return e
 
-    def __call__(
-        self,
-        model: list[str] | str,
-        task: str | list[str] | None = None,
-        dataset: list[str] | str | None = None,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None = None,
-        dataset_language: str | list[str] | None = None,
-        device: Device | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> list[BenchmarkResult]:
-        """Benchmarks models on datasets.
-
-        Args:
-            model:
-                The full Hugging Face Hub path(s) to the pretrained transformer model.
-                The specific model version to use can be added after the suffix '@':
-                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
-                and defaults to the latest version if not specified.
-            task:
-                The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
-                If both `task` and `dataset` are None then all datasets will be
-                benchmarked. Defaults to None.
-            dataset:
-                The datasets to benchmark on. Mutually exclusive with `task`. If both
-                `task` and `dataset` are None then all datasets will be benchmarked.
-                Defaults to None.
-            progress_bar:
-                Whether progress bars should be shown. Defaults to the value specified
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
         logger.warning(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
             "`benchmark` function instead. This will be removed in a future version."
         )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-        )
+        return self.benchmark(*args, **kwds)
 
 
 def model_has_been_benchmarked(
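
Collapsing `__call__` to a `*args, **kwds` delegation means the deprecated call path can no longer drift out of sync with `benchmark` as parameters are added and removed, which is exactly the class of bug this release fixes. Both call styles below are equivalent apart from the deprecation warning (model and dataset names are illustrative):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()

    # Deprecated: logs a warning, then forwards all arguments to .benchmark().
    results = benchmarker("org/some-model", dataset="angry-tweets")

    # Preferred, equivalent call:
    results = benchmarker.benchmark("org/some-model", dataset="angry-tweets")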