EuroEval 16.1.1.tar.gz → 16.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (274)
  1. {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +1 -0
  2. euroeval-16.2.1/.github/ISSUE_TEMPLATE/language_request.yaml +49 -0
  3. {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +1 -0
  4. {euroeval-16.1.1 → euroeval-16.2.1}/CHANGELOG.md +34 -0
  5. {euroeval-16.1.1 → euroeval-16.2.1}/PKG-INFO +31 -7
  6. {euroeval-16.1.1 → euroeval-16.2.1}/README.md +26 -2
  7. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/icelandic.md +10 -10
  8. {euroeval-16.1.1 → euroeval-16.2.1}/pyproject.toml +10 -7
  9. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/__init__.py +7 -6
  10. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_config_factory.py +41 -125
  11. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/hf.py +31 -16
  12. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/litellm.py +2 -0
  13. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/vllm.py +24 -9
  14. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmarker.py +138 -16
  15. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/cli.py +8 -0
  16. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/data_models.py +5 -0
  17. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/generation.py +3 -1
  18. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/base.py +12 -0
  19. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/huggingface.py +23 -2
  20. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +6 -5
  21. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/named_entity_recognition.py +3 -3
  22. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/sentiment_classification.py +5 -5
  23. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/tasks.py +3 -0
  24. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/tokenisation_utils.py +0 -6
  25. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/types.py +2 -2
  26. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/utils.py +77 -5
  27. {euroeval-16.1.1 → euroeval-16.2.1}/tests/conftest.py +1 -0
  28. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmarker.py +56 -0
  29. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_cli.py +2 -0
  30. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_data_loading.py +9 -0
  31. {euroeval-16.1.1 → euroeval-16.2.1}/uv.lock +668 -522
  32. {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  33. {euroeval-16.1.1 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  34. {euroeval-16.1.1 → euroeval-16.2.1}/.github/workflows/ci.yaml +0 -0
  35. {euroeval-16.1.1 → euroeval-16.2.1}/.gitignore +0 -0
  36. {euroeval-16.1.1 → euroeval-16.2.1}/.pre-commit-config.yaml +0 -0
  37. {euroeval-16.1.1 → euroeval-16.2.1}/CITATION.cff +0 -0
  38. {euroeval-16.1.1 → euroeval-16.2.1}/CODE_OF_CONDUCT.md +0 -0
  39. {euroeval-16.1.1 → euroeval-16.2.1}/CONTRIBUTING.md +0 -0
  40. {euroeval-16.1.1 → euroeval-16.2.1}/Dockerfile.cuda +0 -0
  41. {euroeval-16.1.1 → euroeval-16.2.1}/LICENSE +0 -0
  42. {euroeval-16.1.1 → euroeval-16.2.1}/NEW_DATASET_GUIDE.md +0 -0
  43. {euroeval-16.1.1 → euroeval-16.2.1}/docs/CNAME +0 -0
  44. {euroeval-16.1.1 → euroeval-16.2.1}/docs/README.md +0 -0
  45. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/README.md +0 -0
  46. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/danish.md +0 -0
  47. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/dutch.md +0 -0
  48. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/english.md +0 -0
  49. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/estonian.md +0 -0
  50. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/faroese.md +0 -0
  51. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/finnish.md +0 -0
  52. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/french.md +0 -0
  53. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/german.md +0 -0
  54. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/italian.md +0 -0
  55. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/latvian.md +0 -0
  56. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/norwegian.md +0 -0
  57. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/polish.md +0 -0
  58. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/portuguese.md +0 -0
  59. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/spanish.md +0 -0
  60. {euroeval-16.1.1 → euroeval-16.2.1}/docs/datasets/swedish.md +0 -0
  61. {euroeval-16.1.1 → euroeval-16.2.1}/docs/extras/radial_plotter.md +0 -0
  62. {euroeval-16.1.1 → euroeval-16.2.1}/docs/faq.md +0 -0
  63. {euroeval-16.1.1 → euroeval-16.2.1}/docs/gfx/favicon.png +0 -0
  64. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  65. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  66. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/english.md +0 -0
  67. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/estonian.md +0 -0
  68. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  69. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
  70. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/french.md +0 -0
  71. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/german.md +0 -0
  72. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  73. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  74. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  75. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  76. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  77. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  78. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/european.md +0 -0
  79. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/finnic.md +0 -0
  80. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  81. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  82. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  83. {euroeval-16.1.1 → euroeval-16.2.1}/docs/leaderboards/README.md +0 -0
  84. {euroeval-16.1.1 → euroeval-16.2.1}/docs/methodology.md +0 -0
  85. {euroeval-16.1.1 → euroeval-16.2.1}/docs/python-package.md +0 -0
  86. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/README.md +0 -0
  87. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/common-sense-reasoning.md +0 -0
  88. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/knowledge.md +0 -0
  89. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/linguistic-acceptability.md +0 -0
  90. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/named-entity-recognition.md +0 -0
  91. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/reading-comprehension.md +0 -0
  92. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/sentiment-classification.md +0 -0
  93. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/speed.md +0 -0
  94. {euroeval-16.1.1 → euroeval-16.2.1}/docs/tasks/summarization.md +0 -0
  95. {euroeval-16.1.1 → euroeval-16.2.1}/gfx/euroeval.png +0 -0
  96. {euroeval-16.1.1 → euroeval-16.2.1}/gfx/euroeval.xcf +0 -0
  97. {euroeval-16.1.1 → euroeval-16.2.1}/gfx/scandeval.png +0 -0
  98. {euroeval-16.1.1 → euroeval-16.2.1}/makefile +0 -0
  99. {euroeval-16.1.1 → euroeval-16.2.1}/mkdocs.yaml +0 -0
  100. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  101. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/base.py +0 -0
  102. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
  103. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/callbacks.py +0 -0
  104. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/constants.py +0 -0
  105. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/data_loading.py +0 -0
  106. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/__init__.py +0 -0
  107. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/danish.py +0 -0
  108. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/dutch.py +0 -0
  109. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/english.py +0 -0
  110. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/estonian.py +0 -0
  111. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/faroese.py +0 -0
  112. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/finnish.py +0 -0
  113. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/french.py +0 -0
  114. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/german.py +0 -0
  115. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
  116. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/italian.py +0 -0
  117. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/latvian.py +0 -0
  118. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
  119. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/polish.py +0 -0
  120. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/portuguese.py +0 -0
  121. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/spanish.py +0 -0
  122. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/dataset_configs/swedish.py +0 -0
  123. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/enums.py +0 -0
  124. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/exceptions.py +0 -0
  125. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/finetuning.py +0 -0
  126. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/generation_utils.py +0 -0
  127. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/languages.py +0 -0
  128. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/__init__.py +0 -0
  129. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
  130. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/pipeline.py +0 -0
  131. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/metrics/speed.py +0 -0
  132. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_cache.py +0 -0
  133. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_config.py +0 -0
  134. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/model_loading.py +0 -0
  135. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  136. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  137. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  138. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/prompt_templates/summarization.py +0 -0
  139. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/scores.py +0 -0
  140. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/speed_benchmark.py +0 -0
  141. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  142. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  143. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
  144. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  145. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  146. {euroeval-16.1.1 → euroeval-16.2.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
  147. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/constants.py +0 -0
  148. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_allocine.py +0 -0
  149. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_angry_tweets.py +0 -0
  150. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_arc.py +0 -0
  151. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_arc_is.py +0 -0
  152. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_belebele.py +0 -0
  153. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_boolq_pt.py +0 -0
  154. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_cnn_dailymail.py +0 -0
  155. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_en.py +0 -0
  156. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_es.py +0 -0
  157. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_conll_nl.py +0 -0
  158. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_copa_lv.py +0 -0
  159. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dane.py +0 -0
  160. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  161. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dansk.py +0 -0
  162. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danske_talemaader.py +0 -0
  163. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  164. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dbrd.py +0 -0
  165. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_dutch_cola.py +0 -0
  166. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_eltec.py +0 -0
  167. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_err_news.py +0 -0
  168. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_estner.py +0 -0
  169. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_estonian_valence.py +0 -0
  170. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_european_values.py +0 -0
  171. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_exam_et.py +0 -0
  172. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fone.py +0 -0
  173. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_foqa.py +0 -0
  174. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fosent.py +0 -0
  175. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fquad.py +0 -0
  176. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_fullstack_ner.py +0 -0
  177. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_germanquad.py +0 -0
  178. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_germeval.py +0 -0
  179. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_goldenswag.py +0 -0
  180. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_grammar_et.py +0 -0
  181. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_harem.py +0 -0
  182. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hellaswag.py +0 -0
  183. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hellaswag_fi.py +0 -0
  184. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  185. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_ice_linguistic.py +0 -0
  186. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  187. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  188. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icelandic_qa.py +0 -0
  189. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_icesum.py +0 -0
  190. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_idioms_no.py +0 -0
  191. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_ilpost_sum.py +0 -0
  192. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_jentoft.py +0 -0
  193. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_kpwr_ner.py +0 -0
  194. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
  195. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  196. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_life_in_the_uk.py +0 -0
  197. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_llmzszl.py +0 -0
  198. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mim_gold_ner.py +0 -0
  199. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlqa_es.py +0 -0
  200. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlsum_de.py +0 -0
  201. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mlsum_es.py +0 -0
  202. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mmlu.py +0 -0
  203. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_mmlu_lv.py +0 -0
  204. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_multi_wiki_qa.py +0 -0
  205. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_multinerd-it.py +0 -0
  206. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_no_cola.py +0 -0
  207. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_no_sammendrag.py +0 -0
  208. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  209. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nordjylland_news.py +0 -0
  210. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norec.py +0 -0
  211. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norglm_multiqa.py +0 -0
  212. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norglm_multisum.py +0 -0
  213. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norne.py +0 -0
  214. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_norquad.py +0 -0
  215. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nqii.py +0 -0
  216. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  217. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_orange_sum.py +0 -0
  218. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_personal_sum.py +0 -0
  219. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_polemo2.py +0 -0
  220. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_poquad.py +0 -0
  221. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_psc.py +0 -0
  222. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_publico.py +0 -0
  223. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_rrn.py +0 -0
  224. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sb10k.py +0 -0
  225. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scala.py +0 -0
  226. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scandiqa.py +0 -0
  227. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_scandisent_fi.py +0 -0
  228. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_schibsted.py +0 -0
  229. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  230. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sentipolc16.py +0 -0
  231. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad.py +0 -0
  232. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_it.py +0 -0
  233. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_nl.py +0 -0
  234. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_squad_nl_old.py +0 -0
  235. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sst2_pt.py +0 -0
  236. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_sst5.py +0 -0
  237. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_suc3.py +0 -0
  238. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swedish_skolprov.py +0 -0
  239. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swedn.py +0 -0
  240. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_swerec.py +0 -0
  241. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_trivia_et.py +0 -0
  242. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_turku_ner_fi.py +0 -0
  243. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_tydiqa_fi.py +0 -0
  244. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  245. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wikiann_lv.py +0 -0
  246. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_wikineural-it.py +0 -0
  247. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande.py +0 -0
  248. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande_et.py +0 -0
  249. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_winogrande_is.py +0 -0
  250. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_xlsum_fi.py +0 -0
  251. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/create_xquad.py +0 -0
  252. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/fix_dot_env_file.py +0 -0
  253. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/load_ud_pos.py +0 -0
  254. {euroeval-16.1.1 → euroeval-16.2.1}/src/scripts/versioning.py +0 -0
  255. {euroeval-16.1.1 → euroeval-16.2.1}/tests/__init__.py +0 -0
  256. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_config_factory.py +0 -0
  257. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_modules/__init__.py +0 -0
  258. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  259. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_callbacks.py +0 -0
  260. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_constants.py +0 -0
  261. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_data_models.py +0 -0
  262. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_dataset_configs.py +0 -0
  263. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_enums.py +0 -0
  264. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_exceptions.py +0 -0
  265. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_finetuning.py +0 -0
  266. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_languages.py +0 -0
  267. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_model_config.py +0 -0
  268. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_model_loading.py +0 -0
  269. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_scores.py +0 -0
  270. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_speed_benchmark.py +0 -0
  271. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_tasks.py +0 -0
  272. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_tokenisation_utils.py +0 -0
  273. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_types.py +0 -0
  274. {euroeval-16.1.1 → euroeval-16.2.1}/tests/test_utils.py +0 -0
@@ -34,6 +34,7 @@ body:
  - label: Italian
  - label: Latvian
  - label: Norwegian (Bokmål or Nynorsk)
+ - label: Polish
  - label: Portuguese
  - label: Spanish
  - label: Swedish
@@ -0,0 +1,49 @@
+ name: 🌍 Language Request
+ description: Is there a European language missing in EuroEval?
+ title: "[LANGUAGE REQUEST] <language-name>"
+ labels: "new language"
+ type: task
+
+ body:
+ - type: input
+ attributes:
+ label: Language name and code
+ description: What is the name and ISO 639 code of the language?
+ validations:
+ required: true
+ - type: markdown
+ attributes:
+ value: >
+ Here are some existing evaluation datasets in the language, that could be used:
+ - type: textarea
+ attributes:
+ label: Sentiment classification dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Linguistic acceptability dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Named entity recognition dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Reading comprehension dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Summarisation dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Knowledge dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Common-sense reasoning dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
@@ -23,6 +23,7 @@ body:
  - label: West Germanic languages (Dutch, English, German)
  - label: Finnic languages (Estonian, Finnish)
  - label: Latvian
+ - label: Polish
  validations:
  required: true
  - type: dropdown
@@ -10,6 +10,40 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+ ## [v16.2.1] - 2025-09-15
+ ### Fixed
+ - Some of the `download_only` arguments were missing in the code, and have now been
+ added.
+
+
+ ## [v16.2.0] - 2025-09-15
+ ### Added
+ - Now supports evaluating models in an offline environment. This is done by first
+ downloading all necessary models, datasets, metrics and other artifacts while online,
+ using the new `--download-only` flag (or `download_only=True` in the `Benchmarker`
+ API). Then you can safely disable internet access and run the evaluation as normal,
+ and it will use the cached models, datasets and metrics. This was contributed by
+ @viggo-gascou ✨
+ - Added the `timm` package to the set of `generative` extra dependencies, as it is
+ required to load some multimodal models, such as Gemma-3n.
+
+ ### Changed
+ - Now does not benchmark encoder models on multiple-choice classification tasks, as they
+ get near-random performance and these scores are not used in the leaderboards. We can
+ change this in the future if we find a way to make encoder models work better on these
+ tasks.
+ - For generative vLLM models that can swap between reasoning and non-reasoning modes,
+ we previously defaulted to reasoning. We now default to what the model uses by
+ default, which is non-reasoning for most models.
+
+ ### Fixed
+ - Fixed an issue where old evaluation records could not be loaded, as the format had
+ changed. We are now able to load old records again.
+ - Fixed some grammatical errors in the Icelandic prompts.
+ - Now stores model IDs with parameters (e.g., `o3#low`) correctly in the benchmark
+ results, rather than just the base model ID (e.g., `o3`).
+
+
 ## [v16.1.1] - 2025-09-12
 ### Fixed
 - Fixed an issue from v16.1.0, where reasoning models were not using the tokeniser's
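A minimal sketch of the two-phase offline workflow described in the v16.2.0 entry above, using the argument names from the package README (the model ID is a placeholder):
```
>>> from euroeval import Benchmarker
>>> benchmark = Benchmarker()
>>> # While online: download the model, datasets and metrics without evaluating
>>> benchmark(model="<model-id>", task="sentiment-classification", language="da", download_only=True)
>>> # After disabling internet access: the same call reuses the cached artifacts
>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
```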
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
- Version: 16.1.1
+ Version: 16.2.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: timm>=1.0.19; extra == 'all'
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: timm>=1.0.19; extra == 'generative'
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
 <div align='center'>
@@ -152,13 +152,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
- >>> benchmark(model="<model>")
+ >>> benchmark(model="<model-id>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
+ ### Benchmarking in an Offline Environment
+ If you need to benchmark in an offline environment, you need to download the models,
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
+ script. For example to download the model you want and all of the Danish sentiment
+ classification datasets:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+ ```
+
+ Or from a script:
+ ```
+ >>> benchmark(
+ ... model="<model-id>",
+ ... task="sentiment-classification",
+ ... language="da",
+ ... download_only=True,
+ ... )
+ ```
+
+ Please note: Offline benchmarking of adapter models is not currently supported. An
+ internet connection will be required during evaluation. If offline support is important
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
@@ -80,13 +80,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
- >>> benchmark(model="<model>")
+ >>> benchmark(model="<model-id>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -96,6 +96,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
+ ### Benchmarking in an Offline Environment
+ If you need to benchmark in an offline environment, you need to download the models,
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
+ script. For example to download the model you want and all of the Danish sentiment
+ classification datasets:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+ ```
+
+ Or from a script:
+ ```
+ >>> benchmark(
+ ... model="<model-id>",
+ ... task="sentiment-classification",
+ ... language="da",
+ ... download_only=True,
+ ... )
+ ```
+
+ Please note: Offline benchmarking of adapter models is not currently supported. An
+ internet connection will be required during evaluation. If offline support is important
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
@@ -44,11 +44,11 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru yfirferðir ásamt lyndisgildi þeirra, sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
+ Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
  ```
  - Base prompt template:
  ```
- Yfirferð: {text}
+ Textabrot: {text}
  Lyndi: {label}
  ```
  - Instruction-tuned prompt template:
@@ -117,13 +117,13 @@ When evaluating generative models, we use the following setup (see the
  - Base prompt template:
  ```
  Setning: {text}
- Nefndar einingar: {label}
+ Nafneiningar: {label}
  ```
  - Instruction-tuned prompt template:
  ```
  Setning: {text}
 
- Greinið nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
+ Greindu nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
  ```
  - Label mapping:
  - `B-PER` ➡️ `einstaklingur`
@@ -186,7 +186,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -197,7 +197,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}
 
- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -249,7 +249,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -260,7 +260,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}
 
- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -310,7 +310,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -321,7 +321,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}
 
- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
- version = "16.1.1"
+ version = "16.2.1"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -33,7 +33,7 @@ dependencies = [
 "rouge-score>=0.1.2",
 "bert-score>=0.3.13",
 "levenshtein>=0.24.0",
- "scikit-learn==1.6.1", # Required for loading European values pipeline
+ "scikit-learn==1.6.1",  # Required for loading European values pipeline
 "setuptools>=75.8.2",
 "demjson3>=3.0.6",
 "ollama>=0.5.1",
@@ -45,15 +45,15 @@ dependencies = [
 [project.optional-dependencies]
 generative = [
 "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.10.1; platform_system == 'Linux'",
- "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+ "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
 "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+ "timm>=1.0.19",
 ]
 all = [
 "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.10.1; platform_system == 'Linux'",
- "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+ "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
 "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+ "timm>=1.0.19",
 ]
 
 [project.urls]
@@ -100,6 +100,8 @@ dev-dependencies = [
 "types-ujson>=5.10.0.20240515",
 "types-simplejson>=3.2.0.2025032",
 "debugpy>=1.8.13",
+ "pytest-socket>=0.7.0",
+ "pytest-dependency>=0.6.0",
 ]
 
 [tool.ruff]
@@ -170,6 +172,7 @@ addopts = [
 "--cov=src/euroeval",
 "--color=yes",
 "-vvv",
+ "--allow-unix-socket"
 ]
 xfail_strict = true
 filterwarnings = [
@@ -181,7 +184,7 @@ filterwarnings = [
 "ignore::ResourceWarning",
 "ignore::FutureWarning",
 ]
- log_cli_level = "info"
+ log_cli_level = "INFO"
 testpaths = [
 "tests",
 "src/euroeval",
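The new `pytest-socket` and `pytest-dependency` dev dependencies, together with the `--allow-unix-socket` addopt, presumably exist to exercise the offline (`download_only`) behaviour without network access. A minimal sketch of how `pytest-socket` is typically used; the test name and body are illustrative and not taken from the EuroEval test suite:
```
import socket

import pytest
from pytest_socket import SocketBlockedError


@pytest.mark.disable_socket  # pytest-socket marker: block TCP/UDP sockets in this test
def test_network_access_is_blocked() -> None:
    # With sockets disabled, creating an internet socket raises SocketBlockedError;
    # Unix domain sockets stay usable because of the `--allow-unix-socket` addopt.
    with pytest.raises(SocketBlockedError):
        socket.socket(socket.AF_INET, socket.SOCK_STREAM)
```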
@@ -12,12 +12,13 @@ import warnings
 from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
- warnings.filterwarnings("ignore", category=UserWarning)
- warnings.filterwarnings("ignore", category=FutureWarning)
- logging.getLogger("httpx").setLevel(logging.CRITICAL)
- logging.getLogger("datasets").setLevel(logging.CRITICAL)
- logging.getLogger("vllm").setLevel(logging.CRITICAL)
- os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
+ if os.getenv("FULL_LOG") != "1":
+     warnings.filterwarnings("ignore", category=UserWarning)
+     warnings.filterwarnings("ignore", category=FutureWarning)
+     logging.getLogger("httpx").setLevel(logging.CRITICAL)
+     logging.getLogger("datasets").setLevel(logging.CRITICAL)
+     logging.getLogger("vllm").setLevel(logging.CRITICAL)
+     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
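Per the new guard above, the warning and logger suppression is now opt-out: setting `FULL_LOG=1` before the package is imported keeps the default warning filters and the `httpx`/`datasets`/`vllm` log levels intact. A minimal sketch:
```
>>> import os
>>> os.environ["FULL_LOG"] = "1"  # must be set before importing euroeval to take effect
>>> import euroeval
```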
@@ -6,9 +6,9 @@ import typing as t
 
 import torch
 
- from .data_models import BenchmarkConfig
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams
 from .dataset_configs import get_all_dataset_configs
- from .enums import Device, GenerativeType
+ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
@@ -21,150 +21,66 @@ logger = logging.getLogger("euroeval")
 
 
 def build_benchmark_config(
- progress_bar: bool,
- save_results: bool,
- task: str | list[str] | None,
- dataset: str | list[str] | None,
- language: str | list[str],
- model_language: str | list[str] | None,
- dataset_language: str | list[str] | None,
- device: Device | None,
- batch_size: int,
- raise_errors: bool,
- cache_dir: str,
- api_key: str | None,
- force: bool,
- verbose: bool,
- trust_remote_code: bool,
- clear_model_cache: bool,
- evaluate_test_split: bool,
- few_shot: bool,
- num_iterations: int,
- api_base: str | None,
- api_version: str | None,
- gpu_memory_utilization: float,
- generative_type: GenerativeType | None,
- debug: bool,
- run_with_cli: bool,
- requires_safetensors: bool,
+ benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:
 """Create a benchmark configuration.
 
 Args:
- progress_bar:
- Whether to show a progress bar when running the benchmark.
- save_results:
- Whether to save the benchmark results to a file.
- task:
- The tasks to include for dataset. If None then datasets will not be
- filtered based on their task.
- dataset:
- The datasets to include for task. If None then all datasets will be
- included, limited by the `task` parameter.
- language:
- The language codes of the languages to include, both for models and
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
- to 'all' if all languages should be considered.
- model_language:
- The language codes of the languages to include for models. If None then
- the `language` parameter will be used.
- dataset_language:
- The language codes of the languages to include for datasets. If None then
- the `language` parameter will be used.
- device:
- The device to use for running the models. If None then the device will be
- set automatically.
- batch_size:
- The batch size to use for running the models.
- raise_errors:
- Whether to raise errors when running the benchmark.
- cache_dir:
- The directory to use for caching the models.
- api_key:
- The API key to use for a given inference server.
- force:
- Whether to force the benchmark to run even if the results are already
- cached.
- verbose:
- Whether to print verbose output when running the benchmark. This is
- automatically set if `debug` is True.
- trust_remote_code:
- Whether to trust remote code when running the benchmark.
- clear_model_cache:
- Whether to clear the model cache before running the benchmark.
- evaluate_test_split:
- Whether to use the test split for the datasets.
- few_shot:
- Whether to use few-shot learning for the models.
- num_iterations:
- The number of iterations each model should be evaluated for.
- api_base:
- The base URL for a given inference API. Only relevant if `model` refers to a
- model on an inference API.
- api_version:
- The version of the API to use for a given inference API.
- gpu_memory_utilization:
- The GPU memory utilization to use for vLLM. A larger value will result in
- faster evaluation, but at the risk of running out of GPU memory. Only reduce
- this if you are running out of GPU memory. Only relevant if the model is
- generative.
- generative_type:
- The type of generative model. Only relevant if the model is generative. If
- not specified, the type will be inferred automatically.
- debug:
- Whether to run the benchmark in debug mode.
- run_with_cli:
- Whether the benchmark is being run with the CLI.
- requires_safetensors:
- Whether to only allow evaluations of models stored as safetensors.
+ benchmark_config_params:
+ The parameters for creating the benchmark configuration.
 
 Returns:
 The benchmark configuration.
 """
- language_codes = get_correct_language_codes(language_codes=language)
+ language_codes = get_correct_language_codes(
+ language_codes=benchmark_config_params.language
+ )
 model_languages = prepare_languages(
- language_codes=model_language, default_language_codes=language_codes
+ language_codes=benchmark_config_params.model_language,
+ default_language_codes=language_codes,
 )
 dataset_languages = prepare_languages(
- language_codes=dataset_language, default_language_codes=language_codes
+ language_codes=benchmark_config_params.dataset_language,
+ default_language_codes=language_codes,
 )
 
 tasks, datasets = prepare_tasks_and_datasets(
- task=task, dataset=dataset, dataset_languages=dataset_languages
+ task=benchmark_config_params.task,
+ dataset=benchmark_config_params.dataset,
+ dataset_languages=dataset_languages,
 )
 
- torch_device = prepare_device(device=device)
-
- # Set variable with number of iterations
- if hasattr(sys, "_called_from_test"):
- num_iterations = 1
-
 return BenchmarkConfig(
 model_languages=model_languages,
 dataset_languages=dataset_languages,
 tasks=tasks,
 datasets=datasets,
- batch_size=batch_size,
- raise_errors=raise_errors,
- cache_dir=cache_dir,
- api_key=api_key,
- force=force,
- progress_bar=progress_bar,
- save_results=save_results,
- verbose=verbose or debug,
- device=torch_device,
- trust_remote_code=trust_remote_code,
- clear_model_cache=clear_model_cache,
- evaluate_test_split=evaluate_test_split,
- few_shot=few_shot,
- num_iterations=num_iterations,
- api_base=api_base,
- api_version=api_version,
- gpu_memory_utilization=gpu_memory_utilization,
- generative_type=generative_type,
- debug=debug,
- run_with_cli=run_with_cli,
- requires_safetensors=requires_safetensors,
+ batch_size=benchmark_config_params.batch_size,
+ raise_errors=benchmark_config_params.raise_errors,
+ cache_dir=benchmark_config_params.cache_dir,
+ api_key=benchmark_config_params.api_key,
+ force=benchmark_config_params.force,
+ progress_bar=benchmark_config_params.progress_bar,
+ save_results=benchmark_config_params.save_results,
+ verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+ device=prepare_device(device=benchmark_config_params.device),
+ trust_remote_code=benchmark_config_params.trust_remote_code,
+ clear_model_cache=benchmark_config_params.clear_model_cache,
+ evaluate_test_split=benchmark_config_params.evaluate_test_split,
+ few_shot=benchmark_config_params.few_shot,
+ num_iterations=(
+ 1
+ if hasattr(sys, "_called_from_test")
+ else benchmark_config_params.num_iterations
+ ),
+ api_base=benchmark_config_params.api_base,
+ api_version=benchmark_config_params.api_version,
+ gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+ generative_type=benchmark_config_params.generative_type,
+ debug=benchmark_config_params.debug,
+ run_with_cli=benchmark_config_params.run_with_cli,
+ requires_safetensors=benchmark_config_params.requires_safetensors,
+ download_only=benchmark_config_params.download_only,
 )
 
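A minimal sketch of how the refactored factory is presumably called after this change: the former keyword arguments are now bundled into a single `BenchmarkConfigParams` object from `euroeval.data_models`. The field values below are placeholders, and the assumption that the remaining fields have defaults is illustrative rather than taken from the package:
```
>>> from euroeval.benchmark_config_factory import build_benchmark_config
>>> from euroeval.data_models import BenchmarkConfigParams
>>> params = BenchmarkConfigParams(
...     language="da",
...     task="sentiment-classification",
...     download_only=True,  # new field, forwarded to BenchmarkConfig above
...     # remaining fields (batch_size, cache_dir, device, ...) as required
... )
>>> config = build_benchmark_config(benchmark_config_params=params)
```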