EuroEval 16.2.0__tar.gz → 16.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (274)
  1. {euroeval-16.2.0 → euroeval-16.2.1}/CHANGELOG.md +6 -0
  2. {euroeval-16.2.0 → euroeval-16.2.1}/PKG-INFO +1 -1
  3. {euroeval-16.2.0 → euroeval-16.2.1}/pyproject.toml +1 -1
  4. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_config_factory.py +41 -129
  5. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmarker.py +11 -2
  6. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/data_models.py +1 -0
  7. {euroeval-16.2.0 → euroeval-16.2.1}/uv.lock +1 -1
  8. {euroeval-16.2.0 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  9. {euroeval-16.2.0 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  10. {euroeval-16.2.0 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  11. {euroeval-16.2.0 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
  12. {euroeval-16.2.0 → euroeval-16.2.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  13. {euroeval-16.2.0 → euroeval-16.2.1}/.github/workflows/ci.yaml +0 -0
  14. {euroeval-16.2.0 → euroeval-16.2.1}/.gitignore +0 -0
  15. {euroeval-16.2.0 → euroeval-16.2.1}/.pre-commit-config.yaml +0 -0
  16. {euroeval-16.2.0 → euroeval-16.2.1}/CITATION.cff +0 -0
  17. {euroeval-16.2.0 → euroeval-16.2.1}/CODE_OF_CONDUCT.md +0 -0
  18. {euroeval-16.2.0 → euroeval-16.2.1}/CONTRIBUTING.md +0 -0
  19. {euroeval-16.2.0 → euroeval-16.2.1}/Dockerfile.cuda +0 -0
  20. {euroeval-16.2.0 → euroeval-16.2.1}/LICENSE +0 -0
  21. {euroeval-16.2.0 → euroeval-16.2.1}/NEW_DATASET_GUIDE.md +0 -0
  22. {euroeval-16.2.0 → euroeval-16.2.1}/README.md +0 -0
  23. {euroeval-16.2.0 → euroeval-16.2.1}/docs/CNAME +0 -0
  24. {euroeval-16.2.0 → euroeval-16.2.1}/docs/README.md +0 -0
  25. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/README.md +0 -0
  26. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/danish.md +0 -0
  27. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/dutch.md +0 -0
  28. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/english.md +0 -0
  29. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/estonian.md +0 -0
  30. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/faroese.md +0 -0
  31. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/finnish.md +0 -0
  32. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/french.md +0 -0
  33. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/german.md +0 -0
  34. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/icelandic.md +0 -0
  35. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/italian.md +0 -0
  36. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/latvian.md +0 -0
  37. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/norwegian.md +0 -0
  38. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/polish.md +0 -0
  39. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/portuguese.md +0 -0
  40. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/spanish.md +0 -0
  41. {euroeval-16.2.0 → euroeval-16.2.1}/docs/datasets/swedish.md +0 -0
  42. {euroeval-16.2.0 → euroeval-16.2.1}/docs/extras/radial_plotter.md +0 -0
  43. {euroeval-16.2.0 → euroeval-16.2.1}/docs/faq.md +0 -0
  44. {euroeval-16.2.0 → euroeval-16.2.1}/docs/gfx/favicon.png +0 -0
  45. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  46. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  47. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/english.md +0 -0
  48. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/estonian.md +0 -0
  49. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  50. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
  51. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/french.md +0 -0
  52. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/german.md +0 -0
  53. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  54. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  55. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  56. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  57. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  58. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  59. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Multilingual/european.md +0 -0
  60. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Multilingual/finnic.md +0 -0
  61. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  62. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  63. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  64. {euroeval-16.2.0 → euroeval-16.2.1}/docs/leaderboards/README.md +0 -0
  65. {euroeval-16.2.0 → euroeval-16.2.1}/docs/methodology.md +0 -0
  66. {euroeval-16.2.0 → euroeval-16.2.1}/docs/python-package.md +0 -0
  67. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/README.md +0 -0
  68. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/common-sense-reasoning.md +0 -0
  69. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/knowledge.md +0 -0
  70. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/linguistic-acceptability.md +0 -0
  71. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/named-entity-recognition.md +0 -0
  72. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/reading-comprehension.md +0 -0
  73. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/sentiment-classification.md +0 -0
  74. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/speed.md +0 -0
  75. {euroeval-16.2.0 → euroeval-16.2.1}/docs/tasks/summarization.md +0 -0
  76. {euroeval-16.2.0 → euroeval-16.2.1}/gfx/euroeval.png +0 -0
  77. {euroeval-16.2.0 → euroeval-16.2.1}/gfx/euroeval.xcf +0 -0
  78. {euroeval-16.2.0 → euroeval-16.2.1}/gfx/scandeval.png +0 -0
  79. {euroeval-16.2.0 → euroeval-16.2.1}/makefile +0 -0
  80. {euroeval-16.2.0 → euroeval-16.2.1}/mkdocs.yaml +0 -0
  81. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/__init__.py +0 -0
  82. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  83. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/base.py +0 -0
  84. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
  85. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/hf.py +0 -0
  86. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/litellm.py +0 -0
  87. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/benchmark_modules/vllm.py +0 -0
  88. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/callbacks.py +0 -0
  89. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/cli.py +0 -0
  90. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/constants.py +0 -0
  91. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/data_loading.py +0 -0
  92. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/__init__.py +0 -0
  93. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/danish.py +0 -0
  94. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/dutch.py +0 -0
  95. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/english.py +0 -0
  96. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/estonian.py +0 -0
  97. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/faroese.py +0 -0
  98. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/finnish.py +0 -0
  99. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/french.py +0 -0
  100. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/german.py +0 -0
  101. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
  102. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/italian.py +0 -0
  103. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/latvian.py +0 -0
  104. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
  105. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/polish.py +0 -0
  106. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/portuguese.py +0 -0
  107. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/spanish.py +0 -0
  108. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/dataset_configs/swedish.py +0 -0
  109. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/enums.py +0 -0
  110. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/exceptions.py +0 -0
  111. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/finetuning.py +0 -0
  112. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/generation.py +0 -0
  113. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/generation_utils.py +0 -0
  114. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/languages.py +0 -0
  115. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/__init__.py +0 -0
  116. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/base.py +0 -0
  117. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/huggingface.py +0 -0
  118. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
  119. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/pipeline.py +0 -0
  120. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/metrics/speed.py +0 -0
  121. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/model_cache.py +0 -0
  122. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/model_config.py +0 -0
  123. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/model_loading.py +0 -0
  124. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  125. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  126. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  127. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  128. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  129. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  130. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/prompt_templates/summarization.py +0 -0
  131. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/scores.py +0 -0
  132. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/speed_benchmark.py +0 -0
  133. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  134. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  135. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
  136. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/sequence_classification.py +0 -0
  137. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  138. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
  139. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/tasks.py +0 -0
  140. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/tokenisation_utils.py +0 -0
  141. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/types.py +0 -0
  142. {euroeval-16.2.0 → euroeval-16.2.1}/src/euroeval/utils.py +0 -0
  143. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/constants.py +0 -0
  144. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_allocine.py +0 -0
  145. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_angry_tweets.py +0 -0
  146. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_arc.py +0 -0
  147. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_arc_is.py +0 -0
  148. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_belebele.py +0 -0
  149. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_boolq_pt.py +0 -0
  150. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_cnn_dailymail.py +0 -0
  151. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_conll_en.py +0 -0
  152. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_conll_es.py +0 -0
  153. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_conll_nl.py +0 -0
  154. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_copa_lv.py +0 -0
  155. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_dane.py +0 -0
  156. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  157. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_dansk.py +0 -0
  158. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_danske_talemaader.py +0 -0
  159. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  160. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_dbrd.py +0 -0
  161. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_dutch_cola.py +0 -0
  162. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_eltec.py +0 -0
  163. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_err_news.py +0 -0
  164. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_estner.py +0 -0
  165. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_estonian_valence.py +0 -0
  166. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_european_values.py +0 -0
  167. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_exam_et.py +0 -0
  168. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_fone.py +0 -0
  169. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_foqa.py +0 -0
  170. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_fosent.py +0 -0
  171. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_fquad.py +0 -0
  172. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_fullstack_ner.py +0 -0
  173. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_germanquad.py +0 -0
  174. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_germeval.py +0 -0
  175. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_goldenswag.py +0 -0
  176. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_grammar_et.py +0 -0
  177. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_harem.py +0 -0
  178. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_hellaswag.py +0 -0
  179. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_hellaswag_fi.py +0 -0
  180. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  181. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_ice_linguistic.py +0 -0
  182. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  183. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  184. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_icelandic_qa.py +0 -0
  185. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_icesum.py +0 -0
  186. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_idioms_no.py +0 -0
  187. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_ilpost_sum.py +0 -0
  188. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_jentoft.py +0 -0
  189. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_kpwr_ner.py +0 -0
  190. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
  191. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  192. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_life_in_the_uk.py +0 -0
  193. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_llmzszl.py +0 -0
  194. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mim_gold_ner.py +0 -0
  195. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mlqa_es.py +0 -0
  196. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mlsum_de.py +0 -0
  197. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mlsum_es.py +0 -0
  198. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mmlu.py +0 -0
  199. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_mmlu_lv.py +0 -0
  200. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_multi_wiki_qa.py +0 -0
  201. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_multinerd-it.py +0 -0
  202. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_no_cola.py +0 -0
  203. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_no_sammendrag.py +0 -0
  204. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  205. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_nordjylland_news.py +0 -0
  206. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_norec.py +0 -0
  207. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_norglm_multiqa.py +0 -0
  208. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_norglm_multisum.py +0 -0
  209. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_norne.py +0 -0
  210. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_norquad.py +0 -0
  211. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_nqii.py +0 -0
  212. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  213. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_orange_sum.py +0 -0
  214. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_personal_sum.py +0 -0
  215. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_polemo2.py +0 -0
  216. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_poquad.py +0 -0
  217. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_psc.py +0 -0
  218. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_publico.py +0 -0
  219. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_rrn.py +0 -0
  220. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_sb10k.py +0 -0
  221. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_scala.py +0 -0
  222. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_scandiqa.py +0 -0
  223. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_scandisent_fi.py +0 -0
  224. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_schibsted.py +0 -0
  225. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  226. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_sentipolc16.py +0 -0
  227. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_squad.py +0 -0
  228. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_squad_it.py +0 -0
  229. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_squad_nl.py +0 -0
  230. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_squad_nl_old.py +0 -0
  231. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_sst2_pt.py +0 -0
  232. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_sst5.py +0 -0
  233. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_suc3.py +0 -0
  234. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_swedish_skolprov.py +0 -0
  235. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_swedn.py +0 -0
  236. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_swerec.py +0 -0
  237. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_trivia_et.py +0 -0
  238. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_turku_ner_fi.py +0 -0
  239. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_tydiqa_fi.py +0 -0
  240. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  241. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_wikiann_lv.py +0 -0
  242. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_wikineural-it.py +0 -0
  243. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_winogrande.py +0 -0
  244. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_winogrande_et.py +0 -0
  245. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_winogrande_is.py +0 -0
  246. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_xlsum_fi.py +0 -0
  247. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/create_xquad.py +0 -0
  248. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/fix_dot_env_file.py +0 -0
  249. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/load_ud_pos.py +0 -0
  250. {euroeval-16.2.0 → euroeval-16.2.1}/src/scripts/versioning.py +0 -0
  251. {euroeval-16.2.0 → euroeval-16.2.1}/tests/__init__.py +0 -0
  252. {euroeval-16.2.0 → euroeval-16.2.1}/tests/conftest.py +0 -0
  253. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_benchmark_config_factory.py +0 -0
  254. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_benchmark_modules/__init__.py +0 -0
  255. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  256. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_benchmarker.py +0 -0
  257. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_callbacks.py +0 -0
  258. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_cli.py +0 -0
  259. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_constants.py +0 -0
  260. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_data_loading.py +0 -0
  261. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_data_models.py +0 -0
  262. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_dataset_configs.py +0 -0
  263. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_enums.py +0 -0
  264. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_exceptions.py +0 -0
  265. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_finetuning.py +0 -0
  266. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_languages.py +0 -0
  267. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_model_config.py +0 -0
  268. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_model_loading.py +0 -0
  269. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_scores.py +0 -0
  270. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_speed_benchmark.py +0 -0
  271. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_tasks.py +0 -0
  272. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_tokenisation_utils.py +0 -0
  273. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_types.py +0 -0
  274. {euroeval-16.2.0 → euroeval-16.2.1}/tests/test_utils.py +0 -0
@@ -10,6 +10,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
10
10
 
11
11
 
12
12
 
13
+ ## [v16.2.1] - 2025-09-15
14
+ ### Fixed
15
+ - Some of the `download_only` arguments were missing in the code, and have now been
16
+ added.
17
+
18
+
13
19
  ## [v16.2.0] - 2025-09-15
14
20
  ### Added
15
21
  - Now supports evaluating models in an offline environment. This is done by first
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 16.2.0
3
+ Version: 16.2.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "EuroEval"
3
- version = "16.2.0"
3
+ version = "16.2.1"
4
4
  description = "The robust European language model benchmark."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -6,9 +6,9 @@ import typing as t
6
6
 
7
7
  import torch
8
8
 
9
- from .data_models import BenchmarkConfig
9
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams
10
10
  from .dataset_configs import get_all_dataset_configs
11
- from .enums import Device, GenerativeType
11
+ from .enums import Device
12
12
  from .exceptions import InvalidBenchmark
13
13
  from .languages import get_all_languages
14
14
  from .tasks import SPEED, get_all_tasks
@@ -21,154 +21,66 @@ logger = logging.getLogger("euroeval")
21
21
 
22
22
 
23
23
  def build_benchmark_config(
24
- progress_bar: bool,
25
- save_results: bool,
26
- task: str | list[str] | None,
27
- dataset: str | list[str] | None,
28
- language: str | list[str],
29
- model_language: str | list[str] | None,
30
- dataset_language: str | list[str] | None,
31
- device: Device | None,
32
- batch_size: int,
33
- raise_errors: bool,
34
- cache_dir: str,
35
- api_key: str | None,
36
- force: bool,
37
- verbose: bool,
38
- trust_remote_code: bool,
39
- clear_model_cache: bool,
40
- evaluate_test_split: bool,
41
- few_shot: bool,
42
- num_iterations: int,
43
- api_base: str | None,
44
- api_version: str | None,
45
- gpu_memory_utilization: float,
46
- generative_type: GenerativeType | None,
47
- debug: bool,
48
- run_with_cli: bool,
49
- requires_safetensors: bool,
50
- download_only: bool,
24
+ benchmark_config_params: BenchmarkConfigParams,
51
25
  ) -> BenchmarkConfig:
52
26
  """Create a benchmark configuration.
53
27
 
54
28
  Args:
55
- progress_bar:
56
- Whether to show a progress bar when running the benchmark.
57
- save_results:
58
- Whether to save the benchmark results to a file.
59
- task:
60
- The tasks to include for dataset. If None then datasets will not be
61
- filtered based on their task.
62
- dataset:
63
- The datasets to include for task. If None then all datasets will be
64
- included, limited by the `task` parameter.
65
- language:
66
- The language codes of the languages to include, both for models and
67
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
68
- to 'all' if all languages should be considered.
69
- model_language:
70
- The language codes of the languages to include for models. If None then
71
- the `language` parameter will be used.
72
- dataset_language:
73
- The language codes of the languages to include for datasets. If None then
74
- the `language` parameter will be used.
75
- device:
76
- The device to use for running the models. If None then the device will be
77
- set automatically.
78
- batch_size:
79
- The batch size to use for running the models.
80
- raise_errors:
81
- Whether to raise errors when running the benchmark.
82
- cache_dir:
83
- The directory to use for caching the models.
84
- api_key:
85
- The API key to use for a given inference server.
86
- force:
87
- Whether to force the benchmark to run even if the results are already
88
- cached.
89
- verbose:
90
- Whether to print verbose output when running the benchmark. This is
91
- automatically set if `debug` is True.
92
- trust_remote_code:
93
- Whether to trust remote code when running the benchmark.
94
- clear_model_cache:
95
- Whether to clear the model cache before running the benchmark.
96
- evaluate_test_split:
97
- Whether to use the test split for the datasets.
98
- few_shot:
99
- Whether to use few-shot learning for the models.
100
- num_iterations:
101
- The number of iterations each model should be evaluated for.
102
- api_base:
103
- The base URL for a given inference API. Only relevant if `model` refers to a
104
- model on an inference API.
105
- api_version:
106
- The version of the API to use for a given inference API.
107
- gpu_memory_utilization:
108
- The GPU memory utilization to use for vLLM. A larger value will result in
109
- faster evaluation, but at the risk of running out of GPU memory. Only reduce
110
- this if you are running out of GPU memory. Only relevant if the model is
111
- generative.
112
- generative_type:
113
- The type of generative model. Only relevant if the model is generative. If
114
- not specified, the type will be inferred automatically.
115
- debug:
116
- Whether to run the benchmark in debug mode.
117
- run_with_cli:
118
- Whether the benchmark is being run with the CLI.
119
- requires_safetensors:
120
- Whether to only allow evaluations of models stored as safetensors.
121
- download_only:
122
- Whether to only download the requested model weights and datasets.
29
+ benchmark_config_params:
30
+ The parameters for creating the benchmark configuration.
123
31
 
124
32
  Returns:
125
33
  The benchmark configuration.
126
34
  """
127
- language_codes = get_correct_language_codes(language_codes=language)
35
+ language_codes = get_correct_language_codes(
36
+ language_codes=benchmark_config_params.language
37
+ )
128
38
  model_languages = prepare_languages(
129
- language_codes=model_language, default_language_codes=language_codes
39
+ language_codes=benchmark_config_params.model_language,
40
+ default_language_codes=language_codes,
130
41
  )
131
42
  dataset_languages = prepare_languages(
132
- language_codes=dataset_language, default_language_codes=language_codes
43
+ language_codes=benchmark_config_params.dataset_language,
44
+ default_language_codes=language_codes,
133
45
  )
134
46
 
135
47
  tasks, datasets = prepare_tasks_and_datasets(
136
- task=task, dataset=dataset, dataset_languages=dataset_languages
48
+ task=benchmark_config_params.task,
49
+ dataset=benchmark_config_params.dataset,
50
+ dataset_languages=dataset_languages,
137
51
  )
138
52
 
139
- torch_device = prepare_device(device=device)
140
-
141
- # Set variable with number of iterations
142
- if hasattr(sys, "_called_from_test"):
143
- num_iterations = 1
144
-
145
53
  return BenchmarkConfig(
146
54
  model_languages=model_languages,
147
55
  dataset_languages=dataset_languages,
148
56
  tasks=tasks,
149
57
  datasets=datasets,
150
- batch_size=batch_size,
151
- raise_errors=raise_errors,
152
- cache_dir=cache_dir,
153
- api_key=api_key,
154
- force=force,
155
- progress_bar=progress_bar,
156
- save_results=save_results,
157
- verbose=verbose or debug,
158
- device=torch_device,
159
- trust_remote_code=trust_remote_code,
160
- clear_model_cache=clear_model_cache,
161
- evaluate_test_split=evaluate_test_split,
162
- few_shot=few_shot,
163
- num_iterations=num_iterations,
164
- api_base=api_base,
165
- api_version=api_version,
166
- gpu_memory_utilization=gpu_memory_utilization,
167
- generative_type=generative_type,
168
- debug=debug,
169
- run_with_cli=run_with_cli,
170
- requires_safetensors=requires_safetensors,
171
- download_only=download_only,
58
+ batch_size=benchmark_config_params.batch_size,
59
+ raise_errors=benchmark_config_params.raise_errors,
60
+ cache_dir=benchmark_config_params.cache_dir,
61
+ api_key=benchmark_config_params.api_key,
62
+ force=benchmark_config_params.force,
63
+ progress_bar=benchmark_config_params.progress_bar,
64
+ save_results=benchmark_config_params.save_results,
65
+ verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
66
+ device=prepare_device(device=benchmark_config_params.device),
67
+ trust_remote_code=benchmark_config_params.trust_remote_code,
68
+ clear_model_cache=benchmark_config_params.clear_model_cache,
69
+ evaluate_test_split=benchmark_config_params.evaluate_test_split,
70
+ few_shot=benchmark_config_params.few_shot,
71
+ num_iterations=(
72
+ 1
73
+ if hasattr(sys, "_called_from_test")
74
+ else benchmark_config_params.num_iterations
75
+ ),
76
+ api_base=benchmark_config_params.api_base,
77
+ api_version=benchmark_config_params.api_version,
78
+ gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
79
+ generative_type=benchmark_config_params.generative_type,
80
+ debug=benchmark_config_params.debug,
81
+ run_with_cli=benchmark_config_params.run_with_cli,
82
+ requires_safetensors=benchmark_config_params.requires_safetensors,
83
+ download_only=benchmark_config_params.download_only,
172
84
  )
173
85
 
174
86
 
@@ -223,13 +223,14 @@ class Benchmarker:
223
223
  api_version=api_version,
224
224
  gpu_memory_utilization=gpu_memory_utilization,
225
225
  generative_type=generative_type,
226
+ download_only=download_only,
226
227
  debug=debug,
227
228
  run_with_cli=run_with_cli,
228
229
  requires_safetensors=requires_safetensors,
229
230
  )
230
231
 
231
232
  self.benchmark_config = build_benchmark_config(
232
- **self.benchmark_config_default_params.model_dump()
233
+ benchmark_config_params=self.benchmark_config_default_params
233
234
  )
234
235
 
235
236
  # Initialise variable storing model lists, so we only have to fetch it once
@@ -339,6 +340,7 @@ class Benchmarker:
339
340
  few_shot: bool | None = None,
340
341
  num_iterations: int | None = None,
341
342
  requires_safetensors: bool | None = None,
343
+ download_only: bool | None = None,
342
344
  ) -> list[BenchmarkResult]:
343
345
  """Benchmarks models on datasets.
344
346
 
@@ -454,6 +456,7 @@ class Benchmarker:
454
456
  few_shot=few_shot,
455
457
  num_iterations=num_iterations,
456
458
  requires_safetensors=requires_safetensors,
459
+ download_only=download_only,
457
460
  )
458
461
 
459
462
  adjust_logging_level(verbose=benchmark_config.verbose)
@@ -677,6 +680,7 @@ class Benchmarker:
677
680
  debug: bool | None = None,
678
681
  run_with_cli: bool | None = None,
679
682
  requires_safetensors: bool | None = None,
683
+ download_only: bool | None = None,
680
684
  ) -> "BenchmarkConfig":
681
685
  """Get an updated benchmark configuration.
682
686
 
@@ -756,6 +760,9 @@ class Benchmarker:
756
760
  download_only:
757
761
  Whether to only download the models without evaluating them. If None,
758
762
  then this value will not be updated.
763
+ download_only:
764
+ Whether to only download models and datasets without performing any
765
+ benchmarking. If None, then this value will not be updated.
759
766
 
760
767
  Returns:
761
768
  The updated benchmark configuration.
@@ -812,8 +819,10 @@ class Benchmarker:
812
819
  benchmark_config_params.run_with_cli = run_with_cli
813
820
  if requires_safetensors is not None:
814
821
  benchmark_config_params.requires_safetensors = requires_safetensors
822
+ if download_only is not None:
823
+ benchmark_config_params.download_only = download_only
815
824
 
816
- return build_benchmark_config(**benchmark_config_params.model_dump())
825
+ return build_benchmark_config(benchmark_config_params=benchmark_config_params)
817
826
 
818
827
  def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
819
828
  """Prepare the model ID(s) to be benchmarked.
@@ -289,6 +289,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
289
289
  api_version: str | None
290
290
  gpu_memory_utilization: float
291
291
  generative_type: GenerativeType | None
292
+ download_only: bool
292
293
  debug: bool
293
294
  run_with_cli: bool
294
295
  requires_safetensors: bool
@@ -894,7 +894,7 @@ wheels = [
894
894
 
895
895
  [[package]]
896
896
  name = "euroeval"
897
- version = "16.2.0"
897
+ version = "16.2.1"
898
898
  source = { editable = "." }
899
899
  dependencies = [
900
900
  { name = "accelerate" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes