EuroEval 16.1.0.tar.gz → 16.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

Files changed (284)
  1. {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +1 -0
  2. euroeval-16.2.0/.github/ISSUE_TEMPLATE/language_request.yaml +49 -0
  3. {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +1 -0
  4. {euroeval-16.1.0 → euroeval-16.2.0}/.gitignore +3 -0
  5. {euroeval-16.1.0 → euroeval-16.2.0}/.pre-commit-config.yaml +1 -1
  6. {euroeval-16.1.0 → euroeval-16.2.0}/CHANGELOG.md +36 -0
  7. {euroeval-16.1.0 → euroeval-16.2.0}/PKG-INFO +31 -7
  8. {euroeval-16.1.0 → euroeval-16.2.0}/README.md +26 -2
  9. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/icelandic.md +10 -10
  10. {euroeval-16.1.0 → euroeval-16.2.0}/pyproject.toml +10 -7
  11. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/__init__.py +7 -6
  12. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_config_factory.py +4 -0
  13. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/hf.py +31 -16
  14. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/litellm.py +2 -0
  15. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/vllm.py +24 -9
  16. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmarker.py +127 -14
  17. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/cli.py +8 -0
  18. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/data_models.py +4 -0
  19. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/generation.py +3 -1
  20. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/generation_utils.py +10 -4
  21. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/base.py +12 -0
  22. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/huggingface.py +23 -2
  23. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/linguistic_acceptability.py +6 -5
  24. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/named_entity_recognition.py +3 -3
  25. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/sentiment_classification.py +5 -5
  26. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/sequence_classification.py +1 -1
  27. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/tasks.py +3 -0
  28. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/tokenisation_utils.py +12 -13
  29. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/types.py +2 -2
  30. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/utils.py +77 -5
  31. {euroeval-16.1.0 → euroeval-16.2.0}/tests/conftest.py +1 -0
  32. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmarker.py +56 -0
  33. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_cli.py +2 -0
  34. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_data_loading.py +10 -1
  35. {euroeval-16.1.0 → euroeval-16.2.0}/uv.lock +668 -522
  36. euroeval-16.1.0/generated_contracts/employment_contract_001.md +0 -137
  37. euroeval-16.1.0/generated_contracts/employment_contract_002.md +0 -152
  38. euroeval-16.1.0/generated_contracts/employment_contract_003.md +0 -144
  39. euroeval-16.1.0/generated_contracts/employment_contract_004.md +0 -139
  40. euroeval-16.1.0/generated_contracts/employment_contract_005.md +0 -146
  41. euroeval-16.1.0/generated_contracts/employment_contract_006.md +0 -127
  42. euroeval-16.1.0/generated_contracts/employment_contract_007.md +0 -147
  43. euroeval-16.1.0/generated_contracts/employment_contract_008.md +0 -136
  44. euroeval-16.1.0/generated_contracts/employment_contract_009.md +0 -143
  45. euroeval-16.1.0/generated_contracts/employment_contract_010.md +0 -148
  46. {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  47. {euroeval-16.1.0 → euroeval-16.2.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  48. {euroeval-16.1.0 → euroeval-16.2.0}/.github/workflows/ci.yaml +0 -0
  49. {euroeval-16.1.0 → euroeval-16.2.0}/CITATION.cff +0 -0
  50. {euroeval-16.1.0 → euroeval-16.2.0}/CODE_OF_CONDUCT.md +0 -0
  51. {euroeval-16.1.0 → euroeval-16.2.0}/CONTRIBUTING.md +0 -0
  52. {euroeval-16.1.0 → euroeval-16.2.0}/Dockerfile.cuda +0 -0
  53. {euroeval-16.1.0 → euroeval-16.2.0}/LICENSE +0 -0
  54. {euroeval-16.1.0 → euroeval-16.2.0}/NEW_DATASET_GUIDE.md +0 -0
  55. {euroeval-16.1.0 → euroeval-16.2.0}/docs/CNAME +0 -0
  56. {euroeval-16.1.0 → euroeval-16.2.0}/docs/README.md +0 -0
  57. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/README.md +0 -0
  58. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/danish.md +0 -0
  59. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/dutch.md +0 -0
  60. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/english.md +0 -0
  61. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/estonian.md +0 -0
  62. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/faroese.md +0 -0
  63. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/finnish.md +0 -0
  64. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/french.md +0 -0
  65. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/german.md +0 -0
  66. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/italian.md +0 -0
  67. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/latvian.md +0 -0
  68. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/norwegian.md +0 -0
  69. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/polish.md +0 -0
  70. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/portuguese.md +0 -0
  71. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/spanish.md +0 -0
  72. {euroeval-16.1.0 → euroeval-16.2.0}/docs/datasets/swedish.md +0 -0
  73. {euroeval-16.1.0 → euroeval-16.2.0}/docs/extras/radial_plotter.md +0 -0
  74. {euroeval-16.1.0 → euroeval-16.2.0}/docs/faq.md +0 -0
  75. {euroeval-16.1.0 → euroeval-16.2.0}/docs/gfx/favicon.png +0 -0
  76. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  77. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  78. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/english.md +0 -0
  79. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/estonian.md +0 -0
  80. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  81. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  82. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/french.md +0 -0
  83. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/german.md +0 -0
  84. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  85. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  86. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  87. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  88. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  89. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  90. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/european.md +0 -0
  91. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/finnic.md +0 -0
  92. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  93. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  94. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  95. {euroeval-16.1.0 → euroeval-16.2.0}/docs/leaderboards/README.md +0 -0
  96. {euroeval-16.1.0 → euroeval-16.2.0}/docs/methodology.md +0 -0
  97. {euroeval-16.1.0 → euroeval-16.2.0}/docs/python-package.md +0 -0
  98. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/README.md +0 -0
  99. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/common-sense-reasoning.md +0 -0
  100. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/knowledge.md +0 -0
  101. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/linguistic-acceptability.md +0 -0
  102. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/named-entity-recognition.md +0 -0
  103. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/reading-comprehension.md +0 -0
  104. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/sentiment-classification.md +0 -0
  105. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/speed.md +0 -0
  106. {euroeval-16.1.0 → euroeval-16.2.0}/docs/tasks/summarization.md +0 -0
  107. {euroeval-16.1.0 → euroeval-16.2.0}/gfx/euroeval.png +0 -0
  108. {euroeval-16.1.0 → euroeval-16.2.0}/gfx/euroeval.xcf +0 -0
  109. {euroeval-16.1.0 → euroeval-16.2.0}/gfx/scandeval.png +0 -0
  110. {euroeval-16.1.0 → euroeval-16.2.0}/makefile +0 -0
  111. {euroeval-16.1.0 → euroeval-16.2.0}/mkdocs.yaml +0 -0
  112. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/__init__.py +0 -0
  113. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/base.py +0 -0
  114. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/benchmark_modules/fresh.py +0 -0
  115. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/callbacks.py +0 -0
  116. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/constants.py +0 -0
  117. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/data_loading.py +0 -0
  118. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/__init__.py +0 -0
  119. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/danish.py +0 -0
  120. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/dutch.py +0 -0
  121. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/english.py +0 -0
  122. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/estonian.py +0 -0
  123. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/faroese.py +0 -0
  124. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/finnish.py +0 -0
  125. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/french.py +0 -0
  126. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/german.py +0 -0
  127. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/icelandic.py +0 -0
  128. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/italian.py +0 -0
  129. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/latvian.py +0 -0
  130. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/norwegian.py +0 -0
  131. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/polish.py +0 -0
  132. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/portuguese.py +0 -0
  133. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/spanish.py +0 -0
  134. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/dataset_configs/swedish.py +0 -0
  135. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/enums.py +0 -0
  136. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/exceptions.py +0 -0
  137. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/finetuning.py +0 -0
  138. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/languages.py +0 -0
  139. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/__init__.py +0 -0
  140. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
  141. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/pipeline.py +0 -0
  142. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/metrics/speed.py +0 -0
  143. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_cache.py +0 -0
  144. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_config.py +0 -0
  145. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/model_loading.py +0 -0
  146. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/__init__.py +0 -0
  147. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  148. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  149. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/prompt_templates/summarization.py +0 -0
  150. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/scores.py +0 -0
  151. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/speed_benchmark.py +0 -0
  152. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/__init__.py +0 -0
  153. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  154. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/question_answering.py +0 -0
  155. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  156. {euroeval-16.1.0 → euroeval-16.2.0}/src/euroeval/task_group_utils/token_classification.py +0 -0
  157. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/constants.py +0 -0
  158. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_allocine.py +0 -0
  159. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_angry_tweets.py +0 -0
  160. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_arc.py +0 -0
  161. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_arc_is.py +0 -0
  162. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_belebele.py +0 -0
  163. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_boolq_pt.py +0 -0
  164. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_cnn_dailymail.py +0 -0
  165. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_en.py +0 -0
  166. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_es.py +0 -0
  167. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_conll_nl.py +0 -0
  168. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_copa_lv.py +0 -0
  169. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dane.py +0 -0
  170. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  171. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dansk.py +0 -0
  172. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danske_talemaader.py +0 -0
  173. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  174. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dbrd.py +0 -0
  175. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_dutch_cola.py +0 -0
  176. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_eltec.py +0 -0
  177. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_err_news.py +0 -0
  178. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_estner.py +0 -0
  179. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_estonian_valence.py +0 -0
  180. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_european_values.py +0 -0
  181. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_exam_et.py +0 -0
  182. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fone.py +0 -0
  183. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_foqa.py +0 -0
  184. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fosent.py +0 -0
  185. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fquad.py +0 -0
  186. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_fullstack_ner.py +0 -0
  187. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_germanquad.py +0 -0
  188. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_germeval.py +0 -0
  189. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_goldenswag.py +0 -0
  190. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_grammar_et.py +0 -0
  191. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_harem.py +0 -0
  192. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hellaswag.py +0 -0
  193. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hellaswag_fi.py +0 -0
  194. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  195. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_ice_linguistic.py +0 -0
  196. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  197. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  198. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icelandic_qa.py +0 -0
  199. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_icesum.py +0 -0
  200. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_idioms_no.py +0 -0
  201. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_ilpost_sum.py +0 -0
  202. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_jentoft.py +0 -0
  203. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_kpwr_ner.py +0 -0
  204. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_latvian_lsm_summary.py +0 -0
  205. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  206. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_life_in_the_uk.py +0 -0
  207. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_llmzszl.py +0 -0
  208. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mim_gold_ner.py +0 -0
  209. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlqa_es.py +0 -0
  210. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlsum_de.py +0 -0
  211. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mlsum_es.py +0 -0
  212. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mmlu.py +0 -0
  213. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_mmlu_lv.py +0 -0
  214. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_multi_wiki_qa.py +0 -0
  215. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_multinerd-it.py +0 -0
  216. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_no_cola.py +0 -0
  217. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_no_sammendrag.py +0 -0
  218. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  219. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nordjylland_news.py +0 -0
  220. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norec.py +0 -0
  221. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norglm_multiqa.py +0 -0
  222. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norglm_multisum.py +0 -0
  223. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norne.py +0 -0
  224. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_norquad.py +0 -0
  225. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nqii.py +0 -0
  226. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  227. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_orange_sum.py +0 -0
  228. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_personal_sum.py +0 -0
  229. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_polemo2.py +0 -0
  230. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_poquad.py +0 -0
  231. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_psc.py +0 -0
  232. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_publico.py +0 -0
  233. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_rrn.py +0 -0
  234. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sb10k.py +0 -0
  235. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scala.py +0 -0
  236. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scandiqa.py +0 -0
  237. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_scandisent_fi.py +0 -0
  238. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_schibsted.py +0 -0
  239. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  240. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sentipolc16.py +0 -0
  241. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad.py +0 -0
  242. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_it.py +0 -0
  243. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_nl.py +0 -0
  244. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_squad_nl_old.py +0 -0
  245. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sst2_pt.py +0 -0
  246. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_sst5.py +0 -0
  247. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_suc3.py +0 -0
  248. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swedish_skolprov.py +0 -0
  249. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swedn.py +0 -0
  250. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_swerec.py +0 -0
  251. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_trivia_et.py +0 -0
  252. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_turku_ner_fi.py +0 -0
  253. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_tydiqa_fi.py +0 -0
  254. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  255. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wikiann_lv.py +0 -0
  256. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_wikineural-it.py +0 -0
  257. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande.py +0 -0
  258. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande_et.py +0 -0
  259. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_winogrande_is.py +0 -0
  260. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_xlsum_fi.py +0 -0
  261. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/create_xquad.py +0 -0
  262. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/fix_dot_env_file.py +0 -0
  263. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/load_ud_pos.py +0 -0
  264. {euroeval-16.1.0 → euroeval-16.2.0}/src/scripts/versioning.py +0 -0
  265. {euroeval-16.1.0 → euroeval-16.2.0}/tests/__init__.py +0 -0
  266. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_config_factory.py +0 -0
  267. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_modules/__init__.py +0 -0
  268. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  269. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_callbacks.py +0 -0
  270. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_constants.py +0 -0
  271. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_data_models.py +0 -0
  272. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_dataset_configs.py +0 -0
  273. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_enums.py +0 -0
  274. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_exceptions.py +0 -0
  275. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_finetuning.py +0 -0
  276. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_languages.py +0 -0
  277. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_model_config.py +0 -0
  278. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_model_loading.py +0 -0
  279. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_scores.py +0 -0
  280. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_speed_benchmark.py +0 -0
  281. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_tasks.py +0 -0
  282. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_tokenisation_utils.py +0 -0
  283. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_types.py +0 -0
  284. {euroeval-16.1.0 → euroeval-16.2.0}/tests/test_utils.py +0 -0
@@ -34,6 +34,7 @@ body:
  - label: Italian
  - label: Latvian
  - label: Norwegian (Bokmål or Nynorsk)
+ - label: Polish
  - label: Portuguese
  - label: Spanish
  - label: Swedish
@@ -0,0 +1,49 @@
+ name: 🌍 Language Request
+ description: Is there a European language missing in EuroEval?
+ title: "[LANGUAGE REQUEST] <language-name>"
+ labels: "new language"
+ type: task
+
+ body:
+ - type: input
+ attributes:
+ label: Language name and code
+ description: What is the name and ISO 639 code of the language?
+ validations:
+ required: true
+ - type: markdown
+ attributes:
+ value: >
+ Here are some existing evaluation datasets in the language, that could be used:
+ - type: textarea
+ attributes:
+ label: Sentiment classification dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Linguistic acceptability dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Named entity recognition dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Reading comprehension dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Summarisation dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Knowledge dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: textarea
+ attributes:
+ label: Common-sense reasoning dataset
+ description: Link to one or more datasets in the language (leave blank if unknown)
+ - type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
@@ -23,6 +23,7 @@ body:
  - label: West Germanic languages (Dutch, English, German)
  - label: Finnic languages (Estonian, Finnish)
  - label: Latvian
+ - label: Polish
  validations:
  required: true
  - type: dropdown
@@ -121,3 +121,6 @@ gfx/euroeval-*.png
  gfx/euroeval-*.jpeg
  gfx/euroeval-*.jpg
  gfx/euroeval-*.xcf
+
+ # Contracts
+ generated_contracts/
@@ -34,7 +34,7 @@ repos:
  hooks:
  - id: nbstripout
  - repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.17.1
+ rev: v1.18.1
  hooks:
  - id: mypy
  args:
@@ -10,6 +10,42 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v16.2.0] - 2025-09-15
+ ### Added
+ - Now supports evaluating models in an offline environment. This is done by first
+ downloading all necessary models, datasets, metrics and other artifacts while online,
+ using the new `--download-only` flag (or `download_only=True` in the `Benchmarker`
+ API). Then you can safely disable internet access and run the evaluation as normal,
+ and it will use the cached models, datasets and metrics. This was contributed by
+ @viggo-gascou ✨
+ - Added the `timm` package to the set of `generative` extra dependencies, as it is
+ required to load some multimodal models, such as Gemma-3n.
+
+ ### Changed
+ - Now does not benchmark encoder models on multiple-choice classification tasks, as they
+ get near-random performance and these scores are not used in the leaderboards. We can
+ change this in the future if we find a way to make encoder models work better on these
+ tasks.
+ - For generative vLLM models that can swap between reasoning and non-reasoning modes,
+ we previously defaulted to reasoning. We now default to what the model uses by
+ default, which is non-reasoning for most models.
+
+ ### Fixed
+ - Fixed an issue where old evaluation records could not be loaded, as the format had
+ changed. We are now able to load old records again.
+ - Fixed some grammatical errors in the Icelandic prompts.
+ - Now stores model IDs with parameters (e.g., `o3#low`) correctly in the benchmark
+ results, rather than just the base model ID (e.g., `o3`).
+
+
+ ## [v16.1.1] - 2025-09-12
+ ### Fixed
+ - Fixed an issue from v16.1.0, where reasoning models were not using the tokeniser's
+ chat template.
+ - Fixed an issue with some of the prompts for base decoders, that the list of possible
+ labels for sequence classification tasks was not included in the prompt.
+
+
  ## [v16.1.0] - 2025-09-11
  ### Added
  - Added support for Polish 🇵🇱! This includes the reading comprehension dataset PoQuAD,
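The offline-evaluation entry above amounts to a two-pass workflow: cache everything while online, then evaluate with internet access disabled. A minimal Python sketch of that workflow, using only the `download_only` argument documented in this release (model ID and arguments are placeholders):

```python
# Sketch of the two-pass offline workflow from the changelog entry above.
from euroeval import Benchmarker

benchmark = Benchmarker()

# Pass 1 (while online): cache the model, datasets and metrics without evaluating.
benchmark(
    model="<model-id>",
    task="sentiment-classification",
    language="da",
    download_only=True,
)

# Pass 2 (later, with internet access disabled): the same call without
# `download_only` runs the evaluation from the local cache.
benchmark(
    model="<model-id>",
    task="sentiment-classification",
    language="da",
)
```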
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 16.1.0
+ Version: 16.2.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: timm>=1.0.19; extra == 'all'
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: timm>=1.0.19; extra == 'generative'
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
  Description-Content-Type: text/markdown

  <div align='center'>
@@ -152,13 +152,13 @@ model:
  ```
  >>> from euroeval import Benchmarker
  >>> benchmark = Benchmarker()
- >>> benchmark(model="<model>")
+ >>> benchmark(model="<model-id>")
  ```

  To benchmark on a specific task and/or language, you simply specify the `task` or
  `language` arguments, shown here with same example as above:
  ```
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
  ```

  If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
  >>> benchmark(task="sentiment-classification", language="da")
  ```

+ ### Benchmarking in an Offline Environment
+ If you need to benchmark in an offline environment, you need to download the models,
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
+ script. For example to download the model you want and all of the Danish sentiment
+ classification datasets:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+ ```
+
+ Or from a script:
+ ```
+ >>> benchmark(
+ ...     model="<model-id>",
+ ...     task="sentiment-classification",
+ ...     language="da",
+ ...     download_only=True,
+ ... )
+ ```
+
+ Please note: Offline benchmarking of adapter models is not currently supported. An
+ internet connection will be required during evaluation. If offline support is important
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
  ### Benchmarking from Docker
  A Dockerfile is provided in the repo, which can be downloaded and run, without needing
  to clone the repo and installing from source. This can be fetched programmatically by
@@ -80,13 +80,13 @@ model:
  ```
  >>> from euroeval import Benchmarker
  >>> benchmark = Benchmarker()
- >>> benchmark(model="<model>")
+ >>> benchmark(model="<model-id>")
  ```

  To benchmark on a specific task and/or language, you simply specify the `task` or
  `language` arguments, shown here with same example as above:
  ```
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
  ```

  If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -96,6 +96,30 @@ models on the Danish sentiment classification task:
  >>> benchmark(task="sentiment-classification", language="da")
  ```

+ ### Benchmarking in an Offline Environment
+ If you need to benchmark in an offline environment, you need to download the models,
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
+ script. For example to download the model you want and all of the Danish sentiment
+ classification datasets:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+ ```
+
+ Or from a script:
+ ```
+ >>> benchmark(
+ ...     model="<model-id>",
+ ...     task="sentiment-classification",
+ ...     language="da",
+ ...     download_only=True,
+ ... )
+ ```
+
+ Please note: Offline benchmarking of adapter models is not currently supported. An
+ internet connection will be required during evaluation. If offline support is important
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
  ### Benchmarking from Docker
  A Dockerfile is provided in the repo, which can be downloaded and run, without needing
  to clone the repo and installing from source. This can be fetched programmatically by
@@ -44,11 +44,11 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru yfirferðir ásamt lyndisgildi þeirra, sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
+ Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.
  ```
  - Base prompt template:
  ```
- Yfirferð: {text}
+ Textabrot: {text}
  Lyndi: {label}
  ```
  - Instruction-tuned prompt template:
@@ -117,13 +117,13 @@ When evaluating generative models, we use the following setup (see the
  - Base prompt template:
  ```
  Setning: {text}
- Nefndar einingar: {label}
+ Nafneiningar: {label}
  ```
  - Instruction-tuned prompt template:
  ```
  Setning: {text}

- Greinið nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
+ Greindu nefndu einingarnar í setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum 'einstaklingur', 'staðsetning', 'stofnun' og 'ýmislegt'. Gildin ættu að vera listi yfir nefndu einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.
  ```
  - Label mapping:
  - `B-PER` ➡️ `einstaklingur`
@@ -186,7 +186,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -197,7 +197,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}

- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -249,7 +249,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -260,7 +260,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}

- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -310,7 +310,7 @@ When evaluating generative models, we use the following setup (see the
  - Number of few-shot examples: 12
  - Prefix prompt:
  ```
- Eftirfarandi eru setningar og hvort þær eru málfræðilega réttar.
+ Hér fyrir neðan eru setningar ásamt mati á því hvort þær eru málfræðilega réttar.
  ```
  - Base prompt template:
  ```
@@ -321,7 +321,7 @@ When evaluating generative models, we use the following setup (see the
  ```
  Setning: {text}

- Greinið hvort setningin er málfræðilega rétt eða ekki. Svarið skal vera 'já' ef setningin er rétt og 'nei' ef hún er ekki.
+ Greindu hvort setningin er málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún er það ekki.
  ```
  - Label mapping:
  - `correct` ➡️ `já`
@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "16.1.0"
+ version = "16.2.0"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [
@@ -33,7 +33,7 @@ dependencies = [
  "rouge-score>=0.1.2",
  "bert-score>=0.3.13",
  "levenshtein>=0.24.0",
- "scikit-learn==1.6.1", # Required for loading European values pipeline
+ "scikit-learn==1.6.1", # Required for loading European values pipeline
  "setuptools>=75.8.2",
  "demjson3>=3.0.6",
  "ollama>=0.5.1",
@@ -45,15 +45,15 @@ dependencies = [
  [project.optional-dependencies]
  generative = [
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.10.1; platform_system == 'Linux'",
- "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+ "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+ "timm>=1.0.19",
  ]
  all = [
  "bitsandbytes>=0.43.1; platform_system == 'Linux'",
- "vllm>=0.10.1; platform_system == 'Linux'",
- "flashinfer-python>=0.3.1; platform_system == 'Linux'",
+ "vllm[flashinfer]>=0.10.1; platform_system == 'Linux'",
  "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
+ "timm>=1.0.19",
  ]

  [project.urls]
@@ -100,6 +100,8 @@ dev-dependencies = [
  "types-ujson>=5.10.0.20240515",
  "types-simplejson>=3.2.0.2025032",
  "debugpy>=1.8.13",
+ "pytest-socket>=0.7.0",
+ "pytest-dependency>=0.6.0",
  ]

  [tool.ruff]
@@ -170,6 +172,7 @@ addopts = [
  "--cov=src/euroeval",
  "--color=yes",
  "-vvv",
+ "--allow-unix-socket"
  ]
  xfail_strict = true
  filterwarnings = [
@@ -181,7 +184,7 @@ filterwarnings = [
  "ignore::ResourceWarning",
  "ignore::FutureWarning",
  ]
- log_cli_level = "info"
+ log_cli_level = "INFO"
  testpaths = [
  "tests",
  "src/euroeval",
@@ -12,12 +12,13 @@ import warnings
  from termcolor import colored

  # Block specific warnings before importing anything else, as they can be noisy
- warnings.filterwarnings("ignore", category=UserWarning)
- warnings.filterwarnings("ignore", category=FutureWarning)
- logging.getLogger("httpx").setLevel(logging.CRITICAL)
- logging.getLogger("datasets").setLevel(logging.CRITICAL)
- logging.getLogger("vllm").setLevel(logging.CRITICAL)
- os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
+ if os.getenv("FULL_LOG") != "1":
+     warnings.filterwarnings("ignore", category=UserWarning)
+     warnings.filterwarnings("ignore", category=FutureWarning)
+     logging.getLogger("httpx").setLevel(logging.CRITICAL)
+     logging.getLogger("datasets").setLevel(logging.CRITICAL)
+     logging.getLogger("vllm").setLevel(logging.CRITICAL)
+     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

  # Set up logging
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
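Judging from the change above, setting `FULL_LOG=1` in the environment (before the package is imported, e.g. `FULL_LOG=1 euroeval --model <model-id> ...` on the command line) keeps the warnings and third-party log output that EuroEval otherwise silences by default.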
@@ -47,6 +47,7 @@ def build_benchmark_config(
  debug: bool,
  run_with_cli: bool,
  requires_safetensors: bool,
+ download_only: bool,
  ) -> BenchmarkConfig:
  """Create a benchmark configuration.

@@ -117,6 +118,8 @@
  Whether the benchmark is being run with the CLI.
  requires_safetensors:
  Whether to only allow evaluations of models stored as safetensors.
+ download_only:
+ Whether to only download the requested model weights and datasets.

  Returns:
  The benchmark configuration.
@@ -165,6 +168,7 @@
  debug=debug,
  run_with_cli=run_with_cli,
  requires_safetensors=requires_safetensors,
+ download_only=download_only,
  )

@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
  Returns:
  The number of parameters in the model.
  """
- token = get_hf_token(api_key=self.benchmark_config.api_key)
- hf_api = HfApi(token=token)
- try:
- repo_info = hf_api.model_info(
- repo_id=self.model_config.adapter_base_model_id
- or self.model_config.model_id,
- revision=self.model_config.revision,
- )
- except (
- RepositoryNotFoundError,
- RevisionNotFoundError,
- RequestException,
- HFValidationError,
- ):
+ # No need to try to use the API if we have no internet.
+ if not internet_connection_available():
  repo_info = None
+ else:
+ token = get_hf_token(api_key=self.benchmark_config.api_key)
+ hf_api = HfApi(token=token)
+ try:
+ repo_info = hf_api.model_info(
+ repo_id=self.model_config.adapter_base_model_id
+ or self.model_config.model_id,
+ revision=self.model_config.revision,
+ )
+ except (
+ RepositoryNotFoundError,
+ RevisionNotFoundError,
+ RequestException,
+ HFValidationError,
+ ):
+ repo_info = None

  if (
  repo_info is not None
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
  The benchmark configuration

  Returns:
- The loaded model and tokeniser.
+ A pair (model, tokeniser), with the loaded model and tokeniser
  """
  config: "PretrainedConfig"
  block_terminal_output()
@@ -686,6 +690,7 @@
  model=model,
  model_id=model_id,
  trust_remote_code=benchmark_config.trust_remote_code,
+ model_cache_dir=model_config.model_cache_dir,
  )

  return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
  ):
  model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)

+ # If we have not internet, and the model_id is not a directory for a local model
+ # we also just create a dummy model info object.
+ elif not internet_connection_available():
+ model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
  # If the model does not exist locally, then we get the model info from the Hugging
  # Face Hub, if possible
  if model_info is None:
@@ -867,7 +877,10 @@


  def load_tokeniser(
- model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
+ model: "PreTrainedModel | None",
+ model_id: str,
+ trust_remote_code: bool,
+ model_cache_dir: str,
  ) -> "PreTrainedTokenizer":
  """Load the tokeniser.

@@ -889,6 +902,7 @@
  trust_remote_code=trust_remote_code,
  padding_side="right",
  truncation_side="right",
+ cache_dir=model_cache_dir,
  )

  # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
  token=get_hf_token(api_key=api_key),
  trust_remote_code=trust_remote_code,
  cache_dir=model_cache_dir,
+ local_files_only=not internet_connection_available(),
  )
  if config.eos_token_id is not None and config.pad_token_id is None:
  if isinstance(config.eos_token_id, list):
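Several of the changes above route around the Hugging Face Hub via a new `internet_connection_available()` helper from `euroeval.utils`; the `utils.py` hunk itself is not included in this diff, so the following is only an illustrative sketch of what such a connectivity check typically looks like, not the actual implementation:

```python
# Illustrative sketch only: the real euroeval.utils.internet_connection_available
# is not shown in this diff, so its actual behaviour may differ.
import socket


def internet_connection_available(
    host: str = "huggingface.co", port: int = 443, timeout: float = 3.0
) -> bool:
    """Return True if a TCP connection to host:port can be opened within timeout."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
```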
@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
  model=None,
  model_id=model_id,
  trust_remote_code=self.benchmark_config.trust_remote_code,
+ model_cache_dir=self.model_config.model_cache_dir,
  )

  if (
@@ -1066,6 +1067,7 @@
  model=None,
  model_id=model_id,
  trust_remote_code=self.benchmark_config.trust_remote_code,
+ model_cache_dir=self.model_config.model_cache_dir,
  )

  all_max_lengths: list[int] = list()
@@ -72,7 +72,9 @@ from ..utils import (
  create_model_cache_dir,
  get_hf_token,
  get_min_cuda_compute_capability,
+ internet_connection_available,
  log_once,
+ resolve_model_path,
  split_model_id,
  )
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  )

  self.end_of_reasoning_token = get_end_of_reasoning_token(
- model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
+ model=self._model, tokeniser=self._tokeniser, model_config=model_config
  )
  self.end_of_chat_token_ids = get_end_of_chat_token_ids(
  tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(

  clear_vllm()

+ # if we do not have an internet connection we need to give the path to the folder
+ # that contains the model weights and config files, otherwise vLLM will try to
+ # download them regardless if they are already present in the download_dir
+ model_path = resolve_model_path(download_dir)
+
  try:
  model = LLM(
- model=model_id,
- tokenizer=model_id,
+ model=model_id if internet_connection_available() else model_path,
+ tokenizer=model_id if internet_connection_available() else model_path,
  gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
  download_dir=download_dir,
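The comment in the hunk above explains the role of `resolve_model_path(download_dir)`: without an internet connection, vLLM has to be pointed at the local folder holding the weights and config files rather than the Hub model ID. The helper's body is not part of this diff; a plausible sketch (an assumption, not the real code) would search the download directory for a cached snapshot:

```python
# Illustrative sketch only: euroeval.utils.resolve_model_path is not shown in this
# diff. The idea is to return a filesystem path that vLLM can load offline, i.e. a
# cached folder that contains the model's config.json.
from pathlib import Path


def resolve_model_path(download_dir: str) -> str:
    """Return the first folder under download_dir that contains a config.json."""
    for config_file in sorted(Path(download_dir).rglob("config.json")):
        return str(config_file.parent)
    raise FileNotFoundError(f"No cached model found under {download_dir!r}")
```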
@@ -925,6 +932,7 @@ def load_tokeniser(
  cache_dir=model_cache_dir,
  token=token,
  trust_remote_code=trust_remote_code,
+ local_files_only=not internet_connection_available(),
  )
  num_retries = 5
  for _ in range(num_retries):
@@ -937,8 +945,10 @@
  padding_side="left",
  truncation_side="left",
  model_max_length=model_max_length,
+ cache_dir=model_cache_dir,
  config=config,
  token=token,
+ local_files_only=not internet_connection_available(),
  )
  break
  except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:


  def get_end_of_reasoning_token(
- model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
+ model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
  ) -> str | None:
  """Get the end-of-reasoning token for a generative model.

@@ -1005,21 +1015,26 @@
  The vLLM model.
  tokeniser:
  The tokeniser.
- model_id:
- The model ID.
+ model_config:
+ The model configuration.

  Returns:
  The end of reasoning token, or None if it could not be found.
  """
+ model_id = model_config.model_id
+
  # Create a prompt to check if the model uses the reasoning tokens
  prompt = "What is your name?"
  if has_chat_template(tokeniser=tokeniser):
+ extra_kwargs = dict()
+ if model_config.param in {"thinking", "no-thinking"}:
+ extra_kwargs["enable_thinking"] = model_config.param == "thinking"
  templated_prompt = apply_chat_template(
  conversation=[dict(role="user", content=prompt)],
  tokeniser=tokeniser,
  tokenise=False,
  add_generation_prompt=True,
- enable_thinking=True,
+ **extra_kwargs,
  )
  assert isinstance(templated_prompt, str)
  prompt = templated_prompt
@@ -1042,8 +1057,8 @@
  if not bor_reasoning_matches:
  log_once(
  f"The model {model_id!r} did not generate any beginning-of-reasoning "
- "tokens in the prompt or the completion. Assuming the model is not "
- "a reasoning model.",
+ "tokens in the prompt or the completion. Assuming the model is not a "
+ "reasoning model.",
  level=logging.DEBUG,
  )
  return None
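The last two hunks tie the reasoning default to `model_config.param`, which matches the changelog's `o3#low` example of a model ID carrying a `#`-suffixed parameter. `split_model_id` is imported from `euroeval.utils` in the vllm.py hunk above but its body is not shown in this diff; a hypothetical sketch of that kind of split:

```python
# Hypothetical sketch: the real euroeval.utils.split_model_id is not shown in this
# diff. The changelog's `o3#low` example suggests model IDs may carry an optional
# '#<param>' suffix, e.g. '<model-id>#thinking' to opt in to reasoning mode.
def split_model_id(model_id: str) -> tuple[str, str]:
    """Split 'model#param' into ('model', 'param'); param is '' when absent."""
    base_model_id, _, param = model_id.partition("#")
    return base_model_id, param


assert split_model_id("o3#low") == ("o3", "low")
assert split_model_id("o3") == ("o3", "")
```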