EuroEval 16.0.0.tar.gz → 16.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (273)
  1. {euroeval-16.0.0 → euroeval-16.0.1}/CHANGELOG.md +20 -0
  2. {euroeval-16.0.0 → euroeval-16.0.1}/PKG-INFO +3 -1
  3. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/README.md +5 -15
  4. {euroeval-16.0.0 → euroeval-16.0.1}/makefile +3 -0
  5. {euroeval-16.0.0 → euroeval-16.0.1}/pyproject.toml +3 -1
  6. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/__init__.py +5 -0
  7. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/vllm.py +41 -28
  8. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/constants.py +6 -0
  9. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/data_models.py +20 -16
  10. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/danish.py +0 -3
  11. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/generation_utils.py +44 -6
  12. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/pipeline.py +50 -8
  13. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_cache.py +13 -1
  14. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +2 -2
  15. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/sequence_classification.py +66 -53
  16. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/token_classification.py +14 -0
  17. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/tasks.py +9 -7
  18. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/tokenization_utils.py +1 -2
  19. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/utils.py +32 -1
  20. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_european_values.py +33 -27
  21. {euroeval-16.0.0 → euroeval-16.0.1}/uv.lock +58 -1
  22. {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  23. {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  24. {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  25. {euroeval-16.0.0 → euroeval-16.0.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  26. {euroeval-16.0.0 → euroeval-16.0.1}/.github/workflows/ci.yaml +0 -0
  27. {euroeval-16.0.0 → euroeval-16.0.1}/.gitignore +0 -0
  28. {euroeval-16.0.0 → euroeval-16.0.1}/.pre-commit-config.yaml +0 -0
  29. {euroeval-16.0.0 → euroeval-16.0.1}/CITATION.cff +0 -0
  30. {euroeval-16.0.0 → euroeval-16.0.1}/CODE_OF_CONDUCT.md +0 -0
  31. {euroeval-16.0.0 → euroeval-16.0.1}/CONTRIBUTING.md +0 -0
  32. {euroeval-16.0.0 → euroeval-16.0.1}/Dockerfile.cuda +0 -0
  33. {euroeval-16.0.0 → euroeval-16.0.1}/LICENSE +0 -0
  34. {euroeval-16.0.0 → euroeval-16.0.1}/NEW_DATASET_GUIDE.md +0 -0
  35. {euroeval-16.0.0 → euroeval-16.0.1}/README.md +0 -0
  36. {euroeval-16.0.0 → euroeval-16.0.1}/docs/CNAME +0 -0
  37. {euroeval-16.0.0 → euroeval-16.0.1}/docs/README.md +0 -0
  38. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/README.md +0 -0
  39. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/danish.md +0 -0
  40. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/dutch.md +0 -0
  41. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/english.md +0 -0
  42. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/estonian.md +0 -0
  43. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/faroese.md +0 -0
  44. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/finnish.md +0 -0
  45. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/french.md +0 -0
  46. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/german.md +0 -0
  47. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/icelandic.md +0 -0
  48. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/italian.md +0 -0
  49. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/latvian.md +0 -0
  50. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/norwegian.md +0 -0
  51. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/portuguese.md +0 -0
  52. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/spanish.md +0 -0
  53. {euroeval-16.0.0 → euroeval-16.0.1}/docs/datasets/swedish.md +0 -0
  54. {euroeval-16.0.0 → euroeval-16.0.1}/docs/extras/radial_plotter.md +0 -0
  55. {euroeval-16.0.0 → euroeval-16.0.1}/docs/faq.md +0 -0
  56. {euroeval-16.0.0 → euroeval-16.0.1}/docs/gfx/favicon.png +0 -0
  57. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  58. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  59. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/english.md +0 -0
  60. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  61. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
  62. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/french.md +0 -0
  63. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/german.md +0 -0
  64. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  65. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  66. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  67. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  68. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  69. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  70. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/european.md +0 -0
  71. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  72. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  73. {euroeval-16.0.0 → euroeval-16.0.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  74. {euroeval-16.0.0 → euroeval-16.0.1}/docs/methodology.md +0 -0
  75. {euroeval-16.0.0 → euroeval-16.0.1}/docs/python-package.md +0 -0
  76. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/README.md +0 -0
  77. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/common-sense-reasoning.md +0 -0
  78. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/knowledge.md +0 -0
  79. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/linguistic-acceptability.md +0 -0
  80. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/named-entity-recognition.md +0 -0
  81. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/reading-comprehension.md +0 -0
  82. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/sentiment-classification.md +0 -0
  83. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/speed.md +0 -0
  84. {euroeval-16.0.0 → euroeval-16.0.1}/docs/tasks/summarization.md +0 -0
  85. {euroeval-16.0.0 → euroeval-16.0.1}/gfx/euroeval.png +0 -0
  86. {euroeval-16.0.0 → euroeval-16.0.1}/gfx/euroeval.xcf +0 -0
  87. {euroeval-16.0.0 → euroeval-16.0.1}/gfx/scandeval.png +0 -0
  88. {euroeval-16.0.0 → euroeval-16.0.1}/mkdocs.yaml +0 -0
  89. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_config_factory.py +0 -0
  90. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  91. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/base.py +0 -0
  92. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
  93. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/hf.py +0 -0
  94. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmark_modules/litellm.py +0 -0
  95. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/benchmarker.py +0 -0
  96. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/callbacks.py +0 -0
  97. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/cli.py +0 -0
  98. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/data_loading.py +0 -0
  99. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/__init__.py +0 -0
  100. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/dutch.py +0 -0
  101. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/english.py +0 -0
  102. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/estonian.py +0 -0
  103. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/faroese.py +0 -0
  104. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/finnish.py +0 -0
  105. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/french.py +0 -0
  106. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/german.py +0 -0
  107. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
  108. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/italian.py +0 -0
  109. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/latvian.py +0 -0
  110. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
  111. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/portuguese.py +0 -0
  112. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/spanish.py +0 -0
  113. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/dataset_configs/swedish.py +0 -0
  114. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/enums.py +0 -0
  115. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/exceptions.py +0 -0
  116. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/finetuning.py +0 -0
  117. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/generation.py +0 -0
  118. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/languages.py +0 -0
  119. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/__init__.py +0 -0
  120. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/base.py +0 -0
  121. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/huggingface.py +0 -0
  122. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/llm_as_a_judge.py +0 -0
  123. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/metrics/speed.py +0 -0
  124. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_config.py +0 -0
  125. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/model_loading.py +0 -0
  126. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  127. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  128. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  129. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  130. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  131. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  132. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/prompt_templates/summarization.py +0 -0
  133. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/scores.py +0 -0
  134. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/speed_benchmark.py +0 -0
  135. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  136. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
  137. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  138. {euroeval-16.0.0 → euroeval-16.0.1}/src/euroeval/types.py +0 -0
  139. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/constants.py +0 -0
  140. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_allocine.py +0 -0
  141. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_angry_tweets.py +0 -0
  142. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_arc.py +0 -0
  143. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_arc_is.py +0 -0
  144. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_belebele.py +0 -0
  145. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_boolq_pt.py +0 -0
  146. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_cnn_dailymail.py +0 -0
  147. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_en.py +0 -0
  148. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_es.py +0 -0
  149. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_conll_nl.py +0 -0
  150. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_copa_lv.py +0 -0
  151. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dane.py +0 -0
  152. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  153. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dansk.py +0 -0
  154. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader.py +0 -0
  155. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  156. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dbrd.py +0 -0
  157. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_dutch_cola.py +0 -0
  158. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_eltec.py +0 -0
  159. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_err_news.py +0 -0
  160. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_estner.py +0 -0
  161. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_estonian_valence.py +0 -0
  162. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_exam_et.py +0 -0
  163. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fone.py +0 -0
  164. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_foqa.py +0 -0
  165. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fosent.py +0 -0
  166. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fquad.py +0 -0
  167. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_fullstack_ner.py +0 -0
  168. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_germanquad.py +0 -0
  169. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_germeval.py +0 -0
  170. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_goldenswag.py +0 -0
  171. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_grammar_et.py +0 -0
  172. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_harem.py +0 -0
  173. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hellaswag.py +0 -0
  174. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hellaswag_fi.py +0 -0
  175. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  176. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_ice_linguistic.py +0 -0
  177. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  178. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  179. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icelandic_qa.py +0 -0
  180. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_icesum.py +0 -0
  181. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_idioms_no.py +0 -0
  182. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_ilpost_sum.py +0 -0
  183. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_jentoft.py +0 -0
  184. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
  185. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  186. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_life_in_the_uk.py +0 -0
  187. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mim_gold_ner.py +0 -0
  188. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlqa_es.py +0 -0
  189. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlsum_de.py +0 -0
  190. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mlsum_es.py +0 -0
  191. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mmlu.py +0 -0
  192. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_mmlu_lv.py +0 -0
  193. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_multi_wiki_qa.py +0 -0
  194. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_multinerd-it.py +0 -0
  195. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_no_cola.py +0 -0
  196. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_no_sammendrag.py +0 -0
  197. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  198. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nordjylland_news.py +0 -0
  199. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norec.py +0 -0
  200. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norglm_multiqa.py +0 -0
  201. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norglm_multisum.py +0 -0
  202. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norne.py +0 -0
  203. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_norquad.py +0 -0
  204. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nqii.py +0 -0
  205. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  206. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_orange_sum.py +0 -0
  207. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_personal_sum.py +0 -0
  208. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_publico.py +0 -0
  209. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_rrn.py +0 -0
  210. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sb10k.py +0 -0
  211. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scala.py +0 -0
  212. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scandiqa.py +0 -0
  213. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_scandisent_fi.py +0 -0
  214. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_schibsted.py +0 -0
  215. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  216. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sentipolc16.py +0 -0
  217. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad.py +0 -0
  218. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_it.py +0 -0
  219. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_nl.py +0 -0
  220. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_squad_nl_old.py +0 -0
  221. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sst2_pt.py +0 -0
  222. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_sst5.py +0 -0
  223. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_suc3.py +0 -0
  224. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_swedn.py +0 -0
  225. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_swerec.py +0 -0
  226. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_turku_ner_fi.py +0 -0
  227. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_tydiqa_fi.py +0 -0
  228. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  229. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikiann_fo.py +0 -0
  230. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikiann_lv.py +0 -0
  231. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_wikineural-it.py +0 -0
  232. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_winogrande_et.py +0 -0
  233. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_winogrande_is.py +0 -0
  234. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_xlsum_fi.py +0 -0
  235. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/create_xquad_es.py +0 -0
  236. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/fix_dot_env_file.py +0 -0
  237. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/load_ud_pos.py +0 -0
  238. {euroeval-16.0.0 → euroeval-16.0.1}/src/scripts/versioning.py +0 -0
  239. {euroeval-16.0.0 → euroeval-16.0.1}/tests/__init__.py +0 -0
  240. {euroeval-16.0.0 → euroeval-16.0.1}/tests/conftest.py +0 -0
  241. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_config_factory.py +0 -0
  242. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/__init__.py +0 -0
  243. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_base.py +0 -0
  244. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
  245. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  246. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
  247. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
  248. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_benchmarker.py +0 -0
  249. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_callbacks.py +0 -0
  250. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_cli.py +0 -0
  251. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_constants.py +0 -0
  252. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_data_loading.py +0 -0
  253. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_data_models.py +0 -0
  254. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_dataset_configs.py +0 -0
  255. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_enums.py +0 -0
  256. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_exceptions.py +0 -0
  257. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_finetuning.py +0 -0
  258. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_generation.py +0 -0
  259. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_languages.py +0 -0
  260. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_cache.py +0 -0
  261. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_config.py +0 -0
  262. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_model_loading.py +0 -0
  263. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_scores.py +0 -0
  264. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_speed_benchmark.py +0 -0
  265. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/__init__.py +0 -0
  266. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_question_answering.py +0 -0
  267. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
  268. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_text_to_text.py +0 -0
  269. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_task_utils/test_token_classification.py +0 -0
  270. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_tasks.py +0 -0
  271. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_tokenization_utils.py +0 -0
  272. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_types.py +0 -0
  273. {euroeval-16.0.0 → euroeval-16.0.1}/tests/test_utils.py +0 -0
@@ -10,6 +10,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 
 
+## [v16.0.1] - 2025-09-07
+### Fixed
+- Fixed a bug causing encoders to fail when evaluating on the Exam-et dataset.
+- Previously we would abort an evaluation completely if the model outputted a single
+  invalid output on a classification task. As individual samples rarely have a great
+  influence on the overall score, we now just assign the closest label to the sample and
+  continue the evaluation. This will be logged to the user, so that they are aware of
+  this. Some tasks are more sensitive to individual samples, such as European values,
+  where we still abort the evaluation if a single sample is invalid.
+- Fixed a bug where logprobs were not used for classification tasks when evaluating
+  generative models, due to the fact that we raised the number of generated tokens to 10
+  for such tasks. This did not affect the results, but it meant that some evaluations
+  failed.
+- Now includes FlashInfer as a dependency, as it is required by vLLM.
+- Changed the choices in European values to use letters, like the other multiple
+  choice tasks, rather than numbers. Aside from ensuring consistency, we also avoid the
+  issue where '10' and '1' often both have the same first token ('1'), causing us not to
+  be able to use logprobs to determine the answer.
+
+
 ## [v16.0.0] - 2025-09-05
 ### Added
 - Added support for Latvian 🇱🇻! This includes the sentiment classification dataset
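The closest-label fallback described in the v16.0.1 changelog entry above can be pictured with a small sketch. This is not EuroEval's actual implementation; it only shows one common way (difflib string similarity) to map an invalid generated label onto the nearest valid one instead of aborting the evaluation. The helper name and the example labels are hypothetical.

    import difflib

    def map_to_closest_label(output: str, valid_labels: list[str]) -> str:
        """Map a raw model output to the closest valid label (hypothetical helper)."""
        cleaned = output.strip().lower()
        labels = [label.lower() for label in valid_labels]
        # Keep the output as-is if it is already a valid label.
        if cleaned in labels:
            return cleaned
        # Otherwise pick the most similar valid label; cutoff=0.0 guarantees a match
        # as long as at least one valid label exists.
        matches = difflib.get_close_matches(cleaned, labels, n=1, cutoff=0.0)
        return matches[0] if matches else labels[0]

    # An invalid output such as "very positive" is mapped to "positive" and the
    # evaluation continues, matching the behaviour described in the changelog.
    print(map_to_closest_label("very positive", ["positive", "neutral", "negative"]))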
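The last changelog bullet (letters instead of numbers for the European values choices) is easiest to see with the first tokens spelled out. With typical BPE tokenisers, "10" starts with the same token as "1", so scoring the answer from the logprobs of the first generated token cannot tell those two choices apart, whereas ten single letters never collide. The character-level first_token helper below is a stand-in assumption for a real tokeniser, not EuroEval code.

    numeric_choices = [str(i) for i in range(1, 11)]  # "1" ... "10"
    letter_choices = list("abcdefghij")               # "a" ... "j"

    def first_token(label: str) -> str:
        # Stand-in for tokeniser(label)[0]; plain digits and letters are single tokens.
        return label[0]

    # The ten numeric labels collapse onto nine distinct first tokens, because "1" and
    # "10" share the first token "1"; the ten letters all stay distinct.
    print(len({first_token(c) for c in numeric_choices}))  # 9
    print(len({first_token(c) for c in letter_choices}))   # 10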
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.0.0
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
@@ -14,9 +14,8 @@ Each language has two leaderboards:
 - **Generative Leaderboard**: This leaderboard shows the performance of models that can
   generate text. These models have been evaluated on _all_ [tasks](/tasks), both NLU and
   NLG.
-- **NLU Leaderboard**: This leaderboard shows the performance of models that can only
-  understand text, and not generate text themselves. These models have been evaluated on
-  the NLU tasks only.
+- **NLU Leaderboard**: This leaderboard shows the performance of models that can
+  understand text, which includes both generative and non-generative models.
 
 
 ## 📊 How to Read the Leaderboards
@@ -26,15 +25,14 @@ model across all the tasks in the leaderboard. The lower the rank, the better th
 
 The columns that follow the rank columns are metadata about the model:
 
-- `Parameters`: The total number of parameters in the model, in millions.
-- `Vocabulary`: The size of the model's vocabulary, in thousands.
-- `Context`: The maximum number of tokens that the model can process at a time.
-- `Speed`: The inference time of the model - see more [here](/tasks/speed).
 - `Type`: The type of model:
   - 🔍 indicates that it is an encoder model (e.g., BERT)
   - 🧠 indicates that it is a base generative model (e.g., GPT-2)
   - 📝 indicates that it is an instruction-tuned model (e.g., ChatGPT)
   - 🤔 indicates that it is a reasoning model (e.g., o1)
+- `Parameters`: The total number of parameters in the model, in millions.
+- `Vocabulary`: The size of the model's vocabulary, in thousands.
+- `Context`: The maximum number of tokens that the model can process at a time.
 - `Commercial`: Whether the model can be used for commercial purposes. See [here](/faq)
   for more information.
 - `Merge`: Whether the model is a merge of other models.
@@ -47,11 +45,3 @@ the given model on each of the datasets.
 To read more about the individual datasets, see the [datasets](/datasets) page. If
 you're interested in the methodology behind the benchmark, see the
 [methodology](/methodology) page.
-
-/// tab | Generative Scatter Plot
-
-///
-
-/// tab | NLU Scatter Plot
-
-///
@@ -144,3 +144,6 @@ publish-major: install check bump-major publish ## Publish a major version
 publish-minor: install check bump-minor publish ## Publish a minor version
 
 publish-patch: install check bump-patch publish ## Publish a patch version
+
+loc: ## Count the number of lines of code in the project
+	@git ls-files | grep '\.py' | xargs wc -l | tail -n 1
@@ -1,6 +1,6 @@
 [project]
 name = "EuroEval"
-version = "16.0.0"
+version = "16.0.1"
 description = "The robust European language model benchmark."
 readme = "README.md"
 authors = [
@@ -46,11 +46,13 @@ dependencies = [
 generative = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
     "vllm>=0.10.1; platform_system == 'Linux'",
+    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 all = [
     "bitsandbytes>=0.43.1; platform_system == 'Linux'",
     "vllm>=0.10.1; platform_system == 'Linux'",
+    "flashinfer-python>=0.3.1; platform_system == 'Linux'",
     "fbgemm-gpu>=1.0.0; platform_system == 'Linux'",
 ]
 
@@ -13,6 +13,7 @@ from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
 warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
@@ -101,6 +102,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
+# Use the FlashInfer flash-attention backend for vLLM
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
@@ -337,31 +337,6 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)
 
-        structured_generation_schema = None
-        if self.dataset_config.task.uses_structured_output:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +357,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 "error was. Skipping this evaluation."
             )
 
-        # Define the guided decoding that we will use for structured generation
-        if structured_generation_schema is not None:
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
             guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
             guided_decoding = GuidedDecodingParams(
@@ -392,8 +388,17 @@ class VLLMModel(HuggingFaceEncoderModel):
                     for label in self.dataset_config.labels
                 ]
             )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
         else:
             guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )
 
         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -439,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
            try:
                raw_outputs = self._model.generate(
@@ -466,12 +472,19 @@ class VLLMModel(HuggingFaceEncoderModel):
                    "Prompts are too long, so truncating them and trying again..."
                )
                logger.debug(f"The error message was: {str(e)}")
+
+                # If we have already tried truncating the prompts a few times, then
+                # we truncate a bit more aggressively
+                extra_truncation = 50 * truncation_attempts
+                truncation_attempts += 1
+
                tokenized_prompts = self._tokeniser(
                    text=prompts,
                    truncation=True,
                    max_length=max(
                        min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-                        - max_tokens,
+                        - max_tokens
+                        - extra_truncation,
                        0,
                    ),
                )
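For readers unfamiliar with the vLLM calls used in the hunks above: GuidedDecodingParams(choice=[...]) constrains generation to one of the listed strings, and the object is attached to a request through SamplingParams. The sketch below is a minimal usage example, assuming a recent vLLM release where SamplingParams accepts a guided_decoding argument; the model id and prompt are placeholders, not values from EuroEval.

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    # Constrain the model to answer with exactly one of the labels, and request
    # logprobs so the label can also be read off the first generated token.
    guided = GuidedDecodingParams(choice=["positive", "neutral", "negative"])
    params = SamplingParams(max_tokens=10, logprobs=10, guided_decoding=guided)

    llm = LLM(model="some-org/some-model")  # placeholder model id
    outputs = llm.generate(["Classify the sentiment: 'I love this!'"], params)
    print(outputs[0].outputs[0].text)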
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
@@ -125,6 +125,12 @@ class Task:
            A list of generative model types that are allowed to be evaluated on this
            task. If None, all generative model types are allowed. Only relevant if
            `allowed_model_types` includes generative models.
+        allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """
 
     name: str
@@ -148,6 +154,7 @@ class Task:
             GenerativeType.REASONING,
         ]
     )
+    allow_invalid_model_outputs: bool = True
 
     def __post_init__(self) -> None:
         """Post-initialisation checks."""
@@ -430,7 +437,6 @@ class DatasetConfig:
            if self._prompt_prefix is None
            else self._prompt_prefix
        )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
        return prompt_prefix
 
    @property
@@ -443,7 +449,6 @@ class DatasetConfig:
            if self._prompt_template is None
            else self._prompt_template
        )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
        return prompt_template
 
    @property
@@ -456,9 +461,6 @@ class DatasetConfig:
            if self._instruction_prompt is None
            else self._instruction_prompt
        )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
        return instruction_prompt
 
    @property
@@ -519,15 +521,16 @@ class DatasetConfig:
        """Return a hash of the dataset configuration."""
        return hash(self.name)
 
-    @property
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
        """Converts a set of labels to a natural string, in the specified language.
 
        If the task is NER, we separate using 'and' and use the mapped labels instead of
        the BIO NER labels.
 
        Args:
-            language: The language to be used when converting the labels.
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.
 
        Returns:
            The natural string representation of the labels in specified language.
@@ -539,16 +542,17 @@ class DatasetConfig:
        else:
            sep_word = main_language.or_separator
 
-        local_labels: list[str] = []
-        for label in self.labels:
-            if label not in self.prompt_label_mapping:
-                continue
-            local_label = self.prompt_label_mapping[label]
-            if local_label not in local_labels:
-                local_labels.append(local_label)
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)
 
        # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in local_labels]
+        quoted_labels = [f"'{label}'" for label in labels]
 
        if not quoted_labels:
            return ""
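To make the refactored get_labels_str above concrete: for a sentiment task whose prompt labels are 'positive', 'neutral' and 'negative', the method produces a quoted, natural-language enumeration of the labels. The joining logic itself sits outside this hunk, so the sketch below only mirrors the visible parts (quoting each label and joining with the language's or-separator) and is an approximation, not the package's code.

    def labels_to_natural_string(labels: list[str], sep_word: str = "or") -> str:
        # Quote each label, then join them into a natural phrase, e.g.
        # ["positive", "neutral", "negative"] -> "'positive', 'neutral' or 'negative'".
        quoted_labels = [f"'{label}'" for label in labels]
        if not quoted_labels:
            return ""
        if len(quoted_labels) == 1:
            return quoted_labels[0]
        return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"

    print(labels_to_natural_string(["positive", "neutral", "negative"]))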
@@ -84,7 +84,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )
 
 
@@ -159,7 +158,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
 
@@ -172,6 +170,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
@@ -9,7 +9,7 @@ import typing as t
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
 from .tokenization_utils import apply_chat_template
-from .utils import log_once
+from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -230,18 +230,49 @@ def apply_prompt(
            return dataset_config.prompt_template.format(**kwargs), ""
 
    match dataset_config.task.task_group:
-        case (
-            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-        ):
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
            few_shot_sections = [
                create_prompt(
                    text=example["text"].replace("\n", " ").strip(),
                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
                )
                for example in few_shot_examples
            ]
            new_sections = [
-                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                for text in examples["text"]
            ]
 
@@ -259,6 +290,7 @@ def apply_prompt(
            ]
 
        case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
 
            def create_label(example: dict) -> str:
                prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -280,12 +312,15 @@ def apply_prompt(
                create_prompt(
                    text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                    label=create_label(example=example),
+                    labels_str=labels_str,
                )
                for example in few_shot_examples
            ]
            new_sections = [
                create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                )
                for tokens in examples["tokens"]
            ]
@@ -375,4 +410,7 @@ def apply_prompt(
        for new_prompt, _ in new_sections
    ]
 
+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
    return examples
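The generation_utils changes above call a new utils.extract_multiple_choice_labels helper whose body is not part of the shown hunks. A plausible reading, consistent with multiple-choice prompts that use lettered options, is that it keeps only the candidate letters that actually appear as option markers in a given prompt, so the labels string lists just the options present in that question. The sketch below is a guess at such behaviour, clearly not the package's implementation.

    import re

    def extract_multiple_choice_labels_sketch(
        prompt: str, candidate_labels: list[str]
    ) -> list[str]:
        # Keep only the candidate labels (e.g. "a", "b", "c", ...) that occur as an
        # option marker such as "b. ..." at the start of a line in the prompt.
        present = set()
        for line in prompt.splitlines():
            match = re.match(r"^([a-z])\.\s", line.strip())
            if match:
                present.add(match.group(1))
        return [label for label in candidate_labels if label in present]

    prompt = "What is 2 + 2?\nOptions:\na. 3\nb. 4\nc. 5"
    print(extract_multiple_choice_labels_sketch(prompt, ["a", "b", "c", "d"]))  # ['a', 'b', 'c']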
@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)
 
 
+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
 class PipelineMetric(Metric):
     """Load a scikit-learn pipeline and use it to get scores from the predictions."""
 
@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
        pipeline_repo: str,
        pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
        pipeline_file_name: str = "pipeline.pkl",
-        preprocessing_fn: c.Callable[[c.Sequence[T]], c.Sequence[T]] = lambda x: x,
+        preprocessing_fn: PreprocessingFunction | None = None,
        postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
    ) -> None:
        """Initialise the pipeline transform metric.
@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
        """
        if self.pipeline is None:
            self.pipeline = self._download_pipeline()
-        predictions = self.preprocessing_fn(predictions)
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
        return self.pipeline_scoring_function(self.pipeline, predictions)
 
    def _download_pipeline(self) -> "Pipeline":
@@ -133,13 +157,18 @@ class PipelineMetric(Metric):
 ### European Values Metric ###
 
 
-def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence[int]:
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
    """Preprocess the model predictions for the European Values metric.
 
    Args:
        predictions:
            The model predictions, a sequence of integers representing the predicted
            choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
    Returns:
        The preprocessed model predictions, a sequence of integers representing the
@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
    num_questions = 53
    num_phrasings_per_question = 5
 
+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
    assert len(predictions) % num_questions == 0, (
        f"The number of predictions ({len(predictions)}) is not a multiple of "
        f"{num_questions}, which is required for the European Values metric."
@@ -171,7 +211,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
    # Shape: (num_questions, num_phrasings_per_question)
    arr = np.array(
        [
-            predictions[i : i + num_phrasings_per_question]
+            integer_predictions[i : i + num_phrasings_per_question]
            for i in range(0, len(predictions), num_phrasings_per_question)
        ]
    )
@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
    arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
 
    # Convert the array to a list
-    predictions = arr.tolist()
+    integer_predictions = arr.tolist()
 
    # Some of the questions are categorical and we're only interested in whether the
    # model chooses a specific choice or not. This mapping takes the question index
@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
    }
 
    # Map the predictions to the choices we're interested in
-    predictions = list(predictions)
+    integer_predictions = list(integer_predictions)
    for question_idx, choice in question_choices.items():
-        predictions[question_idx] = 1 if predictions[question_idx] == choice else 0
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )
 
-    return predictions
+    return integer_predictions
 
 
 def european_values_scoring_function(
@@ -10,7 +10,9 @@ from dataclasses import asdict
 
 from tqdm.auto import tqdm
 
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+from .utils import log_once
 
 if t.TYPE_CHECKING:
     from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
            # the indices of the top scores, to save space. Further, we only store
            # the scores if the generated sequence is shorter than the maximum
            # length
-            if model_output.scores is not None and self.max_generated_tokens < 8:
+            if (
+                model_output.scores is not None
+                and self.max_generated_tokens
+                <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+            ):
                assert model_output.scores is not None
                scores = model_output.scores[sample_idx]
            else:
+                if model_output.scores is not None:
+                    log_once(
+                        "The generated sequence is longer than the maximum "
+                        "length for classification. Not caching the scores.",
+                        level=logging.DEBUG,
+                    )
                scores = None
            self[model_input] = SingleGenerativeModelOutput(
                sequence=model_output.sequences[sample_idx], scores=scores
@@ -126,7 +126,7 @@ def prepare_examples(
        ):
            choice_idxs.append(idx)
 
-    choices = [sections[idx] for idx in choice_idxs]
+    choices = [sections[idx] for idx in reversed(choice_idxs)]
 
    # Check that the choices are present, and that all of them are at the end
    assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@ def prepare_examples(
    )
    new_examples["label"] = [
        int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("abcde", choices)
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
    ]
    new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
    return new_examples