EuroEval 15.8.0__tar.gz → 15.8.1__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (240)
  1. {euroeval-15.8.0 → euroeval-15.8.1}/CHANGELOG.md +10 -0
  2. {euroeval-15.8.0 → euroeval-15.8.1}/PKG-INFO +1 -1
  3. {euroeval-15.8.0 → euroeval-15.8.1}/pyproject.toml +1 -1
  4. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/data_models.py +9 -5
  5. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/sequence_classification.py +21 -32
  6. {euroeval-15.8.0 → euroeval-15.8.1}/uv.lock +1 -1
  7. {euroeval-15.8.0 → euroeval-15.8.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  8. {euroeval-15.8.0 → euroeval-15.8.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  9. {euroeval-15.8.0 → euroeval-15.8.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  10. {euroeval-15.8.0 → euroeval-15.8.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  11. {euroeval-15.8.0 → euroeval-15.8.1}/.github/workflows/ci.yaml +0 -0
  12. {euroeval-15.8.0 → euroeval-15.8.1}/.gitignore +0 -0
  13. {euroeval-15.8.0 → euroeval-15.8.1}/.pre-commit-config.yaml +0 -0
  14. {euroeval-15.8.0 → euroeval-15.8.1}/CITATION.cff +0 -0
  15. {euroeval-15.8.0 → euroeval-15.8.1}/CODE_OF_CONDUCT.md +0 -0
  16. {euroeval-15.8.0 → euroeval-15.8.1}/CONTRIBUTING.md +0 -0
  17. {euroeval-15.8.0 → euroeval-15.8.1}/Dockerfile.cuda +0 -0
  18. {euroeval-15.8.0 → euroeval-15.8.1}/LICENSE +0 -0
  19. {euroeval-15.8.0 → euroeval-15.8.1}/NEW_DATASET_GUIDE.md +0 -0
  20. {euroeval-15.8.0 → euroeval-15.8.1}/README.md +0 -0
  21. {euroeval-15.8.0 → euroeval-15.8.1}/docs/CNAME +0 -0
  22. {euroeval-15.8.0 → euroeval-15.8.1}/docs/README.md +0 -0
  23. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/README.md +0 -0
  24. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/danish.md +0 -0
  25. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/dutch.md +0 -0
  26. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/english.md +0 -0
  27. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/faroese.md +0 -0
  28. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/finnish.md +0 -0
  29. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/french.md +0 -0
  30. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/german.md +0 -0
  31. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/icelandic.md +0 -0
  32. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/italian.md +0 -0
  33. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/norwegian.md +0 -0
  34. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/spanish.md +0 -0
  35. {euroeval-15.8.0 → euroeval-15.8.1}/docs/datasets/swedish.md +0 -0
  36. {euroeval-15.8.0 → euroeval-15.8.1}/docs/extras/radial_plotter.md +0 -0
  37. {euroeval-15.8.0 → euroeval-15.8.1}/docs/faq.md +0 -0
  38. {euroeval-15.8.0 → euroeval-15.8.1}/docs/gfx/favicon.png +0 -0
  39. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  40. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  41. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/english.md +0 -0
  42. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  43. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/french.md +0 -0
  44. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/german.md +0 -0
  45. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  46. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  47. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  48. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  49. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  50. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Multilingual/european.md +0 -0
  51. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  52. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  53. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  54. {euroeval-15.8.0 → euroeval-15.8.1}/docs/leaderboards/README.md +0 -0
  55. {euroeval-15.8.0 → euroeval-15.8.1}/docs/methodology.md +0 -0
  56. {euroeval-15.8.0 → euroeval-15.8.1}/docs/python-package.md +0 -0
  57. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/README.md +0 -0
  58. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/common-sense-reasoning.md +0 -0
  59. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/knowledge.md +0 -0
  60. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/linguistic-acceptability.md +0 -0
  61. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/named-entity-recognition.md +0 -0
  62. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/reading-comprehension.md +0 -0
  63. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/sentiment-classification.md +0 -0
  64. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/speed.md +0 -0
  65. {euroeval-15.8.0 → euroeval-15.8.1}/docs/tasks/summarization.md +0 -0
  66. {euroeval-15.8.0 → euroeval-15.8.1}/gfx/euroeval.png +0 -0
  67. {euroeval-15.8.0 → euroeval-15.8.1}/gfx/euroeval.xcf +0 -0
  68. {euroeval-15.8.0 → euroeval-15.8.1}/gfx/scandeval.png +0 -0
  69. {euroeval-15.8.0 → euroeval-15.8.1}/makefile +0 -0
  70. {euroeval-15.8.0 → euroeval-15.8.1}/mkdocs.yaml +0 -0
  71. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/__init__.py +0 -0
  72. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_config_factory.py +0 -0
  73. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/__init__.py +0 -0
  74. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/base.py +0 -0
  75. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/fresh.py +0 -0
  76. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/hf.py +0 -0
  77. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/litellm.py +0 -0
  78. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmark_modules/vllm.py +0 -0
  79. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/benchmarker.py +0 -0
  80. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/callbacks.py +0 -0
  81. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/cli.py +0 -0
  82. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/constants.py +0 -0
  83. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/data_loading.py +0 -0
  84. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/__init__.py +0 -0
  85. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/danish.py +0 -0
  86. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/dutch.py +0 -0
  87. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/english.py +0 -0
  88. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/faroese.py +0 -0
  89. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/finnish.py +0 -0
  90. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/french.py +0 -0
  91. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/german.py +0 -0
  92. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/icelandic.py +0 -0
  93. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/italian.py +0 -0
  94. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/norwegian.py +0 -0
  95. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/spanish.py +0 -0
  96. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/dataset_configs/swedish.py +0 -0
  97. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/enums.py +0 -0
  98. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/exceptions.py +0 -0
  99. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/finetuning.py +0 -0
  100. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/generation.py +0 -0
  101. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/generation_utils.py +0 -0
  102. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/human_evaluation.py +0 -0
  103. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/languages.py +0 -0
  104. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/model_cache.py +0 -0
  105. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/model_config.py +0 -0
  106. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/model_loading.py +0 -0
  107. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/__init__.py +0 -0
  108. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  109. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  110. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  111. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  112. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  113. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/prompt_templates/summarization.py +0 -0
  114. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/scores.py +0 -0
  115. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/speed_benchmark.py +0 -0
  116. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/__init__.py +0 -0
  117. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  118. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/question_answering.py +0 -0
  119. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  120. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/task_group_utils/token_classification.py +0 -0
  121. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/tasks.py +0 -0
  122. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/tokenization_utils.py +0 -0
  123. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/types.py +0 -0
  124. {euroeval-15.8.0 → euroeval-15.8.1}/src/euroeval/utils.py +0 -0
  125. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/constants.py +0 -0
  126. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_allocine.py +0 -0
  127. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_angry_tweets.py +0 -0
  128. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_arc.py +0 -0
  129. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_arc_is.py +0 -0
  130. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_belebele.py +0 -0
  131. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_cnn_dailymail.py +0 -0
  132. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_conll_en.py +0 -0
  133. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_conll_es.py +0 -0
  134. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_conll_nl.py +0 -0
  135. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_dane.py +0 -0
  136. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  137. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_dansk.py +0 -0
  138. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_danske_talemaader.py +0 -0
  139. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  140. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_dbrd.py +0 -0
  141. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_dutch_cola.py +0 -0
  142. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_eltec.py +0 -0
  143. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_fone.py +0 -0
  144. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_foqa.py +0 -0
  145. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_fosent.py +0 -0
  146. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_fquad.py +0 -0
  147. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_germanquad.py +0 -0
  148. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_germeval.py +0 -0
  149. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_hellaswag.py +0 -0
  150. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_hellaswag_fi.py +0 -0
  151. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  152. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_ice_linguistic.py +0 -0
  153. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  154. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  155. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_icelandic_qa.py +0 -0
  156. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_icesum.py +0 -0
  157. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_ilpost_sum.py +0 -0
  158. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_jentoft.py +0 -0
  159. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_mim_gold_ner.py +0 -0
  160. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_mlqa_es.py +0 -0
  161. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_mlsum_de.py +0 -0
  162. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_mlsum_es.py +0 -0
  163. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_mmlu.py +0 -0
  164. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_multinerd-it.py +0 -0
  165. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_no_cola.py +0 -0
  166. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_no_sammendrag.py +0 -0
  167. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  168. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_nordjylland_news.py +0 -0
  169. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_norec.py +0 -0
  170. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_norglm_multiqa.py +0 -0
  171. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_norglm_multisum.py +0 -0
  172. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_norne.py +0 -0
  173. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_norquad.py +0 -0
  174. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_nqii.py +0 -0
  175. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  176. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_orange_sum.py +0 -0
  177. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_personal_sum.py +0 -0
  178. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_rrn.py +0 -0
  179. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_sb10k.py +0 -0
  180. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_scala.py +0 -0
  181. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_scandiqa.py +0 -0
  182. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_scandisent_fi.py +0 -0
  183. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_schibsted.py +0 -0
  184. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  185. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_sentipolc16.py +0 -0
  186. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_squad.py +0 -0
  187. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_squad_it.py +0 -0
  188. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_squad_nl.py +0 -0
  189. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_squad_nl_old.py +0 -0
  190. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_sst5.py +0 -0
  191. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_suc3.py +0 -0
  192. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_swedn.py +0 -0
  193. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_swerec.py +0 -0
  194. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_turku_ner_fi.py +0 -0
  195. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_tydiqa_fi.py +0 -0
  196. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  197. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_wikiann_fo.py +0 -0
  198. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_wikineural-it.py +0 -0
  199. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_winogrande_is.py +0 -0
  200. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_xlsum_fi.py +0 -0
  201. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.8.0 → euroeval-15.8.1}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.8.0 → euroeval-15.8.1}/tests/__init__.py +0 -0
  206. {euroeval-15.8.0 → euroeval-15.8.1}/tests/conftest.py +0 -0
  207. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_benchmarker.py +0 -0
  215. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_callbacks.py +0 -0
  216. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_cli.py +0 -0
  217. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_constants.py +0 -0
  218. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_data_models.py +0 -0
  220. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_enums.py +0 -0
  222. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_generation.py +0 -0
  225. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_languages.py +0 -0
  227. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_model_config.py +0 -0
  229. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_scores.py +0 -0
  231. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_tasks.py +0 -0
  238. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_types.py +0 -0
  240. {euroeval-15.8.0 → euroeval-15.8.1}/tests/test_utils.py +0 -0
@@ -10,6 +10,16 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
10
10
 
11
11
 
12
12
 
13
+ ## [v15.8.1] - 2025-05-08
14
+ ### Fixed
15
+ - NER labels were included twice in the prompt templates (which was due to there being
16
+ both, e.g., `B-ORG` and `I-ORG`). This caused models not using structured generation,
17
+ such as reasoning models, to sometimes output the wrong labels. This has been fixed
18
+ now.
19
+ - If a model outputs a `\boxed{}` answer, we now extract and use that, rather than the
20
+ full generated answer.
21
+
22
+
13
23
  ## [v15.8.0] - 2025-05-07
14
24
  ### Added
15
25
  - Added the BeleBele datasets for Finnish, Italian and Spanish. They are listed as
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.8.0
3
+ Version: 15.8.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "EuroEval"
3
- version = "15.8.0"
3
+ version = "15.8.1"
4
4
  description = "The robust European language model benchmark."
5
5
  readme = "README.md"
6
6
  authors = [
@@ -529,12 +529,16 @@ class DatasetConfig:
529
529
  else:
530
530
  sep_word = main_language.or_separator
531
531
 
532
+ local_labels: list[str] = []
533
+ for label in self.labels:
534
+ if label not in self.prompt_label_mapping:
535
+ continue
536
+ local_label = self.prompt_label_mapping[label]
537
+ if local_label not in local_labels:
538
+ local_labels.append(local_label)
539
+
532
540
  # Convert labels to single-quoted labels - and remove duplicates
533
- quoted_labels = [
534
- f"'{self.prompt_label_mapping[label]}'"
535
- for label in set(self.labels)
536
- if label in self.prompt_label_mapping
537
- ]
541
+ quoted_labels = [f"'{label}'" for label in local_labels]
538
542
 
539
543
  if not quoted_labels:
540
544
  return ""
@@ -144,9 +144,27 @@ def extract_labels_from_generation(
144
144
  )
145
145
  if labels is not None:
146
146
  return labels
147
- return get_closest_word_edit_labels(
148
- generated_sequences=model_output.sequences, dataset_config=dataset_config
149
- )
147
+
148
+ candidate_labels = [
149
+ dataset_config.prompt_label_mapping[lbl]
150
+ for lbl in dataset_config.id2label.values()
151
+ ]
152
+ new_predicted_labels: list[str] = list()
153
+ for predicted_label in model_output.sequences:
154
+ # If the prediction includes a boxed answer, use that instead of the full
155
+ # generation
156
+ if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
157
+ predicted_label = m.group(1)
158
+
159
+ # Pick the label with the smallest word edit distance to the predicted label
160
+ edit_distances = [
161
+ Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
162
+ for candidate_label in candidate_labels
163
+ ]
164
+ predicted_label = candidate_labels[np.argmin(edit_distances).item()]
165
+ new_predicted_labels.append(predicted_label)
166
+
167
+ return new_predicted_labels
150
168
 
151
169
 
152
170
  def get_closest_logprobs_labels(
@@ -305,32 +323,3 @@ def get_closest_logprobs_labels(
305
323
 
306
324
  assert len(output_labels) == len(generation_logprobs)
307
325
  return output_labels
308
-
309
-
310
- def get_closest_word_edit_labels(
311
- generated_sequences: list[str], dataset_config: "DatasetConfig"
312
- ) -> list[str]:
313
- """Get the labels with the smallest edit distance to the predicted labels.
314
-
315
- Args:
316
- generated_sequences:
317
- The generated sequences from the model.
318
- dataset_config:
319
- The configuration of the dataset.
320
-
321
- Returns:
322
- The candidate labels with the smallest edit distance to the predicted labels.
323
- """
324
- candidate_labels = [
325
- dataset_config.prompt_label_mapping[lbl]
326
- for lbl in dataset_config.id2label.values()
327
- ]
328
- new_predicted_labels: list[str] = list()
329
- for predicted_label in generated_sequences:
330
- edit_distances = [
331
- Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
332
- for candidate_label in candidate_labels
333
- ]
334
- closest_label = candidate_labels[np.argmin(edit_distances).item()]
335
- new_predicted_labels.append(closest_label)
336
- return new_predicted_labels
@@ -906,7 +906,7 @@ wheels = [
906
906
 
907
907
  [[package]]
908
908
  name = "euroeval"
909
- version = "15.8.0"
909
+ version = "15.8.1"
910
910
  source = { editable = "." }
911
911
  dependencies = [
912
912
  { name = "accelerate" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes