EuroEval 15.7.1.tar.gz → 15.7.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (240)
  1. {euroeval-15.7.1 → euroeval-15.7.2}/.pre-commit-config.yaml +1 -1
  2. {euroeval-15.7.1 → euroeval-15.7.2}/CHANGELOG.md +16 -0
  3. {euroeval-15.7.1 → euroeval-15.7.2}/PKG-INFO +1 -1
  4. {euroeval-15.7.1 → euroeval-15.7.2}/pyproject.toml +1 -1
  5. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py +1 -1
  6. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py +15 -5
  7. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/vllm.py +1 -1
  8. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmarker.py +13 -11
  9. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/__init__.py +1 -0
  10. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/finnish.py +11 -9
  11. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/languages.py +1 -1
  12. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/sequence_classification.py +46 -11
  13. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tokenization_utils.py +50 -14
  14. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag_fi.py +18 -4
  15. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scala.py +0 -6
  16. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandisent_fi.py +11 -1
  17. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_turku_ner_fi.py +10 -0
  18. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_tydiqa_fi.py +10 -0
  19. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xlsum_fi.py +11 -1
  20. {euroeval-15.7.1 → euroeval-15.7.2}/uv.lock +2724 -2724
  21. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  22. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  23. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  24. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  25. {euroeval-15.7.1 → euroeval-15.7.2}/.github/workflows/ci.yaml +0 -0
  26. {euroeval-15.7.1 → euroeval-15.7.2}/.gitignore +0 -0
  27. {euroeval-15.7.1 → euroeval-15.7.2}/CITATION.cff +0 -0
  28. {euroeval-15.7.1 → euroeval-15.7.2}/CODE_OF_CONDUCT.md +0 -0
  29. {euroeval-15.7.1 → euroeval-15.7.2}/CONTRIBUTING.md +0 -0
  30. {euroeval-15.7.1 → euroeval-15.7.2}/Dockerfile.cuda +0 -0
  31. {euroeval-15.7.1 → euroeval-15.7.2}/LICENSE +0 -0
  32. {euroeval-15.7.1 → euroeval-15.7.2}/NEW_DATASET_GUIDE.md +0 -0
  33. {euroeval-15.7.1 → euroeval-15.7.2}/README.md +0 -0
  34. {euroeval-15.7.1 → euroeval-15.7.2}/docs/CNAME +0 -0
  35. {euroeval-15.7.1 → euroeval-15.7.2}/docs/README.md +0 -0
  36. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/README.md +0 -0
  37. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/danish.md +0 -0
  38. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/dutch.md +0 -0
  39. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/english.md +0 -0
  40. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/faroese.md +0 -0
  41. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/finnish.md +0 -0
  42. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/french.md +0 -0
  43. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/german.md +0 -0
  44. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/icelandic.md +0 -0
  45. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/italian.md +0 -0
  46. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/norwegian.md +0 -0
  47. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/spanish.md +0 -0
  48. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/swedish.md +0 -0
  49. {euroeval-15.7.1 → euroeval-15.7.2}/docs/extras/radial_plotter.md +0 -0
  50. {euroeval-15.7.1 → euroeval-15.7.2}/docs/faq.md +0 -0
  51. {euroeval-15.7.1 → euroeval-15.7.2}/docs/gfx/favicon.png +0 -0
  52. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  53. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  54. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/english.md +0 -0
  55. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  56. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/french.md +0 -0
  57. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/german.md +0 -0
  58. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  59. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  60. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  61. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  62. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  63. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/european.md +0 -0
  64. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  65. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  66. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  67. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/README.md +0 -0
  68. {euroeval-15.7.1 → euroeval-15.7.2}/docs/methodology.md +0 -0
  69. {euroeval-15.7.1 → euroeval-15.7.2}/docs/python-package.md +0 -0
  70. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/README.md +0 -0
  71. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/common-sense-reasoning.md +0 -0
  72. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/knowledge.md +0 -0
  73. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/linguistic-acceptability.md +0 -0
  74. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/named-entity-recognition.md +0 -0
  75. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/reading-comprehension.md +0 -0
  76. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/sentiment-classification.md +0 -0
  77. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/speed.md +0 -0
  78. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/summarization.md +0 -0
  79. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/euroeval.png +0 -0
  80. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/euroeval.xcf +0 -0
  81. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/scandeval.png +0 -0
  82. {euroeval-15.7.1 → euroeval-15.7.2}/makefile +0 -0
  83. {euroeval-15.7.1 → euroeval-15.7.2}/mkdocs.yaml +0 -0
  84. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/__init__.py +0 -0
  85. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  86. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/base.py +0 -0
  87. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  88. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  89. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/callbacks.py +0 -0
  90. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/cli.py +0 -0
  91. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/constants.py +0 -0
  92. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/data_loading.py +0 -0
  93. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/data_models.py +0 -0
  94. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/danish.py +0 -0
  95. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/dutch.py +0 -0
  96. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/english.py +0 -0
  97. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  98. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/french.py +0 -0
  99. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/german.py +0 -0
  100. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  101. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/italian.py +0 -0
  102. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  103. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  104. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  105. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/enums.py +0 -0
  106. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/exceptions.py +0 -0
  107. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/finetuning.py +0 -0
  108. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/generation.py +0 -0
  109. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/generation_utils.py +0 -0
  110. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/human_evaluation.py +0 -0
  111. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_cache.py +0 -0
  112. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_config.py +0 -0
  113. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_loading.py +0 -0
  114. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  115. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  116. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  117. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  118. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  119. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  120. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  121. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/scores.py +0 -0
  122. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/speed_benchmark.py +0 -0
  123. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  124. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  125. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  126. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  127. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  128. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tasks.py +0 -0
  129. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/types.py +0 -0
  130. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/utils.py +0 -0
  131. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/constants.py +0 -0
  132. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_allocine.py +0 -0
  133. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_angry_tweets.py +0 -0
  134. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_arc.py +0 -0
  135. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_arc_is.py +0 -0
  136. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_belebele.py +0 -0
  137. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_cnn_dailymail.py +0 -0
  138. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_en.py +0 -0
  139. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_es.py +0 -0
  140. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_nl.py +0 -0
  141. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dane.py +0 -0
  142. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  143. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dansk.py +0 -0
  144. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danske_talemaader.py +0 -0
  145. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  146. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dbrd.py +0 -0
  147. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dutch_cola.py +0 -0
  148. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_eltec.py +0 -0
  149. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fone.py +0 -0
  150. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_foqa.py +0 -0
  151. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fosent.py +0 -0
  152. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fquad.py +0 -0
  153. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_germanquad.py +0 -0
  154. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_germeval.py +0 -0
  155. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag.py +0 -0
  156. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  157. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_ice_linguistic.py +0 -0
  158. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  159. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  160. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_qa.py +0 -0
  161. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icesum.py +0 -0
  162. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_ilpost_sum.py +0 -0
  163. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_jentoft.py +0 -0
  164. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mim_gold_ner.py +0 -0
  165. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlqa_es.py +0 -0
  166. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlsum_de.py +0 -0
  167. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlsum_es.py +0 -0
  168. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mmlu.py +0 -0
  169. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_multinerd-it.py +0 -0
  170. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_no_cola.py +0 -0
  171. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_no_sammendrag.py +0 -0
  172. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  173. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nordjylland_news.py +0 -0
  174. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norec.py +0 -0
  175. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norglm_multiqa.py +0 -0
  176. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norglm_multisum.py +0 -0
  177. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norne.py +0 -0
  178. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norquad.py +0 -0
  179. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nqii.py +0 -0
  180. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  181. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_orange_sum.py +0 -0
  182. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_personal_sum.py +0 -0
  183. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_rrn.py +0 -0
  184. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sb10k.py +0 -0
  185. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandiqa.py +0 -0
  186. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_schibsted.py +0 -0
  187. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  188. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sentipolc16.py +0 -0
  189. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad.py +0 -0
  190. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_it.py +0 -0
  191. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_nl.py +0 -0
  192. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_nl_old.py +0 -0
  193. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sst5.py +0 -0
  194. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_suc3.py +0 -0
  195. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_swedn.py +0 -0
  196. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_swerec.py +0 -0
  197. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  198. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wikiann_fo.py +0 -0
  199. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wikineural-it.py +0 -0
  200. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_winogrande_is.py +0 -0
  201. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.7.1 → euroeval-15.7.2}/tests/__init__.py +0 -0
  206. {euroeval-15.7.1 → euroeval-15.7.2}/tests/conftest.py +0 -0
  207. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmarker.py +0 -0
  215. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_callbacks.py +0 -0
  216. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_cli.py +0 -0
  217. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_constants.py +0 -0
  218. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_data_models.py +0 -0
  220. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_enums.py +0 -0
  222. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_generation.py +0 -0
  225. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_languages.py +0 -0
  227. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_config.py +0 -0
  229. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_scores.py +0 -0
  231. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_tasks.py +0 -0
  238. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_types.py +0 -0
  240. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_utils.py +0 -0
{euroeval-15.7.1 → euroeval-15.7.2}/.pre-commit-config.yaml

@@ -10,7 +10,7 @@ repos:
  - id: trailing-whitespace
  - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.11.8
  hooks:
  - id: ruff
  args:
{euroeval-15.7.1 → euroeval-15.7.2}/CHANGELOG.md

@@ -10,6 +10,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v15.7.2] - 2025-05-02
+ ### Fixed
+ - Now does not check if a model exists if it has already been evaluated. This is an
+ issue when evaluating Ollama models, if the Ollama server is not running.
+ - When evaluating instruction-tuned models on text classification tasks, the chat
+ template sometimes ends with special symbols, such as a newline, which can change the
+ tokenisation of the generated label. When we are evaluating the model using logprobs
+ we are thus looking for the wrong label in these cases. We now take this into account,
+ and log it to the user if the labels are not found, to avoid confusion.
+ - Finnish datasets were not included in the default "all" dataset list, which is the
+ default used when no datasets are specified. This has been fixed now.
+ - Temporarily disabled HellaSwag-fi, as there is an issue with the labels in the test
+ split, causing errors during evaluation. We will re-enable in a future release, when
+ this has been fixed.
+
+
  ## [v15.7.1] - 2025-04-29
  ### Changed
  - Marked the DBRD Dutch sentiment classification as official, as the quality is
{euroeval-15.7.1 → euroeval-15.7.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.7.1
+ Version: 15.7.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
{euroeval-15.7.1 → euroeval-15.7.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "15.7.1"
+ version = "15.7.2"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py

@@ -238,7 +238,7 @@ def prepare_languages(
  The default language codes of the languages to include.

  Returns:
- The prepared model or dataset languages.
+ The prepared dataset languages.
  """
  # Create a dictionary that maps languages to their associated language objects
  language_mapping = get_all_languages()
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py

@@ -1007,6 +1007,10 @@ def try_download_ollama_model(model_id: str) -> bool:

  Returns:
  Whether the model was downloaded successfully.
+
+ Raises:
+ InvalidModel:
+ If Ollama is not running or the model cannot be downloaded.
  """
  if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
  return False
@@ -1021,11 +1025,17 @@
  level=logging.WARNING,
  )

- downloaded_ollama_models: list[str] = [
- model_obj.model
- for model_obj in ollama.list().models
- if model_obj.model is not None
- ]
+ try:
+ downloaded_ollama_models: list[str] = [
+ model_obj.model
+ for model_obj in ollama.list().models
+ if model_obj.model is not None
+ ]
+ except ConnectionError:
+ raise InvalidModel(
+ "Ollama does not seem to be running, so we cannot evaluate the model "
+ f"{model_id!r}. Please make sure that Ollama is running and try again."
+ )

  ollama_model_id = "/".join(model_id.split("/")[1:])
  if ollama_model_id not in downloaded_ollama_models:
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/vllm.py

@@ -797,7 +797,7 @@ def load_model_and_tokenizer(
  enable_lora=model_config.adapter_base_model_id is not None,
  max_lora_rank=256,
  )
- except (ValueError, OSError) as e:
+ except (RuntimeError, ValueError, OSError) as e:
  if "awaiting a review from the repo authors" in str(e):
  raise InvalidModel(
  f"The model {model_id!r} is awaiting a review from the repository "
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmarker.py

@@ -372,15 +372,7 @@ class Benchmarker:

  current_benchmark_results: list[BenchmarkResult] = list()
  for model_id in model_ids:
- try:
- model_config = get_model_config(
- model_id=model_id, benchmark_config=benchmark_config
- )
- except InvalidModel as e:
- logger.info(e.message)
- num_finished_benchmarks += len(dataset_configs)
- continue
-
+ model_config: ModelConfig | None = None
  loaded_model: BenchmarkModule | None = None
  for dataset_config in dataset_configs:
  # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +386,22 @@ class Benchmarker:
  ):
  logger.debug(
  f"Skipping benchmarking {model_id} on "
- f"{dataset_config.pretty_name}, as it "
- "has already been benchmarked."
+ f"{dataset_config.pretty_name}, as it has already been "
+ "benchmarked."
  )
  num_finished_benchmarks += 1
  continue

+ if model_config is None:
+ try:
+ model_config = get_model_config(
+ model_id=model_id, benchmark_config=benchmark_config
+ )
+ except InvalidModel as e:
+ logger.info(e.message)
+ num_finished_benchmarks += len(dataset_configs)
+ continue
+
  # Skip if the model is an encoder model and the task is generative
  task_is_generative = (
  dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
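The two benchmarker hunks above implement the first CHANGELOG fix: the model config is now resolved lazily, only when a dataset still needs benchmarking, so a model whose results are all cached never triggers the existence check (and, for Ollama models, no call to a possibly stopped Ollama server). A self-contained sketch of the pattern, using stand-in names rather than EuroEval's actual API:

def benchmark_all(model_ids, dataset_names, already_benchmarked):
    """Illustrative only: resolve the model config lazily inside the dataset loop."""
    for model_id in model_ids:
        config = None  # not resolved until some dataset actually needs it
        for dataset_name in dataset_names:
            if (model_id, dataset_name) in already_benchmarked:
                continue  # cached result: no config lookup, no server contact
            if config is None:
                config = {"model_id": model_id}  # stand-in for a lookup that may fail
            print(f"benchmarking {config['model_id']} on {dataset_name}")


benchmark_all(
    model_ids=["ollama_chat/llama3"],
    dataset_names=["scala-fi", "scandisent-fi"],
    already_benchmarked={("ollama_chat/llama3", "scala-fi")},
)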
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/__init__.py

@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
  from .dutch import * # noqa: F403
  from .english import * # noqa: F403
  from .faroese import * # noqa: F403
+ from .finnish import * # noqa: F403
  from .french import * # noqa: F403
  from .german import * # noqa: F403
  from .icelandic import * # noqa: F403
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/finnish.py

@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import FI
- from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+ from ..tasks import LA, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -40,14 +40,16 @@ XLSUM_FI_CONFIG = DatasetConfig(
  languages=[FI],
  )

- HELLASWAG_FI_CONFIG = DatasetConfig(
- name="hellaswag-fi",
- pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
- "HellaSwag-fi, translated from the English HellaSwag dataset",
- huggingface_id="EuroEval/hellaswag-fi-mini",
- task=COMMON_SENSE,
- languages=[FI],
- )
+ # TODO: Include when this issue has been resolved:
+ # https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
+ # HELLASWAG_FI_CONFIG = DatasetConfig(
+ # name="hellaswag-fi",
+ # pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+ # "HellaSwag-fi, translated from the English HellaSwag dataset",
+ # huggingface_id="EuroEval/hellaswag-fi-mini",
+ # task=COMMON_SENSE,
+ # languages=[FI],
+ # )

  SCALA_FI_CONFIG = DatasetConfig(
  name="scala-fi",
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/languages.py

@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
  DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
  NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
  EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+ FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
  FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
  FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
  DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
  ET = Language(code="et", name="Estonian")
  EE = Language(code="ee", name="Ewe")
  FJ = Language(code="fj", name="Fijian")
- FI = Language(code="fi", name="Finnish")
  FY = Language(code="fy", name="Western Frisian")
  FF = Language(code="ff", name="Fulah")
  GD = Language(code="gd", name="Gaelic")
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/sequence_classification.py

@@ -132,6 +132,11 @@ def extract_labels_from_generation(
  The predicted labels.
  """
  if model_output.scores is not None:
+ if first_label_token_mapping is False:
+ raise InvalidBenchmark(
+ "The model outputted logprobs, but the first label token mapping is "
+ "not provided. This means that the model should not output logprobs."
+ )
  labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
  dataset_config=dataset_config,
@@ -147,7 +152,7 @@
  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
  dataset_config: "DatasetConfig",
- first_label_token_mapping: dict[str, str] | bool,
+ first_label_token_mapping: dict[str, str] | t.Literal[True],
  ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@
  The configuration of the dataset.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
- Boolean value indicating whether the model should output scores (if the
- mapping is outputted then the model will always output scores).
+ `True` value indicating that the model should output logprobs.

  Returns:
  The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@
  # label, as the output label
  output_label: str | None = None
  for generated_label in generated_labels:
- # Get the candidate labels that starts with the generated label
+ # Get the candidate labels. If we have a first label token mapping, we
+ # use it to get the candidate labels. Otherwise, we check if any of the
+ # labels start with the generated label.
  if isinstance(first_label_token_mapping, dict):
  if any(
  candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@
  )
  return None

- # If no candidate label is found, we ignore the generated label, as it
- # basically means that the model is just really bad at generating
- # labels.
+ # If no candidate label is found, we first check if any of the labels
+ # start with the generated label. This could be the case if the labels
+ # in the first token mapping is inaccurate or incomplete, for instance
+ # if 'pos' is in the first label token mapping, but the model outputted
+ # 'posit'. If this is the case then we cannot trust the first label
+ # token mapping, and we fall back to using word edit distance.
+ # Otherwise, the generated label is just bad, and we skip to the next
+ # generated label.
  elif len(candidate_output_labels) == 0:
- logger.debug(
- f"No candidate label found for the generated label "
- f"{generated_label!r}. The generated label is thus ignored."
- )
+ candidate_output_labels_starting_with_generated_label = [
+ candidate_label
+ for candidate_label in candidate_labels
+ if candidate_label.startswith(generated_label)
+ ]
+ if candidate_output_labels_starting_with_generated_label:
+ log_once(
+ f"No candidate label found for the generated label "
+ f"{generated_label!r}. This means that using logprobs to "
+ "extract the labels is not reliable, and we will instead "
+ "fall back to extracting the labels using word edit "
+ "distance.",
+ level=logging.DEBUG,
+ )
+ return None
+
+ # If we did not find any candidate label for any of the generated labels, we
+ # assume that something is wrong with the model output, and we fall back to
+ # using word edit distance to extract the labels
+ else:
+ log_once(
+ f"No candidate label found for any of the generated labels "
+ f"{generated_labels}. This means that using logprobs to extract "
+ "the labels is not reliable, and we will instead fall back to "
+ "extracting the labels using word edit distance.",
+ level=logging.DEBUG,
+ )
+ return None

  if output_label is not None:
  output_labels.append(output_label)
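When `None` is returned here, the caller stops trusting the logprobs and instead matches the full generated text against the candidate labels by word edit distance. As a rough illustration of that kind of fallback (using the standard library's difflib similarity ratio rather than EuroEval's actual edit-distance code):

import difflib

def closest_label(generated: str, candidate_labels: list[str]) -> str:
    """Return the candidate label most similar to the generated text."""
    # cutoff=0.0 guarantees that the best-scoring label is always returned,
    # even when the generation is far from every candidate.
    return difflib.get_close_matches(
        generated.strip().lower(), candidate_labels, n=1, cutoff=0.0
    )[0]

print(closest_label("posit", ["positive", "negative", "neutral"]))  # positive
print(closest_label("Neutraali!", ["positive", "negative", "neutral"]))  # neutral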
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tokenization_utils.py

@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
  for label in dataset_config.labels
  ]

- # Get the first token of each label, where we add a prefix space if needed
- add_prefix_space = (
- should_prefix_space_be_added_to_labels(
+ # Tokenize some text containing each label, which we will use to extract the
+ # first token of each label
+ all_tokens: list[list[str]]
+ if tokenizer.chat_template is None:
+ add_prefix_space = should_prefix_space_be_added_to_labels(
  labels_to_be_generated=local_labels, tokenizer=tokenizer
  )
- and tokenizer.chat_template is None
- )
- first_tokens = [
- tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
- for label in local_labels
- ]
- first_tokens = [
- re.sub(
- pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
- )
- for token in first_tokens
+ all_tokens = [
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+ for label in local_labels
+ ]
+ else:
+ all_tokens = [
+ tokenizer.convert_ids_to_tokens(
+ ids=tokenizer.apply_chat_template(
+ conversation=[
+ dict(role="user", content=""),
+ dict(role="assistant", content=label),
+ ],
+ add_generation_prompt=True,
+ tokenize=True,
+ )
+ )
+ for label in local_labels
+ ]
+
+ # Remove any non-alphabetic characters from the tokens
+ all_tokens = [
+ [
+ re.sub(
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+ repl="",
+ string=token.lower(),
+ )
+ for token in token_list
+ ]
+ for token_list in all_tokens
  ]

+ # Extract the first token of each label
+ first_tokens: list[str] = list()
+ for token_list, label in zip(all_tokens, local_labels):
+ matching_tokens = [
+ tok for tok in token_list if tok and label.startswith(tok)
+ ]
+ if not matching_tokens:
+ log_once(
+ f"No matching token found in token_list for label '{label}', so "
+ "we will not output scores.",
+ level=logging.DEBUG,
+ )
+ return False
+ first_tokens.append(matching_tokens[0])
+
  # Build a mapping from labels to the first token in each label if the first
  # tokens are distinct
  if len(first_tokens) == len(set(first_tokens)):
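The chat-template branch added above exists because, for instruction-tuned models, the first token of a label can differ between tokenising the label on its own and tokenising it as an assistant turn: the template may end in a newline or other special symbols that get merged into the label's first token. A hedged sketch of how one could observe the difference with a Hugging Face tokenizer, mirroring the calls used in the diff (the model id is only a placeholder; any model with a chat template will do):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder
label = "positive"

# The label tokenised on its own, with a leading space as for base models.
plain_tokens = tokenizer.tokenize(" " + label)

# The label tokenised as an assistant reply, i.e. through the chat template.
ids = tokenizer.apply_chat_template(
    conversation=[
        {"role": "user", "content": ""},
        {"role": "assistant", "content": label},
    ],
    add_generation_prompt=True,
    tokenize=True,
)
chat_tokens = tokenizer.convert_ids_to_tokens(ids)

# The label's first token inside chat_tokens may differ from plain_tokens[0],
# which is why the mapping is now built from the chat-template tokenisation
# whenever a chat template is present.
print(plain_tokens)
print(chat_tokens)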
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish HellaSwag-mini dataset and upload it to the HF Hub."""

  from collections import Counter
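The comment block added at the top of this script (and of the other Finnish dataset scripts below) is inline script metadata as specified by PEP 723: tools that understand it, such as `uv run`, can typically resolve the listed dependencies on the fly when the script is executed. A minimal standalone example of the format (the pandas pin here is illustrative only):

# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
#     "pandas==2.2.0",
# ]
# ///

"""Minimal PEP 723 example: the header above declares the script's own dependencies."""

import pandas as pd

print(pd.DataFrame({"label": ["positive", "negative"]}))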
@@ -30,12 +40,16 @@ def main() -> None:
  repo_id = "Finnish-NLP/hellaswag-fi-google-translate"

  dataset = load_dataset(path=repo_id, token=True)
- dfs = {}
- for split in ["train", "validation", "test"]:
- df = dataset[split].to_pandas()
+ assert isinstance(dataset, DatasetDict)

- df["endings"] = df["endings"].apply(process_endings)
+ splits = ["train", "validation", "test"]
+ assert list(dataset.keys()) == splits

+ dfs: dict[str, pd.DataFrame] = dict()
+ for split in splits:
+ df = dataset[split].to_pandas()
+ assert isinstance(df, pd.DataFrame)
+ df.endings = df.endings.apply(process_endings)
  df = process_split(df=df, split=split)
  dfs[split] = df

{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scala.py

@@ -2,7 +2,6 @@
  # requires-python = ">=3.10,<4.0"
  # dependencies = [
  # "datasets==3.5.0",
- # "euroeval",
  # "huggingface-hub==0.24.0",
  # "pandas==2.2.0",
  # "requests==2.32.3",
@@ -41,14 +40,9 @@ from pandas.errors import SettingWithCopyWarning
  from requests.exceptions import HTTPError
  from tqdm.auto import tqdm

- from euroeval.utils import block_terminal_output
-

  def main() -> None:
  """Create the ScaLA datasets and upload them to the HF Hub."""
- # Block terminal output
- block_terminal_output()
-
  # Set up the POS dataset loaders
  pos_datasets = {
  "da": load_dadt_pos,
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandisent_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish part of the ScandiSent dataset and upload it to the HF Hub."""

  import pandas as pd
@@ -14,7 +24,7 @@ def main() -> None:

  # Download the dataset
  dataset = load_dataset(path=repo_id, token=True, split="train")
- assert isinstance(dataset, DatasetDict)
+ assert isinstance(dataset, Dataset)

  # Convert the dataset to a dataframe
  df = dataset.to_pandas()
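The corrected assertion reflects how `datasets.load_dataset` behaves: called without a `split` argument it returns a `DatasetDict` keyed by split name, while `split="train"` returns a single `Dataset`. A small sketch of the distinction (the dataset id is just a placeholder):

from datasets import Dataset, DatasetDict, load_dataset

dataset_id = "imdb"  # placeholder public dataset

all_splits = load_dataset(dataset_id)            # every split
assert isinstance(all_splits, DatasetDict)

train = load_dataset(dataset_id, split="train")  # a single split
assert isinstance(train, Dataset)

df = train.to_pandas()  # a single Dataset converts straight to a DataFrame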
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_turku_ner_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish Turku NER dataset and upload it to the HF Hub."""

  import pandas as pd
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_tydiqa_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the TydiQA-mini Finnish dataset and upload it to the HF Hub."""

  import pandas as pd
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xlsum_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish version of the XLSum summarisation dataset."""

  import pandas as pd
@@ -11,7 +21,7 @@ def main() -> None:
  """Create the Finnish XL-Sum dataset and upload to HF Hub."""
  dataset_id = "TurkuNLP/xlsum-fi"

- dataset = load_dataset(dataset_id)
+ dataset = load_dataset(dataset_id, trust_remote_code=True, token=True)
  assert isinstance(dataset, DatasetDict)

  dataset = dataset.rename_columns(column_mapping=dict(summary="target_text"))