ScandEval 16.6.0__tar.gz → 16.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (348) hide show
  1. {scandeval-16.6.0 → scandeval-16.7.1}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +1 -0
  2. {scandeval-16.6.0 → scandeval-16.7.1}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +1 -1
  3. {scandeval-16.6.0 → scandeval-16.7.1}/.pre-commit-config.yaml +3 -3
  4. {scandeval-16.6.0 → scandeval-16.7.1}/CHANGELOG.md +53 -2
  5. {scandeval-16.6.0 → scandeval-16.7.1}/PKG-INFO +47 -11
  6. {scandeval-16.6.0 → scandeval-16.7.1}/README.md +46 -10
  7. scandeval-16.7.1/docs/datasets/bosnian.md +306 -0
  8. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/estonian.md +2 -0
  9. scandeval-16.6.0/docs/datasets/slovenian.md → scandeval-16.7.1/docs/datasets/slovene.md +6 -6
  10. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/spanish.md +5 -5
  11. scandeval-16.7.1/docs/leaderboards/Monolingual/bulgarian.md +26 -0
  12. scandeval-16.7.1/docs/leaderboards/Monolingual/croatian.md +26 -0
  13. scandeval-16.7.1/docs/leaderboards/Monolingual/greek.md +26 -0
  14. scandeval-16.7.1/docs/leaderboards/Monolingual/serbian.md +26 -0
  15. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/slavic.md +1 -1
  16. {scandeval-16.6.0 → scandeval-16.7.1}/docs/methodology.md +0 -2
  17. {scandeval-16.6.0 → scandeval-16.7.1}/pyproject.toml +1 -1
  18. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_config_factory.py +9 -14
  19. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/base.py +5 -1
  20. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/litellm.py +120 -16
  21. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/vllm.py +102 -43
  22. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmarker.py +108 -51
  23. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/cli.py +26 -37
  24. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/constants.py +18 -15
  25. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/data_models.py +50 -17
  26. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/__init__.py +2 -1
  27. scandeval-16.7.1/src/scandeval/dataset_configs/bosnian.py +39 -0
  28. scandeval-16.6.0/src/scandeval/dataset_configs/slovenian.py → scandeval-16.7.1/src/scandeval/dataset_configs/slovene.py +8 -8
  29. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/finetuning.py +2 -2
  30. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/generation.py +23 -11
  31. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/generation_utils.py +25 -14
  32. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/languages.py +2 -2
  33. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/logging_utils.py +43 -25
  34. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/huggingface.py +7 -19
  35. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/llm_as_a_judge.py +1 -0
  36. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/model_cache.py +11 -2
  37. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/linguistic_acceptability.py +2 -2
  38. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/multiple_choice.py +2 -2
  39. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/named_entity_recognition.py +27 -2
  40. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/reading_comprehension.py +15 -2
  41. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/sentiment_classification.py +16 -2
  42. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/summarization.py +9 -0
  43. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/question_answering.py +11 -4
  44. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/tokenisation_utils.py +4 -13
  45. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/utils.py +50 -0
  46. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_lr_sum.py +1 -1
  47. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mms.py +1 -1
  48. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_multi_wiki_qa.py +1 -0
  49. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sentinews.py +2 -2
  50. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_ssj500k_ner.py +3 -3
  51. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_wikiann.py +1 -1
  52. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/load_ud_pos.py +15 -15
  53. {scandeval-16.6.0 → scandeval-16.7.1}/tests/conftest.py +2 -3
  54. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_benchmark_config_factory.py +3 -5
  55. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_benchmarker.py +9 -9
  56. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_cli.py +5 -6
  57. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_constants.py +1 -1
  58. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_data_models.py +18 -16
  59. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_finetuning.py +3 -1
  60. {scandeval-16.6.0 → scandeval-16.7.1}/uv.lock +3 -2
  61. {scandeval-16.6.0 → scandeval-16.7.1}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  62. {scandeval-16.6.0 → scandeval-16.7.1}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  63. {scandeval-16.6.0 → scandeval-16.7.1}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
  64. {scandeval-16.6.0 → scandeval-16.7.1}/.github/workflows/ci.yaml +0 -0
  65. {scandeval-16.6.0 → scandeval-16.7.1}/.gitignore +0 -0
  66. {scandeval-16.6.0 → scandeval-16.7.1}/.markdownlint.jsonc +0 -0
  67. {scandeval-16.6.0 → scandeval-16.7.1}/CITATION.cff +0 -0
  68. {scandeval-16.6.0 → scandeval-16.7.1}/CODE_OF_CONDUCT.md +0 -0
  69. {scandeval-16.6.0 → scandeval-16.7.1}/CONTRIBUTING.md +0 -0
  70. {scandeval-16.6.0 → scandeval-16.7.1}/Dockerfile.cuda +0 -0
  71. {scandeval-16.6.0 → scandeval-16.7.1}/LICENSE +0 -0
  72. {scandeval-16.6.0 → scandeval-16.7.1}/NEW_DATASET_GUIDE.md +0 -0
  73. {scandeval-16.6.0 → scandeval-16.7.1}/docs/CNAME +0 -0
  74. {scandeval-16.6.0 → scandeval-16.7.1}/docs/README.md +0 -0
  75. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/README.md +0 -0
  76. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/bulgarian.md +0 -0
  77. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/croatian.md +0 -0
  78. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/czech.md +0 -0
  79. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/danish.md +0 -0
  80. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/dutch.md +0 -0
  81. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/english.md +0 -0
  82. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/faroese.md +0 -0
  83. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/finnish.md +0 -0
  84. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/french.md +0 -0
  85. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/german.md +0 -0
  86. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/greek.md +0 -0
  87. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/icelandic.md +0 -0
  88. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/italian.md +0 -0
  89. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/latvian.md +0 -0
  90. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/lithuanian.md +0 -0
  91. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/norwegian.md +0 -0
  92. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/polish.md +0 -0
  93. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/portuguese.md +0 -0
  94. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/serbian.md +0 -0
  95. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/slovak.md +0 -0
  96. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/swedish.md +0 -0
  97. {scandeval-16.6.0 → scandeval-16.7.1}/docs/datasets/ukrainian.md +0 -0
  98. {scandeval-16.6.0 → scandeval-16.7.1}/docs/extras/radial_plotter.md +0 -0
  99. {scandeval-16.6.0 → scandeval-16.7.1}/docs/faq.md +0 -0
  100. {scandeval-16.6.0 → scandeval-16.7.1}/docs/gfx/favicon.png +0 -0
  101. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/czech.md +0 -0
  102. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/danish.md +0 -0
  103. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/dutch.md +0 -0
  104. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/english.md +0 -0
  105. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/estonian.md +0 -0
  106. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/faroese.md +0 -0
  107. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/finnish.md +0 -0
  108. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/french.md +0 -0
  109. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/german.md +0 -0
  110. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  111. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/italian.md +0 -0
  112. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/latvian.md +0 -0
  113. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/lithuanian.md +0 -0
  114. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  115. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/polish.md +0 -0
  116. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  117. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/slovak.md +0 -0
  118. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/spanish.md +0 -0
  119. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/swedish.md +0 -0
  120. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Monolingual/ukrainian.md +0 -0
  121. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/baltic.md +0 -0
  122. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/european.md +0 -0
  123. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/finnic.md +0 -0
  124. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/germanic.md +0 -0
  125. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  126. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/Multilingual/romance.md +0 -0
  127. {scandeval-16.6.0 → scandeval-16.7.1}/docs/leaderboards/README.md +0 -0
  128. {scandeval-16.6.0 → scandeval-16.7.1}/docs/python-package.md +0 -0
  129. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/README.md +0 -0
  130. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/common-sense-reasoning.md +0 -0
  131. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/knowledge.md +0 -0
  132. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/linguistic-acceptability.md +0 -0
  133. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/named-entity-recognition.md +0 -0
  134. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/reading-comprehension.md +0 -0
  135. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/sentiment-classification.md +0 -0
  136. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/speed.md +0 -0
  137. {scandeval-16.6.0 → scandeval-16.7.1}/docs/tasks/summarization.md +0 -0
  138. {scandeval-16.6.0 → scandeval-16.7.1}/gfx/euroeval.png +0 -0
  139. {scandeval-16.6.0 → scandeval-16.7.1}/gfx/euroeval.xcf +0 -0
  140. {scandeval-16.6.0 → scandeval-16.7.1}/gfx/scandeval.png +0 -0
  141. {scandeval-16.6.0 → scandeval-16.7.1}/makefile +0 -0
  142. {scandeval-16.6.0 → scandeval-16.7.1}/mkdocs.yaml +0 -0
  143. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/__init__.py +0 -0
  144. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/__init__.py +0 -0
  145. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/fresh.py +0 -0
  146. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/benchmark_modules/hf.py +0 -0
  147. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/caching_utils.py +0 -0
  148. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/callbacks.py +0 -0
  149. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/data_loading.py +0 -0
  150. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/bulgarian.py +0 -0
  151. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/croatian.py +0 -0
  152. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/czech.py +0 -0
  153. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/danish.py +0 -0
  154. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/dutch.py +0 -0
  155. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/english.py +0 -0
  156. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/estonian.py +0 -0
  157. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/faroese.py +0 -0
  158. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/finnish.py +0 -0
  159. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/french.py +0 -0
  160. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/german.py +0 -0
  161. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/greek.py +0 -0
  162. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/icelandic.py +0 -0
  163. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/italian.py +0 -0
  164. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/latvian.py +0 -0
  165. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/lithuanian.py +0 -0
  166. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/norwegian.py +0 -0
  167. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/polish.py +0 -0
  168. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/portuguese.py +0 -0
  169. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/serbian.py +0 -0
  170. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/slovak.py +0 -0
  171. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/spanish.py +0 -0
  172. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/swedish.py +0 -0
  173. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/dataset_configs/ukrainian.py +0 -0
  174. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/enums.py +0 -0
  175. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/exceptions.py +0 -0
  176. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/__init__.py +0 -0
  177. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/base.py +0 -0
  178. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/pipeline.py +0 -0
  179. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/metrics/speed.py +0 -0
  180. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/model_config.py +0 -0
  181. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/model_loading.py +0 -0
  182. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/__init__.py +0 -0
  183. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/classification.py +0 -0
  184. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/prompt_templates/token_classification.py +0 -0
  185. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/scores.py +0 -0
  186. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/speed_benchmark.py +0 -0
  187. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/__init__.py +0 -0
  188. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/multiple_choice_classification.py +0 -0
  189. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/sequence_classification.py +0 -0
  190. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/text_to_text.py +0 -0
  191. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/task_group_utils/token_classification.py +0 -0
  192. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/tasks.py +0 -0
  193. {scandeval-16.6.0 → scandeval-16.7.1}/src/scandeval/types.py +0 -0
  194. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/__init__.py +0 -0
  195. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/constants.py +0 -0
  196. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_allocine.py +0 -0
  197. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_angry_tweets.py +0 -0
  198. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_arc.py +0 -0
  199. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_arc_is.py +0 -0
  200. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_belebele.py +0 -0
  201. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_bg_ner_bsnlp.py +0 -0
  202. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_boolq_pt.py +0 -0
  203. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_cinexio.py +0 -0
  204. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_cnn_dailymail.py +0 -0
  205. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_conll_en.py +0 -0
  206. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_conll_es.py +0 -0
  207. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_conll_nl.py +0 -0
  208. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_copa_lv.py +0 -0
  209. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_cross_domain_uk_reviews.py +0 -0
  210. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_cs_gec.py +0 -0
  211. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_csfd_sentiment.py +0 -0
  212. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_csfd_sentiment_sk.py +0 -0
  213. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_czech_news.py +0 -0
  214. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_dane.py +0 -0
  215. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_danish_citizen_tests.py +0 -0
  216. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_dansk.py +0 -0
  217. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_danske_talemaader.py +0 -0
  218. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_danske_talemaader_old.py +0 -0
  219. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_dbrd.py +0 -0
  220. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_dutch_cola.py +0 -0
  221. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_elner.py +0 -0
  222. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_eltec.py +0 -0
  223. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_err_news.py +0 -0
  224. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_estner.py +0 -0
  225. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_estonian_valence.py +0 -0
  226. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_european_values.py +0 -0
  227. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_exam_et.py +0 -0
  228. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_exams_bg.py +0 -0
  229. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_fone.py +0 -0
  230. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_foqa.py +0 -0
  231. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_fosent.py +0 -0
  232. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_fquad.py +0 -0
  233. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_fullstack_ner.py +0 -0
  234. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_germanquad.py +0 -0
  235. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_germeval.py +0 -0
  236. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_global_mmlu.py +0 -0
  237. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_goldenswag.py +0 -0
  238. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_grammar_et.py +0 -0
  239. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_greek_sa.py +0 -0
  240. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_greek_wikipedia.py +0 -0
  241. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_harem.py +0 -0
  242. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_hellaswag.py +0 -0
  243. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_hellaswag_cs.py +0 -0
  244. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_hellaswag_fi.py +0 -0
  245. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  246. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_ice_linguistic.py +0 -0
  247. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_icelandic_error_corpus.py +0 -0
  248. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_icelandic_knowledge.py +0 -0
  249. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_icelandic_qa.py +0 -0
  250. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_icesum.py +0 -0
  251. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_idioms_no.py +0 -0
  252. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_ilpost_sum.py +0 -0
  253. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_jentoft.py +0 -0
  254. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_kpwr_ner.py +0 -0
  255. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_latvian_lsm_summary.py +0 -0
  256. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  257. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_life_in_the_uk.py +0 -0
  258. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_lithuanian_lrytas_summarization.py +0 -0
  259. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_llmzszl.py +0 -0
  260. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_lt_emotions.py +0 -0
  261. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_lt_history.py +0 -0
  262. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mim_gold_ner.py +0 -0
  263. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mlqa_es.py +0 -0
  264. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mlsum_de.py +0 -0
  265. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mlsum_es.py +0 -0
  266. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mmlu.py +0 -0
  267. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mmlu_et.py +0 -0
  268. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mmlu_hr.py +0 -0
  269. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_mmlu_lv.py +0 -0
  270. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_multinerd-it.py +0 -0
  271. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_ner_uk.py +0 -0
  272. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_no_cola.py +0 -0
  273. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_no_sammendrag.py +0 -0
  274. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_nor_common_sense_qa.py +0 -0
  275. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_nordjylland_news.py +0 -0
  276. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_norec.py +0 -0
  277. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_norglm_multiqa.py +0 -0
  278. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_norglm_multisum.py +0 -0
  279. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_norne.py +0 -0
  280. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_norquad.py +0 -0
  281. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_nqii.py +0 -0
  282. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_nrk_quiz_qa.py +0 -0
  283. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_orange_sum.py +0 -0
  284. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_personal_sum.py +0 -0
  285. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_polemo2.py +0 -0
  286. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_poner.py +0 -0
  287. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_poquad.py +0 -0
  288. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_psc.py +0 -0
  289. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_publico.py +0 -0
  290. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_rrn.py +0 -0
  291. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sb10k.py +0 -0
  292. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_scala.py +0 -0
  293. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_scandiqa.py +0 -0
  294. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_scandisent_fi.py +0 -0
  295. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_schibsted.py +0 -0
  296. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sentiment_headlines_es.py +0 -0
  297. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sentipolc16.py +0 -0
  298. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_skolprov.py +0 -0
  299. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sqad.py +0 -0
  300. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_squad.py +0 -0
  301. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_squad_it.py +0 -0
  302. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_squad_nl.py +0 -0
  303. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_squad_nl_old.py +0 -0
  304. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sst2_pt.py +0 -0
  305. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_sst5.py +0 -0
  306. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_suc3.py +0 -0
  307. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_swedn.py +0 -0
  308. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_swerec.py +0 -0
  309. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_trivia_et.py +0 -0
  310. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_turku_ner_fi.py +0 -0
  311. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_tydiqa_fi.py +0 -0
  312. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_umimeto_qa.py +0 -0
  313. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_uner_sk.py +0 -0
  314. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_uner_sr.py +0 -0
  315. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_wiki_lingua_nl.py +0 -0
  316. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_wikineural-it.py +0 -0
  317. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_winogrande.py +0 -0
  318. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_winogrande_et.py +0 -0
  319. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_winogrande_is.py +0 -0
  320. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_xlsum_fi.py +0 -0
  321. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/create_xquad.py +0 -0
  322. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/fix_dot_env_file.py +0 -0
  323. {scandeval-16.6.0 → scandeval-16.7.1}/src/scripts/versioning.py +0 -0
  324. {scandeval-16.6.0 → scandeval-16.7.1}/tests/__init__.py +0 -0
  325. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_benchmark_modules/__init__.py +0 -0
  326. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_benchmark_modules/test_hf.py +0 -0
  327. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_callbacks.py +0 -0
  328. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_data_loading.py +0 -0
  329. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_dataset_configs.py +0 -0
  330. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_enums.py +0 -0
  331. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_exceptions.py +0 -0
  332. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_languages.py +0 -0
  333. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_model_config.py +0 -0
  334. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_model_loading.py +0 -0
  335. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scores.py +0 -0
  336. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/__init__.py +0 -0
  337. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/__init__.py +0 -0
  338. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_create_scala.py +0 -0
  339. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/de_gsd-ud-train.conllu.adp_det +0 -0
  340. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/empty.file +0 -0
  341. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/en_gum-ud-train.conllu.case +0 -0
  342. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_01 +0 -0
  343. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_02 +0 -0
  344. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_03 +0 -0
  345. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_speed_benchmark.py +0 -0
  346. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_tokenisation_utils.py +0 -0
  347. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_types.py +0 -0
  348. {scandeval-16.6.0 → scandeval-16.7.1}/tests/test_utils.py +0 -0
@@ -25,6 +25,7 @@ body:
25
25
  description: What languages is the dataset in?
26
26
  options:
27
27
  - label: Bulgarian
28
+ - label: Bosnian
28
29
  - label: Croatian
29
30
  - label: Czech
30
31
  - label: Danish
@@ -23,7 +23,7 @@ body:
23
23
  - label: Hellenic languages (Greek)
24
24
  - label: Romance languages (French, Italian, Portuguese, Spanish)
25
25
  - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
26
- - label: Slavic languages (Bulgarian, Croatian, Czech, Polish, Serbian, Slovak, Slovenian, Ukrainian)
26
+ - label: Slavic languages (Bulgarian, Bosnian, Croatian, Czech, Polish, Serbian, Slovak, Slovenian, Ukrainian)
27
27
  - label: West Germanic languages (Dutch, English, German)
28
28
  validations:
29
29
  required: true
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.14.3
13
+ rev: v0.14.5
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -30,7 +30,7 @@ repos:
30
30
  - pyi
31
31
  - jupyter
32
32
  - repo: https://github.com/kynan/nbstripout
33
- rev: 0.8.1
33
+ rev: 0.8.2
34
34
  hooks:
35
35
  - id: nbstripout
36
36
  - repo: https://github.com/pre-commit/mirrors-mypy
@@ -44,7 +44,7 @@ repos:
44
44
  - --show-error-codes
45
45
  - --check-untyped-defs
46
46
  - repo: https://github.com/DavidAnson/markdownlint-cli2
47
- rev: v0.18.1
47
+ rev: v0.19.0
48
48
  hooks:
49
49
  - id: markdownlint-cli2
50
50
  args:
@@ -7,6 +7,57 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [v16.7.1] - 2025-11-18
11
+
12
+ ### Fixed
13
+
14
+ - The `--no-progress-bar` argument (`progress_bar=False` in the `Benchmarker` API) was
15
+ not hiding all the progress bars for generative models. This has been fixed now.
16
+ - Now respects the revision when loading tokenizers with vLLM models. I.e., if
17
+ evaluating a model `<model_id>@<revision>` then we also load the tokenizer from the
18
+ `<revision>` branch.
19
+
20
+ ## [v16.7.0] - 2025-11-10
21
+
22
+ ### Added
23
+
24
+ - Added support for Bosnian 🇧🇦! This includes the sentiment classification dataset
25
+ MMS-bs, the named entity recognition dataset WikiANN-bs, the reading comprehension
26
+ dataset MultiWikiQA-bs, and the summarisation dataset LR-Sum-bs.
27
+ - Now allows the 'low', 'medium' and 'high' reasoning effort parameters for the GPT-OSS
28
+ models, which can be set by appending `#low`, `#medium` or `#high` to the model ID.
29
+
30
+ ### Changed
31
+
32
+ - Improved the support for evaluating models on custom inference API servers. This
33
+ includes the following:
34
+ - We now dynamically reduce the number of concurrent connections if we run into
35
+ issues with too many requests.
36
+ - When benchmarking models on custom servers, we now automatically add the LiteLLM
37
+ prefix `openai/` to the model ID if no prefix is given, as LiteLLM requires this.
38
+ - We don't require the API key to be given if the server does not require it.
39
+ - We added more detailed documentation on how to evaluate models on custom
40
+ inference APIs in the readme.
41
+ - Now always truncates prompts to fit within the model's maximum context length when
42
+ evaluating vLLM models. Previously we only did this when catching the associated
43
+ error, but we cannot do this anymore as vLLM only returns generic errors now.
44
+ - Marked OpenAI's GPT-OSS models as reasoning models when benchmarking them on a custom
45
+ inference server.
46
+
47
+ ### Fixed
48
+
49
+ - When evaluating encoder models on reading comprehension datasets, we now also truncate
50
+ the question in case the model's maximum context length is very small.
51
+ - Now correctly detects the reasoning tokens of the GPT-OSS models.
52
+
53
+ ### Deprecated
54
+
55
+ - Deprecated the `--model-language`, `--dataset-language`, and `--batch-size` arguments
56
+ (and the equivalent ones in the `Benchmarker` API). We now only use the `--language`
57
+ argument for languages, and now use `--finetuning-batch-size` for the batch size. We
58
+ chose this renaming of the batch size argument as it is only used during finetuning,
59
+ and this caused confusion when evaluating generative models.
60
+
10
61
  ## [v16.6.0] - 2025-11-04
11
62
 
12
63
  ### Added
@@ -35,7 +86,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
35
86
 
36
87
  ### Added
37
88
 
38
- - Added support for Slovenian 🇸🇮! This includes the sentiment classification dataset
89
+ - Added support for Slovene 🇸🇮! This includes the sentiment classification dataset
39
90
  Sentinews, the linguistic acceptability dataset ScaLA-sl, the named entity recognition
40
91
  dataset ssj500k-NER, the reading comprehension
41
92
  dataset MultiWikiQA-sl, the knowledge dataset MMLU-sl, and the common-sense reasoning
@@ -2822,7 +2873,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2822
2873
  - Removed support for TensorFlow and Jax models, due to them not working
2823
2874
  properly anyway. They might be included at a later point, properly.
2824
2875
 
2825
- ##  [v1.4.0] - 2021-11-25
2876
+ ## [v1.4.0] - 2021-11-25
2826
2877
 
2827
2878
  ### Changed
2828
2879
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ScandEval
3
- Version: 16.6.0
3
+ Version: 16.7.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -92,7 +92,7 @@ ______________________________________________________________________
92
92
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
93
93
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
94
94
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
95
- [![Code Coverage](https://img.shields.io/badge/Coverage-74%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
95
+ [![Code Coverage](https://img.shields.io/badge/Coverage-69%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
96
96
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
97
97
 
98
98
  ## Maintainer
@@ -228,19 +228,55 @@ sentiment-classification`.
228
228
 
229
229
  If the model you want to benchmark is hosted by a custom inference provider, such as a
230
230
  [vLLM server](https://docs.vllm.ai/en/stable/), then this is also supported in EuroEval.
231
+
231
232
  When benchmarking, you simply have to set the `--api-base` argument (`api_base` when
232
233
  using the `Benchmarker` API) to the URL of the inference API, and optionally the
233
234
  `--api-key` argument (`api_key`) to the API key, if authentication is required.
234
235
 
235
- When benchmarking models hosted on a custom inference API, the model ID
236
- (`--model`/`model`) should be the model name as registered on the inference server,
237
- potentially with a required prefix, depending on the type of inference server used. For
238
- instance, if the model is hosted on a vLLM server, the model ID should be prefixed with
239
- `hosted_vllm/`, and if the model is hosted on an Ollama server, the model ID should be
240
- prefixed with `ollama_chat/`. See the full list of possible inference providers as well
241
- as their corresponding prefixes in the [LiteLLM
242
- documentation](https://docs.litellm.ai/docs/providers/), as EuroEval uses LiteLLM to
243
- handle evaluation of inference APIs in general.
236
+ If you're benchmarking an Ollama model, then you're urged to add the prefix
237
+ `ollama_chat/` to the model name, as that will also fetch model metadata as well as pull
238
+ the model from the Ollama model repository before evaluating it, e.g.:
239
+
240
+ ```bash
241
+ euroeval --model ollama_chat/mymodel --api-base http://localhost:11434
242
+ ```
243
+
244
+ For all other OpenAI-compatible inference APIs, you simply provide the model name as
245
+ is, e.g.:
246
+
247
+ ```bash
248
+ euroeval --model my-model --api-base http://localhost:8000
249
+ ```
250
+
251
+ Again, if the inference API requires authentication, you simply add the `--api-key`
252
+ argument:
253
+
254
+ ```bash
255
+ euroeval --model my-model --api-base http://localhost:8000 --api-key my-secret-key
256
+ ```
257
+
258
+ If your model is a reasoning model, then you need to specify this as follows:
259
+
260
+ ```bash
261
+ euroeval --model my-reasoning-model --api-base http://localhost:8000 --generative-type reasoning
262
+ ```
263
+
264
+ Likewise, if it is a pretrained decoder model (aka a completion model), then you specify
265
+ this as follows:
266
+
267
+ ```bash
268
+ euroeval --model my-base-decoder-model --api-base http://localhost:8000 --generative-type base
269
+ ```
270
+
271
+ When using the `Benchmarker` API, the same applies. Here is an example of benchmarking
272
+ an Ollama model hosted locally:
273
+
274
+ ```python
275
+ >>> benchmarker.benchmark(
276
+ ... model="ollama_chat/mymodel",
277
+ ... api_base="http://localhost:11434",
278
+ ... )
279
+ ```
244
280
 
245
281
  ## Benchmarking in an offline environment
246
282
 
@@ -20,7 +20,7 @@ ______________________________________________________________________
20
20
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
21
21
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
22
22
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
23
- [![Code Coverage](https://img.shields.io/badge/Coverage-74%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
23
+ [![Code Coverage](https://img.shields.io/badge/Coverage-69%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
24
24
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
25
25
 
26
26
  ## Maintainer
@@ -156,19 +156,55 @@ sentiment-classification`.
156
156
 
157
157
  If the model you want to benchmark is hosted by a custom inference provider, such as a
158
158
  [vLLM server](https://docs.vllm.ai/en/stable/), then this is also supported in EuroEval.
159
+
159
160
  When benchmarking, you simply have to set the `--api-base` argument (`api_base` when
160
161
  using the `Benchmarker` API) to the URL of the inference API, and optionally the
161
162
  `--api-key` argument (`api_key`) to the API key, if authentication is required.
162
163
 
163
- When benchmarking models hosted on a custom inference API, the model ID
164
- (`--model`/`model`) should be the model name as registered on the inference server,
165
- potentially with a required prefix, depending on the type of inference server used. For
166
- instance, if the model is hosted on a vLLM server, the model ID should be prefixed with
167
- `hosted_vllm/`, and if the model is hosted on an Ollama server, the model ID should be
168
- prefixed with `ollama_chat/`. See the full list of possible inference providers as well
169
- as their corresponding prefixes in the [LiteLLM
170
- documentation](https://docs.litellm.ai/docs/providers/), as EuroEval uses LiteLLM to
171
- handle evaluation of inference APIs in general.
164
+ If you're benchmarking an Ollama model, then you're urged to add the prefix
165
+ `ollama_chat/` to the model name, as that will also fetch model metadata as well as pull
166
+ the model from the Ollama model repository before evaluating it, e.g.:
167
+
168
+ ```bash
169
+ euroeval --model ollama_chat/mymodel --api-base http://localhost:11434
170
+ ```
171
+
172
+ For all other OpenAI-compatible inference APIs, you simply provide the model name as
173
+ is, e.g.:
174
+
175
+ ```bash
176
+ euroeval --model my-model --api-base http://localhost:8000
177
+ ```
178
+
179
+ Again, if the inference API requires authentication, you simply add the `--api-key`
180
+ argument:
181
+
182
+ ```bash
183
+ euroeval --model my-model --api-base http://localhost:8000 --api-key my-secret-key
184
+ ```
185
+
186
+ If your model is a reasoning model, then you need to specify this as follows:
187
+
188
+ ```bash
189
+ euroeval --model my-reasoning-model --api-base http://localhost:8000 --generative-type reasoning
190
+ ```
191
+
192
+ Likewise, if it is a pretrained decoder model (aka a completion model), then you specify
193
+ this as follows:
194
+
195
+ ```bash
196
+ euroeval --model my-base-decoder-model --api-base http://localhost:8000 --generative-type base
197
+ ```
198
+
199
+ When using the `Benchmarker` API, the same applies. Here is an example of benchmarking
200
+ an Ollama model hosted locally:
201
+
202
+ ```python
203
+ >>> benchmarker.benchmark(
204
+ ... model="ollama_chat/mymodel",
205
+ ... api_base="http://localhost:11434",
206
+ ... )
207
+ ```
172
208
 
173
209
  ## Benchmarking in an offline environment
174
210
 
@@ -0,0 +1,306 @@
1
+ # 🇧🇦 Bosnian
2
+
3
+ This is an overview of all the datasets used in the Bosnian part of EuroEval. The
4
+ datasets are grouped by their task – see the [task overview](/tasks) for more
5
+ information about what these constitute.
6
+
7
+ ## Sentiment Classification
8
+
9
+ ### MMS-bs
10
+
11
+ This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2306.07902).
12
+ The corpus consists of 79 manually selected datasets from over 350 datasets reported in the
13
+ scientific literature based on strict quality criteria.
14
+
15
+ The original dataset contains a single split with 36,183 Bosnian samples.
16
+ We use 1,024 / 256 / 2,048 samples for our training, validation, and test splits,
17
+ respectively.
18
+ We have employed stratified sampling based on the label column from the original
19
+ dataset to ensure balanced splits.
20
+
21
+ Here are a few examples from the training split:
22
+
23
+ ```json
24
+ {
25
+ "text": "Jaoo kako cjadko, izasla si s momkom ju ar filing loved, o maj gash! Awwww. POV RA CA CU",
26
+ "label": "negative"
27
+ }
28
+ ```
29
+
30
+ ```json
31
+ {
32
+ "text": "@aneldzoko sta se to desava u Neumu?",
33
+ "label": "neutral"
34
+ }
35
+ ```
36
+
37
+ ```json
38
+ {
39
+ "text": "Zasto se inspirator zove inspirator kad se s njim usisava?",
40
+ "label": "neutral"
41
+ }
42
+ ```
43
+
44
+ When evaluating generative models, we use the following setup (see the
45
+ [methodology](/methodology) for more information on how these are used):
46
+
47
+ - Number of few-shot examples: 12
48
+ - Prefix prompt:
49
+
50
+ ```text
51
+ Slijede dokumenti i njihova osjetila, koja mogu biti pozitivno, neutralno ili negativno.
52
+ ```
53
+
54
+ - Base prompt template:
55
+
56
+ ```text
57
+ Dokument: {text}
58
+ Osjetilo: {label}
59
+ ```
60
+
61
+ - Instruction-tuned prompt template:
62
+
63
+ ```text
64
+ Dokument: {text}
65
+
66
+ Klasificirajte osjećaj u dokumentu. Odgovorite samo s pozitivno, neutralno, ili negativno, i ništa drugo.
67
+ ```
68
+
69
+ - Label mapping:
70
+ - `positive` ➡️ `pozitivno`
71
+ - `neutral` ➡️ `neutralno`
72
+ - `negative` ➡️ `negativno`
73
+
74
+ You can evaluate this dataset directly as follows:
75
+
76
+ ```bash
77
+ euroeval --model <model-id> --dataset mms-bs
78
+ ```
79
+
80
+ ## Named Entity Recognition
81
+
82
+ ### WikiANN-bs
83
+
84
+ This dataset was published in [this paper](https://aclanthology.org/P17-1178/) and is
85
+ part of a cross-lingual named entity recognition framework for 282 languages from
86
+ Wikipedia. It uses silver-standard annotations transferred from English through
87
+ cross-lingual links and performs both name tagging and linking to an english Knowledge
88
+ Base.
89
+
90
+ The original full dataset consists of 10,000 / 10,000 / 10,000 samples for the training,
91
+ validation and test splits, respectively. We use 1,024 / 256 / 2,048 samples for our
92
+ training, validation and test splits, respectively. All the new splits are subsets of
93
+ the original splits.
94
+
95
+ Here are a few examples from the training split:
96
+
97
+ ```json
98
+ {
99
+ "tokens": ["Čehoslovačka", ",", "Francuska", ",", "Mađarska", ",", "Meksiko", ",", "Švicarska", ",", "Urugvaj"],
100
+ "labels": ["B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC", "O", "B-LOC"]
101
+ }
102
+ ```
103
+
104
+ ```json
105
+ {
106
+ "tokens": ["godine", ",", "naselje", "je", "ukinuto", "i", "pripojeno", "naselju", "Bribir", "."],
107
+ "labels": ["O", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O"]
108
+ }
109
+ ```
110
+
111
+ ```json
112
+ {
113
+ "tokens": ["Administrativno", "središte", "oblasti", "je", "Tjumenj", "."],
114
+ "labels": ["O", "O", "O", "O", "B-LOC", "O"]
115
+ }
116
+ ```
117
+
118
+ When evaluating generative models, we use the following setup (see the
119
+ [methodology](/methodology) for more information on how these are used):
120
+
121
+ - Number of few-shot examples: 8
122
+ - Prefix prompt:
123
+
124
+ ```text
125
+ Slijede rečenice i JSON riječnici s imenovanim entitetima koji se pojavljuju u rečenicama.
126
+ ```
127
+
128
+ - Base prompt template:
129
+
130
+ ```text
131
+ Rečenica: {text}
132
+ Imenovani entiteti: {label}
133
+ ```
134
+
135
+ - Instruction-tuned prompt template:
136
+
137
+ ```text
138
+ Rečenica: {text}
139
+
140
+ Identificirajte imenovane entitete u rečenici. Prikažite ih kao JSON riječnik s ključevima 'osoba', 'mjesto', 'organizacija' i 'razno'. Vrijednosti trebaju biti popisi imenovanih entiteta navedenog tipa, točno kako se pojavljuju u rečenici.
141
+ ```
142
+
143
+ - Label mapping:
144
+ - `B-PER` ➡️ `osoba`
145
+ - `I-PER` ➡️ `osoba`
146
+ - `B-LOC` ➡️ `mjesto`
147
+ - `I-LOC` ➡️ `mjesto`
148
+ - `B-ORG` ➡️ `organizacija`
149
+ - `I-ORG` ➡️ `organizacija`
150
+ - `B-MISC` ➡️ `razno`
151
+ - `I-MISC` ➡️ `razno`
152
+
153
+ You can evaluate this dataset directly as follows:
154
+
155
+ ```bash
156
+ euroeval --model <model-id> --dataset wikiann-bs
157
+ ```
158
+
159
+ ## Reading Comprehension
160
+
161
+ ### MultiWikiQA-bs
162
+
163
+ This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2509.04111)
164
+ and contains Wikipedia articles with LLM-generated questions and answers in 300+
165
+ languages.
166
+
167
+ The original full dataset consists of 5,000 samples in a single split. We use a 1,024 /
168
+ 256 / 2,048 split for training, validation and testing, respectively, sampled randomly.
169
+
170
+ Here are a few examples from the training split:
171
+
172
+ ```json
173
+ {
174
+ "context": "NGC 3803 (također poznat kao PGC 36204) je eliptična galaksija koja je udaljena oko 164 miliona sg od Zemlje i nalazi se u sazviježđu Lav. Najveći prečnik je 0,40 (19 hiljada sg) a najmanji 0,4 uglovnih minuta (19 hiljada sg). Prvo otkriće je napravio R. J. Mitchell 27. marta 1856. godine.\n\nNajbliži NGC/IC objekti \nSljedeći spisak sadrži deset najbližih NGC/IC objekata.\n\nTakođer pogledajte \n Novi opći katalog\n Spisak NGC objekata\n Spisak galaksija\n\nBilješke \n Prividna magnituda od 15,5 – Apsolutna magnituda: M = m - 5 ((log10 DL) - 1), gdje je m=15,5 i DL=50,4 * 106.\n 0,40 uglovnih minuta – S = A * D * 0,000291 * P, gdje je A=0,40, D=50,4 i P = 3,2616.\n Bazirano na euklidsku udaljenost.\n\nReference\n\nLiteratura\n\nVanjski linkovi\n\nNGC 3803 \n\n NGC 3803 na Aladin pregledaču\n\nNGC katalog \n Interaktivni NGC Online Katalog\n Astronomska baza podataka SIMBAD\n NGC katalog na Messier45.com \n NGC/IC projekt\n NGC2000 na NASA sajtu\n NGC na The Night Sky Atlas sajtu\n\nEliptične galaksije\nLav (sazviježđe)\nNGC objekti\nPGC objekti",
175
+ "question": "Koliki je najmanji kutni promjer NGC 3803 izražen u kutnim minutama?",
176
+ "answers": {
177
+ "answer_start": [158],
178
+ "text": ["0,4"]
179
+ }
180
+ }
181
+ ```
182
+
183
+ ```json
184
+ {
185
+ "context": "Po popisu stanovništva, domaćinstava i stanova 2011. u Srbiji, koji je proveden od 1. do 15. oktobra 2011, u općini Crna Trava živjelo je ukupno 1663 stanovnika, što predstavlja 0,02% od ukupnog broja stanovnika Srbije, odnosno 0,77% od od ukupnog broja stanovnika Jablaničkog okruga. Popis stanovništva provoden je na temelju Zakona o popisu stanovništva, domaćinstava i stanova u 2011. Godini ("Službeni glasnik RS", br. 104/09 i 24/11).\n\nRezultati popisa\n\nNacionalna pripadnost\n\nMaternji jezik\n\nVjeroispovijest\n\nStarosna piramida \nOd ukupnog broja stanovnika u općini Crna Trava bilo je 838 (50,39%) muškaraca i 825 (49,61%) žena, što predstavlja omjer muškaraca i žena 1.016:1000. Prosječna starost stanovništva bila je 53,7 godina, muškaraca 51,4 godina, a žena 56,1 godina. Udio osoba starijih od 18 godina je 91,5% (1.521), kod muškaraca 92,0% (771), a kod žena 90,9% (750).\n\nTakođer pogledajte\n\nNapomene\n\nReference\n\nVanjski linkovi \n Republički zavod za statistiku Srbije \n\nCrna Trava\nCrna Trava",
186
+ "question": "Koliko godina u prosjeku imaju stanovnici općine Crna Trava?",
187
+ "answers": {
188
+ "answer_start": [726],
189
+ "text": ["53,7 godina"]
190
+ }
191
+ }
192
+ ```
193
+
194
+ ```json
195
+ {
196
+ "context": "IC 910 (također poznat kao IRAS 13387+2331, MCG 4-32-25 i PGC 48424) je spiralna galaksija koja je udaljena oko 374 miliona sg od Zemlje i nalazi se u sazviježđu Volar. Najveći prečnik je 0,50 (54 hiljade sg) a najmanji 0,4 uglovnih minuta (44 hiljade sg). Prvo otkriće je napravio Stephane Javelle 16. juna 1892. godine.\n\nNajbliži NGC/IC objekti \nSljedeći spisak sadrži deset najbližih NGC/IC objekata.\n\nTakođer pogledajte \n Novi opći katalog\n Spisak IC objekata\n Spisak galaksija\n\nBilješke \n Prividna magnituda od 14,4 – Apsolutna magnituda: M = m - 5 ((log10 DL) - 1), gdje je m=14,4 i DL=114,6 * 106.\n 0,50 uglovnih minuta – S = A * D * 0,000291 * P, gdje je A=0,50, D=114,6 i P = 3,2616.\n Bazirano na euklidsku udaljenost.\n\nReference\n\nLiteratura\n\nVanjski linkovi\n\nIC 910 \n\n IC 910 na Aladin pregledaču\n\nIC katalog \n Interaktivni NGC Online Katalog\n Astronomska baza podataka SIMBAD\n IC katalog na Messier45.com \n NGC/IC projekt\n NGC2000 na NASA sajtu\n IC na The Night Sky Atlas sajtu\n\nIC objekti\nIRAS objekti\nMCG objekti\nPGC objekti\nSpiralne galaksije\nVolar (sazviježđe)",
197
+ "question": "Kolika je distanca između Zemlje i galaksije IC 910?",
198
+ "answers": {
199
+ "answer_start": [108],
200
+ "text": ["oko 374 miliona sg"]
201
+ }
202
+ }
203
+ ```
204
+
205
+ When evaluating generative models, we use the following setup (see the
206
+ [methodology](/methodology) for more information on how these are used):
207
+
208
+ - Number of few-shot examples: 4
209
+ - Prefix prompt:
210
+
211
+ ```text
212
+ Slijede tekstovi s pitanjima i odgovorima.
213
+ ```
214
+
215
+ - Base prompt template:
216
+
217
+ ```text
218
+ Tekst: {text}
219
+ Pitanje: {question}
220
+ Odgovor s najviše 3 riječi:
221
+ ```
222
+
223
+ - Instruction-tuned prompt template:
224
+
225
+ ```text
226
+ Tekst: {text}
227
+
228
+ Odgovorite na sljedeće pitanje o gornjem tekstu s najviše 3 riječi.
229
+
230
+ Pitanje: {question}
231
+ ```
232
+
233
+ You can evaluate this dataset directly as follows:
234
+
235
+ ```bash
236
+ euroeval --model <model-id> --dataset multi-wiki-qa-bs
237
+ ```
238
+
239
+ ## Summarisation
240
+
241
+ ### LR-Sum-bs
242
+
243
+ This dataset was published in [this paper](https://aclanthology.org/2023.findings-acl.427/).
244
+ The source data is public domain newswire collected from Voice of America websites,
245
+ and the summaries are human-written.
246
+
247
+ The original dataset contains 5,784 / 722 / 723 samples for the training, validation, and
248
+ and test splits, respectively. We use 1,024 / 256 / 2,048 samples for our training,
249
+ validation and test splits, respectively. The train and validation splits are subsets
250
+ of the original splits. For the test split, we use all available test samples and
251
+ supplement with additional samples from the training set to reach 2,048 samples in
252
+ total.
253
+
254
+ Here are a few examples from the training split:
255
+
256
+ ```json
257
+ {
258
+ "text": "Komisija 9/11: američki dužnosnici nisu shvaćali razmjere opasnosti od al-Qaide (23/7/04) - 2004-07-23\n\nKomisija koja je istraživala terorističke napade na Sjedinjene Države 2001. godine ocjenjuje da američki dužnosnici nisu shvaćali razmjere opasnosti koju je predstavljala al-Qaidina mreža. Neovisni panel objavio je svoje zaključke na tiskovnoj konferenciji u Washingtonu. Iznoseći osnovne zaključke izvještaja, predsjedatelj komisije Thomas Kean rekao je da američka vlast nije bila dovoljno aktivna u borbi protiv opasnosti koju je predstavljala al-Qaida. Panel je ocijenio da je u svim dijelovima vlasti bilo propusta glede “razumijevanja, određivanja politike, osposobljenosti i rukovođenja”. Vojska je – kako se navodi – ponudila tek ograničene opcije u vezi s napadima na al-Qaidu, a djelovanje obavještajnih službi bilo je otežano krutim budžetom i birokratskim suparništvom. U izvještaju se navodi da nitko ne može znati jesu li postojale neke mjere koje su mogle onemogućiti napade, ali se dodaje da planove al-Qaide nije ni omelo, niti odgodilo ništa što su poduzele vlade predsjednika Clintona i Busha. Komisija je pozvala na formiranje saveznog centra za kontra-terorizam, na čijem bi čelu bio direktor ministarskog ranga s obavezom da nadzire rad svih američkih obavještajnih službi. Predsjedatelj komisije Thomas Kean je ocijenio da su Sjedinjene Države i dalje sučeljene sa – kako je rekao – “jednim od najvećih sigurnosnih izazova u našoj povijesti.” Obrazlažući potrebu stvaranja ministarskog položaja za obavještajni rad, član komisije Lee Hamilton rekao je da su informacije i odgovornost sada razvučeni po brojnim obavještajnim službama. On se također založio za davanje više ovlasti kongresnim tijelima za nadzor obavještajnih službi. Samo nekoliko sati nakon što je komisija objavila svoje nalaze, predsjednik Bush je rekao da je suglasan sa zaključkom da su teroristi 2001. 
iskoristili duboke institucionalne propuste u obrani zemlje: “Preporuke komisije podudarne su sa strategijom koju moja administracija slijedi u nadilaženju propusta i u borbi do pobjede nad terorizmom.” Predsjednik Bush se još nije službeno obvezao na provedbu bilo koje od komisijinih preporuka, ali je rekao da će one biti pažljivo razmotrene. U izvješću komisije navodi se da nisu pronađeni nikakvi dokazi da je bivši irački predsjednik Saddam Hussein ikada “operativno surađivao” s al-Qaidom. Obavještajni podaci ukazuju na “prijateljske kontakte” Iraka s al-Qaidom prije 11.rujna 2001.godine, ali komisija nije pronašla nikakve dokaze da su Bagdad i al-Qaida surađivali u planiranju i izvršenju napada na Sjedinjene Države. Što se Irana tiče, komisija nije pronašla dokaze da je Teheran bio upoznat s napadima na New York i Washington. No, kako se dodaje, to pitanje treba dalje istraživati. Također se navodi da su iranske vlasti omogućile al-Qaidinim članovima da putuju preko Irana bez da im se u pasoše ubilježi kad su ušli i izašli iz te zemlje.",
259
+ "target_text": "Izvješće komisije navodi se da nisu pronađeni nikakvi dokazi da je Saddam Hussein ikada “operativno surađivao” s al-Qaidom"
260
+ }
261
+ ```
262
+
263
+ ```json
264
+ {
265
+ "text": "Vlada prihvaća odluku Žalbenog vijeća Haškog suda u predmetu Bobetko (29/11/02) - 2002-11-29\n\nVijest da je Žalbeno vijeće Haškog suda odbilo oba podneska hrvatske Vlade u vezi s optužniom protiv generala Janka Bobetka u hrvatskoj pravnoj i političkoj javnosti – ako je suditi po prvim reakcijama – nikoga nije posebno iznenadila. Odvjetnik Goran Mikuličić, pravni savjetnik hrvatske Vlade u odnosima s Haškim sudom, ovako je prokomentirao vijest o odbijanju hrvatskih podnesaka u Haagu. “Naša Vlada prihvaća odluku. Vlada ne polemizira s odlukom i ne komentira odluku jer to je odluka nadležnog suda s kojim nema dalje nikakve pravne rasprave”. Mikuličić je objasnio koji su daljnji koraci Vlade nakon ovakvih vijesti iz Haaga. “Daljnji postupak Vlade će biti objaveštavanje tajništva Haškog tribunala o nalazu liječničkih ekperata koje je angažirao Županijski sud u Zagrebu. Oni su utvrdili da general Bobetko nije sposoban aktivno sudjelovati u postupku, zbog svog lošeg zdravstvenog stanja, i Vlada će posegnuti za odredbama pravila 59., i izvjestiti tajništvo o nemogućnosti udovoljenja zahtjevu zbog objektivnih okolnosti. Osim pravne donosimo i političke reakcije na odluku Žalbenog vijeća u slučaju Bobetko. SDP-ovac Mato Arlović, koji je i predsjednik saborskog Odbora za ustav i poslovnik, kaže da je vladajuća koalicija bila spremna i na povoljnu i na nepovoljnu odluku Žalbenog vijeća Haškog suda. “U tom poledu mislim da je najveća vrijetnost da je haški sud, raspravljajući o prigovorima Republike Hrvatske priznao Hrvatskoj da se može koristiti pravom koje ovi dokumenti daju i da raspravljajući o našim navodima i našim argumentima donio odluku. 
Drugo je pitanje što mi nismo imali dostatne dokaze da svoja stajališta i potvrdimo i da ih Haški sud prihvati.” Iako je Vlada za pravne korake koje je poduzela oko optužnice protiv generala Bobetka imala potporu ne samo stranaka vladajuće koalicije nego i opozicije, oporbene stranke danas izražavaju negodovanje zbog načina na koji je Vlada branila interese haških optuženika, svojih državljana. Predsjednik Hrvatskog bloka, Ivić Pašalić, smatra da je Račanova Vlada od samog početka svog mandata povela pogrešnu politiku prema Haškom sudu. Problem, po njemu, potječe od saborske deklaracije koju je vladajuća koalicija izglasala još u svibnju 2000., a u kojoj je priznala nadležnost haškog suda za akcije “Bljesak” i “Oluja”. “Prema tome riječ je o promašenoj strategiji sadašnje Vlade koja je jednostavno kulminirala dolaskom nekoliko optužnica u kojima se Vlada ponašala različito. U slučaju generala Gotovine nije napravila ništa nego je dala žalbu Carli del Ponte koja ju je ekspresno vratila natrag, a u slučaju optužnice protiv generala Bobetka, pritisnuta reakcijama u parlamentu i javnosti pokušali su nešto napraviti, ali očito pravno i politički loše”. Pašalić, međutim, ne spominje ustavni zakon o suradnji s Haškim sudom koji obvezuje hrvatske vlasti na suradnju sa sudom, a kojeg je 1996. donijela Hrvatska demokratska zajednica, stranka kojoj je i sam tada pripadao.",
266
+ "target_text": "Odbijanje hrvatskih podnesaka nikoga nije posebno iznenadilo u pravnim i političkim krugovima"
267
+ }
268
+ ```
269
+
270
+ ```json
271
+ {
272
+ "text": "Lječnici udvostručavaju napore na promoviranju vakcinacije kao najbolje zaštite protiv H1N1\n\nZemlje zapadne hemisfere su odpočeledistribuirati H1N1 vakcine u okviru obimnog programa imunizacije protiv virusnepandemije svinjske gripe. Roditelji i neki profesionalci su zabrinuti okosigirnosti vakcine, dok neki doktori dovode u sumnju sposobnosti bolnica da senose sa težim slučajevima. Veliki broj ljudi u Sjedinjenim Državamadolazi u klinike za vakcinaciju. Michelle Lowrey ima troje djece itrudna je sa četvrtim: \"Ja imam sve razloge da budemovdje.\" Trudne žene su izložene većem rizikukomplikacija ukoliko se zaraze virusom H1N1. I do sada je najmanje 86 američkedjece umrlo od novog virusa. Katherine Blake brine za svog sina: \"On je u visoko rizičnoj grupi.Kao dijete je imao otvorenu operaciju srca, i jako me je strah da se nezarazi.\" Američki centar za kontrolu bolestije izvjestio da se novi virus prehlade raširio kroz veći dio zemlje. I poredtoga, neki Amerikanci kažu da neće primiti vakcinu. Mi živimo u Sacramentu. Ima nekihslučajeva svinjske gripe, ali ne mnogo, tako da nas to, zaista, nije pogodilo,kaže jedan čovjek na ulici Washingtona. Neke brine koliko je vakcinasigurna, jer je tako brzo proizvedena, i zato što sadrži konzervanse za kojeneki roditelji tvrde da mogu uzrokovati autizam. Dr. Anne Schuchat iz AmeričkogCentra za kontrolu bolesti kaže da je vakcina sigurna i može se dobiti i bezkonzervansa: \"Mi nismo zanemarili sigurnostu proizvodnji ovih vakcina, ili testiranju i nadgledanju ovih vakcina. I veomaje važno da se ovaj proces obavi pažljivo i sigurno.\" Zdravstveni zvaničnici i lječniciudvostručavaju napore na promoviranju vakcinacije kao najbolje zaštite protivH1N1 virusa. Dr. Peter Holbrooke iz Medicinskogcentra za zaštitu djece u Washingtonu kaže da ljudi griješe kada misle da jeova groznica slična običnoj prehladi: \"Veoma je važno da se dobro razmislio vakcini i bolesti koju ona spriječava. To nije blaga, nego značajnabolest.\" Dr. 
Holbrooke kaže da čak i umjerenislučajevi izazivaju ozbiljnu bolest i teži slučajevi mogu ubrzano pogoršatistanje. Doktora Arthura Kellermanna saMedicinskog fakulteta Emory brine gdje smjestiti pacijente koji trebajuintenzivnu njegu: \"Mi trebamo pripremiti našekapacitete za intenzivnu njegu i naš zdravstveni sistem za mogućnost donošenjateških odluka - ko može dobiti intenzivnu njegu, a ko ne može.\" Ukoliko H1N1 se virus nastavirazvijati onim tempom kakvim je krenuo nakon što se pojavio u martu, bolestdostiže vrhunac i počinje da opada za otprilike sedam sedmica. Ako je to tako,moglo bi biti da je ona već na vrhuncu u Sjedinjenim Državama, smatra dr.Holbrooke: \"Ali treba shvatiti da veomalako može usljediti drugi val tokom zime.\" Svi se specijalisti slažu u tome daje izbijanje nove groznice nepredvidljivo. I nema dovoljno vakcine H1N1, čak niu Sjedinjenim Državama. Što se tiče zemalja u razvoju, izSvjetske zdravstvene organizacije kažu da bi za njih medjunarodne donacijevakcine trebale početi stizati za nekoliko sedmica.",
273
+ "target_text": "Zemlje zapadne hemisfere su odpočele distribuirati H1N1 vakcine u okviru obimnog programa imunizacije protiv virusne pandemije svinjske"
274
+ }
275
+ ```
276
+
277
+ When evaluating generative models, we use the following setup (see the
278
+ [methodology](/methodology) for more information on how these are used):
279
+
280
+ - Number of few-shot examples: 1
281
+ - Prefix prompt:
282
+
283
+ ```text
284
+ Slijede dokumenti s priloženim sažecima.
285
+ ```
286
+
287
+ - Base prompt template:
288
+
289
+ ```text
290
+ Dokument: {text}
291
+ Sažetak: {target_text}
292
+ ```
293
+
294
+ - Instruction-tuned prompt template:
295
+
296
+ ```text
297
+ Dokument: {text}
298
+
299
+ Napišite sažetak gornjeg dokumenta.
300
+ ```
301
+
302
+ You can evaluate this dataset directly as follows:
303
+
304
+ ```bash
305
+ euroeval --model <model-id> --dataset lr-sum-bs
306
+ ```
@@ -507,6 +507,8 @@ When evaluating generative models, we use the following setup (see the
507
507
  Vastusevariandid:
508
508
  a. {option_a}
509
509
  b. {option_b}
510
+ (...)
511
+ o. {option_o}
510
512
  Vastus: {label}
511
513
  ```
512
514
 
@@ -1,6 +1,6 @@
1
- # 🇸🇮 Slovenian
1
+ # 🇸🇮 Slovene
2
2
 
3
- This is an overview of all the datasets used in the Slovenian part of EuroEval. The
3
+ This is an overview of all the datasets used in the Slovene part of EuroEval. The
4
4
  datasets are grouped by their task - see the [task overview](/tasks) for more
5
5
  information about what these constitute.
6
6
 
@@ -82,7 +82,7 @@ This dataset was published in
82
82
  [this paper](https://nl.ijs.si/jtdh20/pdf/JT-DH_2020_Krek-et-al_The-ssj500k-Training-Corpus-for-Slovene-Language-Processing.pdf),
83
83
  and consists of a collection of text samples from the
84
84
  [FidaPLUS](https://www.sketchengine.eu/fida-plus-corpus/) corpus of written
85
- modern Slovenian.
85
+ modern Slovene.
86
86
 
87
87
  The original dataset consists of 9,489 samples. We use 1,024 / 256 / 2,048
88
88
  samples for our training, validation and test splits, respectively.
@@ -156,8 +156,8 @@ euroeval --model <model-id> --dataset ssj500k-ner
156
156
  ### ScaLA-sl
157
157
 
158
158
  This dataset was published in [this paper](https://aclanthology.org/2023.nodalida-1.20/)
159
- and was automatically created from the [Slovenian Universal Dependencies
160
- treebank](https://github.com/UniversalDependencies/UD_Slovenian-SSJ) by assuming that the
159
+ and was automatically created from the [Slovene Universal Dependencies
160
+ treebank](https://github.com/UniversalDependencies/UD_Slovenian-SSJ) by assuming that the
161
161
  documents in the treebank are correct, and corrupting the samples to create
162
162
  grammatically incorrect samples. The corruptions were done by either removing a word
163
163
  from a sentence, or by swapping two neighbouring words in a sentence. To ensure that
@@ -314,7 +314,7 @@ This dataset was published in [this paper](https://doi.org/10.48550/arXiv.2410.0
314
314
  and is a machine translated version of the English [MMLU
315
315
  dataset](https://openreview.net/forum?id=d7KBjmI3GmQ) and features questions within 57
316
316
  different topics, such as elementary mathematics, US history and law. The translation to
317
- Slovenian was done using DeepL.
317
+ Slovene was done using DeepL.
318
318
 
319
319
  The original full dataset consists of 285 / 1,531 / 14,042 samples for training,
320
320
  validation, and testing, respectively. These splits were merged, duplicates removed, and