ScandEval 16.7.0__tar.gz → 16.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (365)
  1. {scandeval-16.7.0 → scandeval-16.8.0}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +4 -0
  2. {scandeval-16.7.0 → scandeval-16.8.0}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +3 -3
  3. {scandeval-16.7.0 → scandeval-16.8.0}/.pre-commit-config.yaml +8 -12
  4. scandeval-16.8.0/AGENTS.md +121 -0
  5. {scandeval-16.7.0 → scandeval-16.8.0}/CHANGELOG.md +64 -7
  6. {scandeval-16.7.0 → scandeval-16.8.0}/PKG-INFO +15 -2
  7. {scandeval-16.7.0 → scandeval-16.8.0}/README.md +14 -1
  8. scandeval-16.8.0/docs/datasets/bosnian.md +306 -0
  9. scandeval-16.8.0/docs/datasets/catalan.md +536 -0
  10. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/estonian.md +2 -0
  11. scandeval-16.8.0/docs/datasets/hungarian.md +522 -0
  12. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/lithuanian.md +91 -1
  13. scandeval-16.8.0/docs/datasets/romanian.md +524 -0
  14. scandeval-16.7.0/docs/datasets/slovenian.md → scandeval-16.8.0/docs/datasets/slovene.md +6 -6
  15. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/spanish.md +77 -5
  16. scandeval-16.8.0/docs/leaderboards/Monolingual/bulgarian.md +26 -0
  17. scandeval-16.8.0/docs/leaderboards/Monolingual/croatian.md +26 -0
  18. scandeval-16.8.0/docs/leaderboards/Monolingual/greek.md +26 -0
  19. scandeval-16.8.0/docs/leaderboards/Monolingual/serbian.md +26 -0
  20. scandeval-16.8.0/docs/leaderboards/Monolingual/slovene.md +26 -0
  21. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/slavic.md +1 -1
  22. {scandeval-16.7.0 → scandeval-16.8.0}/docs/methodology.md +0 -2
  23. {scandeval-16.7.0 → scandeval-16.8.0}/makefile +1 -1
  24. {scandeval-16.7.0 → scandeval-16.8.0}/pyproject.toml +1 -3
  25. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/__init__.py +3 -0
  26. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_config_factory.py +8 -1
  27. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/base.py +8 -4
  28. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/fresh.py +4 -4
  29. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/hf.py +23 -9
  30. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/litellm.py +69 -45
  31. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/vllm.py +33 -28
  32. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmarker.py +26 -6
  33. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/cli.py +11 -0
  34. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/data_loading.py +4 -4
  35. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/data_models.py +9 -4
  36. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/__init__.py +17 -5
  37. scandeval-16.8.0/src/scandeval/dataset_configs/bosnian.py +39 -0
  38. scandeval-16.8.0/src/scandeval/dataset_configs/catalan.py +64 -0
  39. scandeval-16.8.0/src/scandeval/dataset_configs/hungarian.py +64 -0
  40. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/lithuanian.py +15 -4
  41. scandeval-16.8.0/src/scandeval/dataset_configs/romanian.py +65 -0
  42. scandeval-16.7.0/src/scandeval/dataset_configs/slovenian.py → scandeval-16.8.0/src/scandeval/dataset_configs/slovene.py +8 -8
  43. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/spanish.py +9 -0
  44. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/finetuning.py +5 -4
  45. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/generation.py +6 -1
  46. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/generation_utils.py +30 -1
  47. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/languages.py +2 -2
  48. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/logging_utils.py +5 -3
  49. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/llm_as_a_judge.py +5 -1
  50. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/pipeline.py +2 -2
  51. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/model_cache.py +30 -5
  52. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/model_config.py +2 -2
  53. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/model_loading.py +1 -1
  54. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/linguistic_acceptability.py +40 -27
  55. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/multiple_choice.py +51 -42
  56. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/named_entity_recognition.py +96 -24
  57. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/reading_comprehension.py +73 -68
  58. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/sentiment_classification.py +64 -35
  59. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/summarization.py +47 -19
  60. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/multiple_choice_classification.py +8 -6
  61. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/tokenisation_utils.py +14 -20
  62. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/types.py +3 -0
  63. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/utils.py +24 -15
  64. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/constants.py +3 -0
  65. scandeval-16.8.0/src/scripts/create_atsiliepimai.py +78 -0
  66. scandeval-16.8.0/src/scripts/create_dacsa.py +113 -0
  67. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_global_mmlu.py +1 -1
  68. scandeval-16.8.0/src/scripts/create_guia_cat.py +116 -0
  69. scandeval-16.8.0/src/scripts/create_hun_sum.py +223 -0
  70. scandeval-16.8.0/src/scripts/create_husst.py +119 -0
  71. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_lr_sum.py +1 -1
  72. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mmlu.py +2 -0
  73. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mms.py +1 -1
  74. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_multi_wiki_qa.py +4 -0
  75. scandeval-16.8.0/src/scripts/create_ronec.py +160 -0
  76. scandeval-16.8.0/src/scripts/create_rosent.py +215 -0
  77. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_scala.py +6 -0
  78. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sentinews.py +2 -2
  79. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_ssj500k_ner.py +3 -3
  80. scandeval-16.8.0/src/scripts/create_sumo_ro.py +131 -0
  81. scandeval-16.8.0/src/scripts/create_szeged_ner.py +169 -0
  82. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_wikiann.py +1 -1
  83. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_winogrande.py +3 -0
  84. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/load_ud_pos.py +69 -15
  85. {scandeval-16.7.0 → scandeval-16.8.0}/tests/conftest.py +15 -3
  86. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_benchmark_config_factory.py +25 -5
  87. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_benchmarker.py +25 -3
  88. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_callbacks.py +1 -1
  89. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_cli.py +3 -1
  90. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_constants.py +1 -1
  91. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_data_loading.py +8 -2
  92. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_data_models.py +3 -2
  93. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_dataset_configs.py +9 -3
  94. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_finetuning.py +2 -2
  95. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_model_loading.py +12 -3
  96. {scandeval-16.7.0 → scandeval-16.8.0}/uv.lock +3 -51
  97. {scandeval-16.7.0 → scandeval-16.8.0}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  98. {scandeval-16.7.0 → scandeval-16.8.0}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  99. {scandeval-16.7.0 → scandeval-16.8.0}/.github/ISSUE_TEMPLATE/language_request.yaml +0 -0
  100. {scandeval-16.7.0 → scandeval-16.8.0}/.github/workflows/ci.yaml +0 -0
  101. {scandeval-16.7.0 → scandeval-16.8.0}/.gitignore +0 -0
  102. {scandeval-16.7.0 → scandeval-16.8.0}/.markdownlint.jsonc +0 -0
  103. {scandeval-16.7.0 → scandeval-16.8.0}/CITATION.cff +0 -0
  104. {scandeval-16.7.0 → scandeval-16.8.0}/CODE_OF_CONDUCT.md +0 -0
  105. {scandeval-16.7.0 → scandeval-16.8.0}/CONTRIBUTING.md +0 -0
  106. {scandeval-16.7.0 → scandeval-16.8.0}/Dockerfile.cuda +0 -0
  107. {scandeval-16.7.0 → scandeval-16.8.0}/LICENSE +0 -0
  108. {scandeval-16.7.0 → scandeval-16.8.0}/NEW_DATASET_GUIDE.md +0 -0
  109. {scandeval-16.7.0 → scandeval-16.8.0}/docs/CNAME +0 -0
  110. {scandeval-16.7.0 → scandeval-16.8.0}/docs/README.md +0 -0
  111. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/README.md +0 -0
  112. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/bulgarian.md +0 -0
  113. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/croatian.md +0 -0
  114. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/czech.md +0 -0
  115. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/danish.md +0 -0
  116. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/dutch.md +0 -0
  117. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/english.md +0 -0
  118. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/faroese.md +0 -0
  119. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/finnish.md +0 -0
  120. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/french.md +0 -0
  121. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/german.md +0 -0
  122. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/greek.md +0 -0
  123. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/icelandic.md +0 -0
  124. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/italian.md +0 -0
  125. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/latvian.md +0 -0
  126. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/norwegian.md +0 -0
  127. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/polish.md +0 -0
  128. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/portuguese.md +0 -0
  129. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/serbian.md +0 -0
  130. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/slovak.md +0 -0
  131. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/swedish.md +0 -0
  132. {scandeval-16.7.0 → scandeval-16.8.0}/docs/datasets/ukrainian.md +0 -0
  133. {scandeval-16.7.0 → scandeval-16.8.0}/docs/extras/radial_plotter.md +0 -0
  134. {scandeval-16.7.0 → scandeval-16.8.0}/docs/faq.md +0 -0
  135. {scandeval-16.7.0 → scandeval-16.8.0}/docs/gfx/favicon.png +0 -0
  136. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/czech.md +0 -0
  137. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/danish.md +0 -0
  138. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/dutch.md +0 -0
  139. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/english.md +0 -0
  140. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/estonian.md +0 -0
  141. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/faroese.md +0 -0
  142. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/finnish.md +0 -0
  143. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/french.md +0 -0
  144. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/german.md +0 -0
  145. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  146. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/italian.md +0 -0
  147. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/latvian.md +0 -0
  148. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/lithuanian.md +0 -0
  149. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  150. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/polish.md +0 -0
  151. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/portuguese.md +0 -0
  152. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/slovak.md +0 -0
  153. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/spanish.md +0 -0
  154. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/swedish.md +0 -0
  155. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Monolingual/ukrainian.md +0 -0
  156. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/baltic.md +0 -0
  157. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/european.md +0 -0
  158. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/finnic.md +0 -0
  159. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/germanic.md +0 -0
  160. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  161. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/Multilingual/romance.md +0 -0
  162. {scandeval-16.7.0 → scandeval-16.8.0}/docs/leaderboards/README.md +0 -0
  163. {scandeval-16.7.0 → scandeval-16.8.0}/docs/python-package.md +0 -0
  164. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/README.md +0 -0
  165. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/common-sense-reasoning.md +0 -0
  166. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/knowledge.md +0 -0
  167. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/linguistic-acceptability.md +0 -0
  168. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/named-entity-recognition.md +0 -0
  169. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/reading-comprehension.md +0 -0
  170. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/sentiment-classification.md +0 -0
  171. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/speed.md +0 -0
  172. {scandeval-16.7.0 → scandeval-16.8.0}/docs/tasks/summarization.md +0 -0
  173. {scandeval-16.7.0 → scandeval-16.8.0}/gfx/euroeval.png +0 -0
  174. {scandeval-16.7.0 → scandeval-16.8.0}/gfx/euroeval.xcf +0 -0
  175. {scandeval-16.7.0 → scandeval-16.8.0}/gfx/scandeval.png +0 -0
  176. {scandeval-16.7.0 → scandeval-16.8.0}/mkdocs.yaml +0 -0
  177. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/benchmark_modules/__init__.py +0 -0
  178. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/caching_utils.py +0 -0
  179. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/callbacks.py +0 -0
  180. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/constants.py +0 -0
  181. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/bulgarian.py +0 -0
  182. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/croatian.py +0 -0
  183. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/czech.py +0 -0
  184. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/danish.py +0 -0
  185. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/dutch.py +0 -0
  186. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/english.py +0 -0
  187. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/estonian.py +0 -0
  188. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/faroese.py +0 -0
  189. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/finnish.py +0 -0
  190. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/french.py +0 -0
  191. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/german.py +0 -0
  192. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/greek.py +0 -0
  193. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/icelandic.py +0 -0
  194. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/italian.py +0 -0
  195. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/latvian.py +0 -0
  196. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/norwegian.py +0 -0
  197. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/polish.py +0 -0
  198. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/portuguese.py +0 -0
  199. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/serbian.py +0 -0
  200. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/slovak.py +0 -0
  201. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/swedish.py +0 -0
  202. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/dataset_configs/ukrainian.py +0 -0
  203. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/enums.py +0 -0
  204. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/exceptions.py +0 -0
  205. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/__init__.py +0 -0
  206. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/base.py +0 -0
  207. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/huggingface.py +0 -0
  208. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/metrics/speed.py +0 -0
  209. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/__init__.py +0 -0
  210. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/classification.py +0 -0
  211. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/prompt_templates/token_classification.py +0 -0
  212. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/scores.py +0 -0
  213. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/speed_benchmark.py +0 -0
  214. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/__init__.py +0 -0
  215. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/question_answering.py +0 -0
  216. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/sequence_classification.py +0 -0
  217. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/text_to_text.py +0 -0
  218. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/task_group_utils/token_classification.py +0 -0
  219. {scandeval-16.7.0 → scandeval-16.8.0}/src/scandeval/tasks.py +0 -0
  220. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/__init__.py +0 -0
  221. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_allocine.py +0 -0
  222. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_angry_tweets.py +0 -0
  223. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_arc.py +0 -0
  224. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_arc_is.py +0 -0
  225. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_belebele.py +0 -0
  226. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_bg_ner_bsnlp.py +0 -0
  227. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_boolq_pt.py +0 -0
  228. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_cinexio.py +0 -0
  229. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_cnn_dailymail.py +0 -0
  230. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_conll_en.py +0 -0
  231. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_conll_es.py +0 -0
  232. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_conll_nl.py +0 -0
  233. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_copa_lv.py +0 -0
  234. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_cross_domain_uk_reviews.py +0 -0
  235. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_cs_gec.py +0 -0
  236. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_csfd_sentiment.py +0 -0
  237. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_csfd_sentiment_sk.py +0 -0
  238. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_czech_news.py +0 -0
  239. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_dane.py +0 -0
  240. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_danish_citizen_tests.py +0 -0
  241. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_dansk.py +0 -0
  242. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_danske_talemaader.py +0 -0
  243. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_danske_talemaader_old.py +0 -0
  244. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_dbrd.py +0 -0
  245. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_dutch_cola.py +0 -0
  246. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_elner.py +0 -0
  247. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_eltec.py +0 -0
  248. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_err_news.py +0 -0
  249. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_estner.py +0 -0
  250. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_estonian_valence.py +0 -0
  251. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_european_values.py +0 -0
  252. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_exam_et.py +0 -0
  253. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_exams_bg.py +0 -0
  254. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_fone.py +0 -0
  255. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_foqa.py +0 -0
  256. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_fosent.py +0 -0
  257. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_fquad.py +0 -0
  258. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_fullstack_ner.py +0 -0
  259. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_germanquad.py +0 -0
  260. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_germeval.py +0 -0
  261. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_goldenswag.py +0 -0
  262. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_grammar_et.py +0 -0
  263. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_greek_sa.py +0 -0
  264. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_greek_wikipedia.py +0 -0
  265. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_harem.py +0 -0
  266. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_hellaswag.py +0 -0
  267. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_hellaswag_cs.py +0 -0
  268. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_hellaswag_fi.py +0 -0
  269. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  270. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_ice_linguistic.py +0 -0
  271. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_icelandic_error_corpus.py +0 -0
  272. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_icelandic_knowledge.py +0 -0
  273. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_icelandic_qa.py +0 -0
  274. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_icesum.py +0 -0
  275. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_idioms_no.py +0 -0
  276. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_ilpost_sum.py +0 -0
  277. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_jentoft.py +0 -0
  278. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_kpwr_ner.py +0 -0
  279. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_latvian_lsm_summary.py +0 -0
  280. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_latvian_twitter_sentiment.py +0 -0
  281. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_life_in_the_uk.py +0 -0
  282. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_lithuanian_lrytas_summarization.py +0 -0
  283. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_llmzszl.py +0 -0
  284. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_lt_emotions.py +0 -0
  285. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_lt_history.py +0 -0
  286. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mim_gold_ner.py +0 -0
  287. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mlqa_es.py +0 -0
  288. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mlsum_de.py +0 -0
  289. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mlsum_es.py +0 -0
  290. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mmlu_et.py +0 -0
  291. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mmlu_hr.py +0 -0
  292. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_mmlu_lv.py +0 -0
  293. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_multinerd-it.py +0 -0
  294. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_ner_uk.py +0 -0
  295. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_no_cola.py +0 -0
  296. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_no_sammendrag.py +0 -0
  297. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_nor_common_sense_qa.py +0 -0
  298. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_nordjylland_news.py +0 -0
  299. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_norec.py +0 -0
  300. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_norglm_multiqa.py +0 -0
  301. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_norglm_multisum.py +0 -0
  302. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_norne.py +0 -0
  303. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_norquad.py +0 -0
  304. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_nqii.py +0 -0
  305. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_nrk_quiz_qa.py +0 -0
  306. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_orange_sum.py +0 -0
  307. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_personal_sum.py +0 -0
  308. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_polemo2.py +0 -0
  309. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_poner.py +0 -0
  310. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_poquad.py +0 -0
  311. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_psc.py +0 -0
  312. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_publico.py +0 -0
  313. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_rrn.py +0 -0
  314. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sb10k.py +0 -0
  315. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_scandiqa.py +0 -0
  316. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_scandisent_fi.py +0 -0
  317. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_schibsted.py +0 -0
  318. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sentiment_headlines_es.py +0 -0
  319. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sentipolc16.py +0 -0
  320. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_skolprov.py +0 -0
  321. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sqad.py +0 -0
  322. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_squad.py +0 -0
  323. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_squad_it.py +0 -0
  324. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_squad_nl.py +0 -0
  325. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_squad_nl_old.py +0 -0
  326. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sst2_pt.py +0 -0
  327. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_sst5.py +0 -0
  328. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_suc3.py +0 -0
  329. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_swedn.py +0 -0
  330. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_swerec.py +0 -0
  331. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_trivia_et.py +0 -0
  332. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_turku_ner_fi.py +0 -0
  333. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_tydiqa_fi.py +0 -0
  334. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_umimeto_qa.py +0 -0
  335. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_uner_sk.py +0 -0
  336. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_uner_sr.py +0 -0
  337. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_wiki_lingua_nl.py +0 -0
  338. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_wikineural-it.py +0 -0
  339. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_winogrande_et.py +0 -0
  340. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_winogrande_is.py +0 -0
  341. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_xlsum_fi.py +0 -0
  342. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/create_xquad.py +0 -0
  343. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/fix_dot_env_file.py +0 -0
  344. {scandeval-16.7.0 → scandeval-16.8.0}/src/scripts/versioning.py +0 -0
  345. {scandeval-16.7.0 → scandeval-16.8.0}/tests/__init__.py +0 -0
  346. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_benchmark_modules/__init__.py +0 -0
  347. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_benchmark_modules/test_hf.py +0 -0
  348. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_enums.py +0 -0
  349. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_exceptions.py +0 -0
  350. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_languages.py +0 -0
  351. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_model_config.py +0 -0
  352. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scores.py +0 -0
  353. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/__init__.py +0 -0
  354. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/__init__.py +0 -0
  355. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_create_scala.py +0 -0
  356. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/de_gsd-ud-train.conllu.adp_det +0 -0
  357. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/empty.file +0 -0
  358. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/en_gum-ud-train.conllu.case +0 -0
  359. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_01 +0 -0
  360. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_02 +0 -0
  361. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_scripts/test_create_scala/test_data/pl_pdb-ud-train.conllu.aux_clitic_03 +0 -0
  362. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_speed_benchmark.py +0 -0
  363. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_tokenisation_utils.py +0 -0
  364. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_types.py +0 -0
  365. {scandeval-16.7.0 → scandeval-16.8.0}/tests/test_utils.py +0 -0
@@ -25,6 +25,8 @@ body:
25
25
  description: What languages is the dataset in?
26
26
  options:
27
27
  - label: Bulgarian
28
+ - label: Bosnian
29
+ - label: Catalan
28
30
  - label: Croatian
29
31
  - label: Czech
30
32
  - label: Danish
@@ -36,6 +38,7 @@ body:
36
38
  - label: French
37
39
  - label: German
38
40
  - label: Greek
41
+ - label: Hungarian
39
42
  - label: Icelandic
40
43
  - label: Italian
41
44
  - label: Latvian
@@ -43,6 +46,7 @@ body:
43
46
  - label: Norwegian (Bokmål or Nynorsk)
44
47
  - label: Polish
45
48
  - label: Portuguese
49
+ - label: Romanian
46
50
  - label: Serbian
47
51
  - label: Slovak
48
52
  - label: Slovenian
@@ -20,10 +20,10 @@ body:
20
20
  options:
21
21
  - label: Baltic languages (Latvian, Lithuanian)
22
22
  - label: Finnic languages (Estonian, Finnish)
23
- - label: Hellenic languages (Greek)
24
- - label: Romance languages (French, Italian, Portuguese, Spanish)
23
+ - label: Greek
24
+ - label: Romance languages (Catalan, French, Italian, Portuguese, Romanian, Spanish)
25
25
  - label: Scandinavian languages (Danish, Faroese, Icelandic, Norwegian, Swedish)
26
- - label: Slavic languages (Bulgarian, Croatian, Czech, Polish, Serbian, Slovak, Slovenian, Ukrainian)
26
+ - label: Slavic languages (Bulgarian, Bosnian, Croatian, Czech, Hungarian, Polish, Serbian, Slovak, Slovenian, Ukrainian)
27
27
  - label: West Germanic languages (Dutch, English, German)
28
28
  validations:
29
29
  required: true
@@ -10,7 +10,7 @@ repos:
10
10
  - id: trailing-whitespace
11
11
  - id: debug-statements
12
12
  - repo: https://github.com/astral-sh/ruff-pre-commit
13
- rev: v0.14.4
13
+ rev: v0.14.6
14
14
  hooks:
15
15
  - id: ruff
16
16
  args:
@@ -30,21 +30,17 @@ repos:
30
30
  - pyi
31
31
  - jupyter
32
32
  - repo: https://github.com/kynan/nbstripout
33
- rev: 0.8.1
33
+ rev: 0.8.2
34
34
  hooks:
35
35
  - id: nbstripout
36
- - repo: https://github.com/pre-commit/mirrors-mypy
37
- rev: v1.18.2
36
+ - repo: https://github.com/facebook/pyrefly-pre-commit
37
+ rev: 0.0.1
38
38
  hooks:
39
- - id: mypy
40
- args:
41
- - --install-types
42
- - --non-interactive
43
- - --ignore-missing-imports
44
- - --show-error-codes
45
- - --check-untyped-defs
39
+ - id: pyrefly-typecheck-system
40
+ name: Pyrefly (type checking)
41
+ pass_filenames: true
46
42
  - repo: https://github.com/DavidAnson/markdownlint-cli2
47
- rev: v0.18.1
43
+ rev: v0.19.1
48
44
  hooks:
49
45
  - id: markdownlint-cli2
50
46
  args:
@@ -0,0 +1,121 @@
1
+ # Python Project Conventions
2
+
3
+ ## Development Workflow
4
+
5
+ ### Tool Execution
6
+
7
+ - Use `uv run` for all script and command execution
8
+ - Always run formatters and linters before committing:
9
+
10
+ ```bash
11
+ uv run ruff format
12
+ uv run ruff check --fix
13
+ ```
14
+
15
+ - Verify all tests pass with `uv run pytest`
16
+ - If tests fail, fix them before proceeding with other changes
17
+
18
+ ## Code Style
19
+
20
+ ### Documentation
21
+
22
+ - Use Google-style docstrings for all public functions, classes, and modules
23
+ - Always include a newline after the name of each argument and exception in the
24
+ docstring.
25
+ - Avoid tutorial-style `#` comments that explain what code does
26
+ - Comments should explain **why**, not **what** (the code itself should be
27
+ self-explanatory)
28
+ - Example:
29
+
30
+ ```python
31
+ async def process_items(items: list[Item]) -> list[Result]:
32
+ """Process items and return results.
33
+
34
+ Args:
35
+ items:
36
+ List of items to process.
37
+
38
+ Returns:
39
+ List of processed results.
40
+
41
+ Raises:
42
+ ValueError:
43
+ If items list is empty.
44
+ """
45
+ return await batch_process(items)
46
+ ```
47
+
48
+ ### Type Annotations
49
+
50
+ - Fully type-annotate all functions, methods, and variables
51
+ - Target Python 3.12+ syntax:
52
+ - Use `list[T]`, `dict[K, V]`, `set[T]` (not `List`, `Dict`, `Set` from typing)
53
+ - Use `X | Y` for unions (not `Union[X, Y]`)
54
+ - Use `X | None` for optional types (not `Optional[X]`)
55
+ - Always use `import typing as t` and use the `t.` prefix for types from the typing
56
+ module, such as `t.Any`, `t.Callable`, `t.TypeVar`, etc.
57
+ - Example:
58
+
59
+ ```python
60
+ def fetch_data(url: str, timeout: float = 30.0) -> dict[str, t.Any] | None:
61
+ ...
62
+ ```
63
+
64
+ ### Programming Paradigm
65
+
66
+ - Prefer functional programming patterns over OOP when appropriate
67
+ - Use the best tool for the job (don't force FP or OOP dogmatically)
68
+ - Favor immutability and pure functions where practical
69
+ - Prefer composition over inheritance
70
+ - Use dataclasses or Pydantic models for data structures
71
+
72
+ ### Performance
73
+
74
+ - Write code with performance in mind
75
+ - Profile before optimising
76
+ - Use appropriate data structures (sets for membership, deques for queues, etc.)
77
+ - Leverage list/dict/set comprehensions over explicit loops when clearer
78
+ - Consider generators for memory efficiency with large datasets
79
+
80
+ ## Testing
81
+
82
+ ### Test Execution
83
+
84
+ - Run tests with `uv run pytest`
85
+ - All tests must pass before pushing code
86
+ - Fix broken tests immediately—do not commit failing tests
87
+
88
+ ### Test Style
89
+
90
+ - Follow the same conventions as production code
91
+ - Use descriptive test names that explain the scenario
92
+ - Example:
93
+
94
+ ```python
95
+ async def test_fetch_data_returns_valid_json() -> None:
96
+ """Test that fetch_data returns properly formatted JSON."""
97
+ result = await fetch_data("https://api.example.com/data")
98
+ assert isinstance(result, dict)
99
+ assert "id" in result
100
+ ```
101
+
102
+ ## Code Organisation
103
+
104
+ ### Module Structure
105
+
106
+ - Keep modules focused and cohesive
107
+ - Prefer many small modules over few large ones
108
+ - Use clear, descriptive names
109
+ - Organise imports: stdlib, third-party, local (separated by blank lines)
110
+
111
+ ### Function Design
112
+
113
+ - Keep functions small and single-purpose
114
+ - Use descriptive names (prefer `calculate_total_price` over `calc`)
115
+ - Limit arguments (consider using dataclasses for many parameters)
116
+ - Return early to reduce nesting
117
+
118
+ ## Summary
119
+
120
+ **Remember:** Write code that is clear, fast, and well-typed. Let the code speak for
121
+ itself with minimal comments. Run formatters, linters, and tests before committing.
@@ -7,10 +7,66 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [v16.8.0] - 2025-11-25
11
+
12
+ ### Added
13
+
14
+ - Added support for Romanian 🇷🇴! This includes the sentiment classification dataset
15
+ RoSent, the linguistic acceptability dataset ScaLA-ro, the named entity recognition
16
+ dataset RoNEC, the reading comprehension dataset MultiWikiQA-ro, the summarisation
17
+ dataset SumO-Ro, the knowledge dataset Global-MMLU-ro, and the common-sense
18
+ reasoning dataset Winogrande-ro. This was contributed by @oliverkinch ✨
19
+ - Added support for Hungarian 🇭🇺! This includes the sentiment classification dataset
20
+ HuSST, the linguistic acceptability dataset ScaLA-hu, the named entity recognition
21
+ dataset SzegedNER, the reading comprehension dataset MultiWikiQA-hu, the
22
+ summarisation dataset HunSum, the knowledge dataset MMLU-hu, and the common-sense
23
+ reasoning dataset Winogrande-hu. This was contributed by @oliverkinch ✨
24
+ - Added support for Catalan! This includes the sentiment classification dataset
25
+ GuiaCat, the linguistic acceptability dataset ScaLA-ca, the named entity recognition
26
+ dataset WikiANN-ca, the reading comprehension dataset MultiWikiQA-ca, the summarisation
27
+ dataset DACSA-ca, the knowledge dataset MMLU-ca, and the common-sense reasoning dataset
28
+ Winogrande-ca. This was contributed by @oliverkinch ✨
29
+ - Added Spanish summarisation dataset DACSA-es as an unofficial dataset.
30
+ - Added Lithuanian sentiment classification dataset Atsiliepimai to replace the now
31
+ unofficial Lithuanian Emotions dataset. This was contributed by @oliverkinch ✨
32
+ - Added new `--custom-datasets-file` (`custom_datasets_file` in the `Benchmarker` API)
33
+ argument, which can be used to specify a custom Python file containing custom dataset
34
+ definitions. It defaults to `custom_datasets.py` in the current working directory.
35
+
36
+ ### Changed
37
+
38
+ - When evaluating models with the `--debug` flag (`debug=True` in the `Benchmarker`
39
+ API), we now include the full model inputs and outputs in the JSON file stored to the
40
+ current working directory, where we previously only included the model outputs.
41
+
42
+ ### Fixed
43
+
44
+ - When encountering rate limits for API inference models, we ended up waiting 10 seconds
45
+ for each request, which was unnecessarily long. We now only wait 10 seconds for each
46
+ batch of requests.
47
+ - Uses the `FLASH_ATTN` backend with vLLM for Gemma-3-1b and Gemma-3n models now and the
48
+ `TRITON_ATTN` backend with the other Gemma-3 models, as their architecture is currently not
49
+ supported by the default `FLASHINFER` backend. Note that this can always be changed
50
+ manually with the `VLLM_ATTENTION_BACKEND` environment variable.
51
+
52
+ ## [v16.7.1] - 2025-11-18
53
+
54
+ ### Fixed
55
+
56
+ - The `--no-progress-bar` argument (`progress_bar=False` in the `Benchmarker` API) was
57
+ not hiding all the progress bars for generative models. This has been fixed now.
58
+ - Now respects the revision when loading tokenizers with vLLM models. I.e., if
59
+ evaluating a model `<model_id>@<revision>` then we also load the tokenizer from the
60
+ `<revision>` branch.
61
+
10
62
  ## [v16.7.0] - 2025-11-10
11
63
 
12
64
  ### Added
13
65
 
66
+ - Added support for Bosnian 🇧🇦! This includes the sentiment classification dataset
67
+ MMS-bs, the named entity recognition dataset WikiANN-bs, the reading comprehension
68
+ dataset MultiWikiQA-bs, and the summarisation dataset LR-Sum-bs. This was contributed
69
+ by @oliverkinch ✨
14
70
  - Now allows the 'low', 'medium' and 'high' reasoning effort parameters for the GPT-OSS
15
71
  models, which can be set by appending `#low`, `#medium` or `#high` to the model ID.
16
72
 
@@ -52,7 +108,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
52
108
  - Added support for Croatian 🇭🇷! This includes the sentiment classification dataset
53
109
  MMS-hr, the linguistic acceptability dataset ScaLA-hr, the named entity recognition
54
110
  dataset WikiANN-hr, the reading comprehension dataset MultiWikiQA-hr, the knowledge
55
- dataset MMLU-hr, and the common-sense reasoning dataset Winogrande-hr.
111
+ dataset MMLU-hr, and the common-sense reasoning dataset Winogrande-hr. This was
112
+ contributed by @oliverkinch ✨
56
113
  - Added a system dependency check for `nvcc` in the `VLLMModel.__init__` method to
57
114
  ensure the CUDA Toolkit is installed. Raises an error with installation instructions
58
115
  if NVCC is not available in the system PATH.
@@ -73,11 +130,6 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
73
130
 
74
131
  ### Added
75
132
 
76
- - Added support for Slovenian 🇸🇮! This includes the sentiment classification dataset
77
- Sentinews, the linguistic acceptability dataset ScaLA-sl, the named entity recognition
78
- dataset ssj500k-NER, the reading comprehension
79
- dataset MultiWikiQA-sl, the knowledge dataset MMLU-sl, and the common-sense reasoning
80
- dataset Winogrande-sl.
81
133
  - Added better support for evaluating on custom datasets, by allowing `DatasetConfig`
82
134
  objects directly in the `Benchmarker.benchmark` method. We also support custom
83
135
  datasets with the CLI, by simply defining the desired `DatasetConfig`s in a
@@ -86,6 +138,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
86
138
  with the new `source` argument. This argument can both be the Hugging Face Hub ID of
87
139
  the dataset or a dictionary with 'train', 'val' and 'test', and values the paths to
88
140
  the CSV files.
141
+ - Added support for Slovene 🇸🇮! This includes the sentiment classification dataset
142
+ Sentinews, the linguistic acceptability dataset ScaLA-sl, the named entity recognition
143
+ dataset ssj500k-NER, the reading comprehension
144
+ dataset MultiWikiQA-sl, the knowledge dataset MMLU-sl, and the common-sense reasoning
145
+ dataset Winogrande-sl. This was contributed by @oliverkinch ✨
89
146
  - Added support for Serbian 🇷🇸! This includes the sentiment classification dataset
90
147
  MMS-sr, the linguistic acceptability dataset ScaLA-sr, the named entity recognition
91
148
  dataset UNER-sr, the reading comprehension dataset MultiWikiQA-sr, the summarisation
@@ -2860,7 +2917,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
2860
2917
  - Removed support for TensorFlow and Jax models, due to them not working
2861
2918
  properly anyway. They might be included at a later point, properly.
2862
2919
 
2863
- ##  [v1.4.0] - 2021-11-25
2920
+ ## [v1.4.0] - 2021-11-25
2864
2921
 
2865
2922
  ### Changed
2866
2923
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ScandEval
3
- Version: 16.7.0
3
+ Version: 16.8.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -92,7 +92,7 @@ ______________________________________________________________________
92
92
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
93
93
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
94
94
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
95
- [![Code Coverage](https://img.shields.io/badge/Coverage-74%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
95
+ [![Code Coverage](https://img.shields.io/badge/Coverage-70%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
96
96
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
97
97
 
98
98
  ## Maintainer
@@ -255,6 +255,19 @@ argument:
255
255
  euroeval --model my-model --api-base http://localhost:8000 --api-key my-secret-key
256
256
  ```
257
257
 
258
+ If your model is a reasoning model, then you need to specify this as follows:
259
+
260
+ ```bash
261
+ euroeval --model my-reasoning-model --api-base http://localhost:8000 --generative-type reasoning
262
+ ```
263
+
264
+ Likewise, if it is a pretrained decoder model (aka a completion model), then you specify
265
+ this as follows:
266
+
267
+ ```bash
268
+ euroeval --model my-base-decoder-model --api-base http://localhost:8000 --generative-type base
269
+ ```
270
+
258
271
  When using the `Benchmarker` API, the same applies. Here is an example of benchmarking
259
272
  an Ollama model hosted locally:
260
273
 
@@ -20,7 +20,7 @@ ______________________________________________________________________
20
20
  [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
21
21
  [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
22
22
  [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
23
- [![Code Coverage](https://img.shields.io/badge/Coverage-74%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
23
+ [![Code Coverage](https://img.shields.io/badge/Coverage-70%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
24
24
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
25
25
 
26
26
  ## Maintainer
@@ -183,6 +183,19 @@ argument:
183
183
  euroeval --model my-model --api-base http://localhost:8000 --api-key my-secret-key
184
184
  ```
185
185
 
186
+ If your model is a reasoning model, then you need to specify this as follows:
187
+
188
+ ```bash
189
+ euroeval --model my-reasoning-model --api-base http://localhost:8000 --generative-type reasoning
190
+ ```
191
+
192
+ Likewise, if it is a pretrained decoder model (aka a completion model), then you specify
193
+ this as follows:
194
+
195
+ ```bash
196
+ euroeval --model my-base-decoder-model --api-base http://localhost:8000 --generative-type base
197
+ ```
198
+
186
199
  When using the `Benchmarker` API, the same applies. Here is an example of benchmarking
187
200
  an Ollama model hosted locally:
188
201