EuroEval 15.2.0 (tar.gz)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (196)
  1. euroeval-15.2.0/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +44 -0
  2. euroeval-15.2.0/.github/ISSUE_TEMPLATE/bug.yaml +63 -0
  3. euroeval-15.2.0/.github/ISSUE_TEMPLATE/feature_request.yaml +27 -0
  4. euroeval-15.2.0/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +53 -0
  5. euroeval-15.2.0/.github/workflows/ci.yaml +90 -0
  6. euroeval-15.2.0/.gitignore +117 -0
  7. euroeval-15.2.0/.pre-commit-config.yaml +41 -0
  8. euroeval-15.2.0/CHANGELOG.md +2250 -0
  9. euroeval-15.2.0/CITATION.cff +33 -0
  10. euroeval-15.2.0/CODE_OF_CONDUCT.md +123 -0
  11. euroeval-15.2.0/CONTRIBUTING.md +90 -0
  12. euroeval-15.2.0/Dockerfile.cuda +25 -0
  13. euroeval-15.2.0/LICENSE +21 -0
  14. euroeval-15.2.0/PKG-INFO +234 -0
  15. euroeval-15.2.0/README.md +162 -0
  16. euroeval-15.2.0/docs/CNAME +1 -0
  17. euroeval-15.2.0/docs/README.md +37 -0
  18. euroeval-15.2.0/docs/datasets/README.md +8 -0
  19. euroeval-15.2.0/docs/datasets/danish.md +753 -0
  20. euroeval-15.2.0/docs/datasets/dutch.md +659 -0
  21. euroeval-15.2.0/docs/datasets/english.md +565 -0
  22. euroeval-15.2.0/docs/datasets/faroese.md +353 -0
  23. euroeval-15.2.0/docs/datasets/french.md +502 -0
  24. euroeval-15.2.0/docs/datasets/german.md +545 -0
  25. euroeval-15.2.0/docs/datasets/icelandic.md +904 -0
  26. euroeval-15.2.0/docs/datasets/norwegian.md +1023 -0
  27. euroeval-15.2.0/docs/datasets/swedish.md +625 -0
  28. euroeval-15.2.0/docs/extras/radial_plotter.md +16 -0
  29. euroeval-15.2.0/docs/faq.md +19 -0
  30. euroeval-15.2.0/docs/gfx/favicon.png +0 -0
  31. euroeval-15.2.0/docs/leaderboards/Monolingual/danish.md +15 -0
  32. euroeval-15.2.0/docs/leaderboards/Monolingual/dutch.md +15 -0
  33. euroeval-15.2.0/docs/leaderboards/Monolingual/english.md +15 -0
  34. euroeval-15.2.0/docs/leaderboards/Monolingual/faroese.md +20 -0
  35. euroeval-15.2.0/docs/leaderboards/Monolingual/french.md +15 -0
  36. euroeval-15.2.0/docs/leaderboards/Monolingual/german.md +15 -0
  37. euroeval-15.2.0/docs/leaderboards/Monolingual/icelandic.md +15 -0
  38. euroeval-15.2.0/docs/leaderboards/Monolingual/norwegian.md +15 -0
  39. euroeval-15.2.0/docs/leaderboards/Monolingual/swedish.md +15 -0
  40. euroeval-15.2.0/docs/leaderboards/Multilingual/european.md +15 -0
  41. euroeval-15.2.0/docs/leaderboards/Multilingual/germanic.md +15 -0
  42. euroeval-15.2.0/docs/leaderboards/Multilingual/mainland-scandinavian.md +15 -0
  43. euroeval-15.2.0/docs/leaderboards/README.md +49 -0
  44. euroeval-15.2.0/docs/methodology.md +163 -0
  45. euroeval-15.2.0/docs/python-package.md +132 -0
  46. euroeval-15.2.0/docs/tasks/README.md +48 -0
  47. euroeval-15.2.0/docs/tasks/common-sense-reasoning.md +34 -0
  48. euroeval-15.2.0/docs/tasks/knowledge.md +33 -0
  49. euroeval-15.2.0/docs/tasks/linguistic-acceptability.md +33 -0
  50. euroeval-15.2.0/docs/tasks/named-entity-recognition.md +36 -0
  51. euroeval-15.2.0/docs/tasks/reading-comprehension.md +33 -0
  52. euroeval-15.2.0/docs/tasks/sentiment-classification.md +32 -0
  53. euroeval-15.2.0/docs/tasks/speed.md +37 -0
  54. euroeval-15.2.0/docs/tasks/summarization.md +40 -0
  55. euroeval-15.2.0/gfx/euroeval-no-bg.png +0 -0
  56. euroeval-15.2.0/gfx/euroeval-orig.png +0 -0
  57. euroeval-15.2.0/gfx/euroeval.png +0 -0
  58. euroeval-15.2.0/gfx/euroeval.xcf +0 -0
  59. euroeval-15.2.0/gfx/scandeval.png +0 -0
  60. euroeval-15.2.0/makefile +150 -0
  61. euroeval-15.2.0/mkdocs.yaml +55 -0
  62. euroeval-15.2.0/pyproject.toml +180 -0
  63. euroeval-15.2.0/src/euroeval/__init__.py +72 -0
  64. euroeval-15.2.0/src/euroeval/benchmark_config_factory.py +358 -0
  65. euroeval-15.2.0/src/euroeval/benchmark_modules/__init__.py +7 -0
  66. euroeval-15.2.0/src/euroeval/benchmark_modules/base.py +354 -0
  67. euroeval-15.2.0/src/euroeval/benchmark_modules/fresh.py +286 -0
  68. euroeval-15.2.0/src/euroeval/benchmark_modules/hf.py +1185 -0
  69. euroeval-15.2.0/src/euroeval/benchmark_modules/litellm.py +905 -0
  70. euroeval-15.2.0/src/euroeval/benchmark_modules/vllm.py +1171 -0
  71. euroeval-15.2.0/src/euroeval/benchmarker.py +1074 -0
  72. euroeval-15.2.0/src/euroeval/callbacks.py +72 -0
  73. euroeval-15.2.0/src/euroeval/cli.py +281 -0
  74. euroeval-15.2.0/src/euroeval/constants.py +50 -0
  75. euroeval-15.2.0/src/euroeval/data_loading.py +96 -0
  76. euroeval-15.2.0/src/euroeval/data_models.py +474 -0
  77. euroeval-15.2.0/src/euroeval/dataset_configs.py +2001 -0
  78. euroeval-15.2.0/src/euroeval/enums.py +144 -0
  79. euroeval-15.2.0/src/euroeval/exceptions.py +191 -0
  80. euroeval-15.2.0/src/euroeval/finetuning.py +324 -0
  81. euroeval-15.2.0/src/euroeval/generation.py +296 -0
  82. euroeval-15.2.0/src/euroeval/human_evaluation.py +737 -0
  83. euroeval-15.2.0/src/euroeval/languages.py +200 -0
  84. euroeval-15.2.0/src/euroeval/model_cache.py +253 -0
  85. euroeval-15.2.0/src/euroeval/model_config.py +77 -0
  86. euroeval-15.2.0/src/euroeval/model_loading.py +78 -0
  87. euroeval-15.2.0/src/euroeval/scores.py +90 -0
  88. euroeval-15.2.0/src/euroeval/speed_benchmark.py +124 -0
  89. euroeval-15.2.0/src/euroeval/task_utils/__init__.py +1 -0
  90. euroeval-15.2.0/src/euroeval/task_utils/multiple_choice_classification.py +176 -0
  91. euroeval-15.2.0/src/euroeval/task_utils/question_answering.py +698 -0
  92. euroeval-15.2.0/src/euroeval/task_utils/sequence_classification.py +237 -0
  93. euroeval-15.2.0/src/euroeval/task_utils/text_to_text.py +150 -0
  94. euroeval-15.2.0/src/euroeval/task_utils/token_classification.py +464 -0
  95. euroeval-15.2.0/src/euroeval/tasks.py +202 -0
  96. euroeval-15.2.0/src/euroeval/types.py +97 -0
  97. euroeval-15.2.0/src/euroeval/utils.py +574 -0
  98. euroeval-15.2.0/src/scripts/constants.py +25 -0
  99. euroeval-15.2.0/src/scripts/create_allocine.py +76 -0
  100. euroeval-15.2.0/src/scripts/create_angry_tweets.py +1 -0
  101. euroeval-15.2.0/src/scripts/create_arc.py +171 -0
  102. euroeval-15.2.0/src/scripts/create_arc_is.py +155 -0
  103. euroeval-15.2.0/src/scripts/create_belebele.py +195 -0
  104. euroeval-15.2.0/src/scripts/create_cnn_dailymail.py +75 -0
  105. euroeval-15.2.0/src/scripts/create_conll_en.py +105 -0
  106. euroeval-15.2.0/src/scripts/create_conll_nl.py +105 -0
  107. euroeval-15.2.0/src/scripts/create_dane.py +111 -0
  108. euroeval-15.2.0/src/scripts/create_danish_citizen_tests.py +95 -0
  109. euroeval-15.2.0/src/scripts/create_dansk.py +181 -0
  110. euroeval-15.2.0/src/scripts/create_danske_talemaader.py +106 -0
  111. euroeval-15.2.0/src/scripts/create_danske_talemaader_old.py +132 -0
  112. euroeval-15.2.0/src/scripts/create_dbrd.py +71 -0
  113. euroeval-15.2.0/src/scripts/create_dutch_cola.py +107 -0
  114. euroeval-15.2.0/src/scripts/create_dutch_social.py +84 -0
  115. euroeval-15.2.0/src/scripts/create_eltec.py +210 -0
  116. euroeval-15.2.0/src/scripts/create_fone.py +143 -0
  117. euroeval-15.2.0/src/scripts/create_foqa.py +50 -0
  118. euroeval-15.2.0/src/scripts/create_fosent.py +212 -0
  119. euroeval-15.2.0/src/scripts/create_fquad.py +157 -0
  120. euroeval-15.2.0/src/scripts/create_germanquad.py +98 -0
  121. euroeval-15.2.0/src/scripts/create_germeval.py +121 -0
  122. euroeval-15.2.0/src/scripts/create_hellaswag.py +161 -0
  123. euroeval-15.2.0/src/scripts/create_hotter_and_colder_sentiment.py +215 -0
  124. euroeval-15.2.0/src/scripts/create_ice_linguistic.py +99 -0
  125. euroeval-15.2.0/src/scripts/create_icelandic_error_corpus.py +140 -0
  126. euroeval-15.2.0/src/scripts/create_icelandic_knowledge.py +189 -0
  127. euroeval-15.2.0/src/scripts/create_icelandic_qa.py +173 -0
  128. euroeval-15.2.0/src/scripts/create_icesum.py +80 -0
  129. euroeval-15.2.0/src/scripts/create_jentoft.py +187 -0
  130. euroeval-15.2.0/src/scripts/create_mim_gold_ner.py +3 -0
  131. euroeval-15.2.0/src/scripts/create_mlsum.py +77 -0
  132. euroeval-15.2.0/src/scripts/create_mmlu.py +166 -0
  133. euroeval-15.2.0/src/scripts/create_no_sammendrag.py +70 -0
  134. euroeval-15.2.0/src/scripts/create_nordjylland_news.py +73 -0
  135. euroeval-15.2.0/src/scripts/create_norec.py +1 -0
  136. euroeval-15.2.0/src/scripts/create_norglm_multiqa.py +214 -0
  137. euroeval-15.2.0/src/scripts/create_norglm_multisum.py +80 -0
  138. euroeval-15.2.0/src/scripts/create_norne.py +145 -0
  139. euroeval-15.2.0/src/scripts/create_norquad.py +125 -0
  140. euroeval-15.2.0/src/scripts/create_nqii.py +99 -0
  141. euroeval-15.2.0/src/scripts/create_orange_sum.py +76 -0
  142. euroeval-15.2.0/src/scripts/create_personal_sum.py +83 -0
  143. euroeval-15.2.0/src/scripts/create_rrn.py +72 -0
  144. euroeval-15.2.0/src/scripts/create_sb10k.py +76 -0
  145. euroeval-15.2.0/src/scripts/create_scala.py +394 -0
  146. euroeval-15.2.0/src/scripts/create_scandiqa.py +101 -0
  147. euroeval-15.2.0/src/scripts/create_schibsted.py +101 -0
  148. euroeval-15.2.0/src/scripts/create_squad.py +108 -0
  149. euroeval-15.2.0/src/scripts/create_squad_nl.py +125 -0
  150. euroeval-15.2.0/src/scripts/create_squad_nl_old.py +127 -0
  151. euroeval-15.2.0/src/scripts/create_sst5.py +83 -0
  152. euroeval-15.2.0/src/scripts/create_suc3.py +130 -0
  153. euroeval-15.2.0/src/scripts/create_swedn.py +76 -0
  154. euroeval-15.2.0/src/scripts/create_swerec.py +153 -0
  155. euroeval-15.2.0/src/scripts/create_wiki_lingua_nl.py +76 -0
  156. euroeval-15.2.0/src/scripts/create_wikiann_fo.py +1 -0
  157. euroeval-15.2.0/src/scripts/create_winogrande_is.py +126 -0
  158. euroeval-15.2.0/src/scripts/fix_dot_env_file.py +46 -0
  159. euroeval-15.2.0/src/scripts/load_ud_pos.py +291 -0
  160. euroeval-15.2.0/src/scripts/versioning.py +136 -0
  161. euroeval-15.2.0/tests/__init__.py +1 -0
  162. euroeval-15.2.0/tests/conftest.py +171 -0
  163. euroeval-15.2.0/tests/test_benchmark_config_factory.py +238 -0
  164. euroeval-15.2.0/tests/test_benchmark_modules/__init__.py +1 -0
  165. euroeval-15.2.0/tests/test_benchmark_modules/test_base.py +1 -0
  166. euroeval-15.2.0/tests/test_benchmark_modules/test_fresh.py +1 -0
  167. euroeval-15.2.0/tests/test_benchmark_modules/test_hf.py +81 -0
  168. euroeval-15.2.0/tests/test_benchmark_modules/test_litellm.py +1 -0
  169. euroeval-15.2.0/tests/test_benchmark_modules/test_vllm.py +1 -0
  170. euroeval-15.2.0/tests/test_benchmarker.py +431 -0
  171. euroeval-15.2.0/tests/test_callbacks.py +76 -0
  172. euroeval-15.2.0/tests/test_cli.py +78 -0
  173. euroeval-15.2.0/tests/test_constants.py +14 -0
  174. euroeval-15.2.0/tests/test_data_loading.py +51 -0
  175. euroeval-15.2.0/tests/test_data_models.py +285 -0
  176. euroeval-15.2.0/tests/test_dataset_configs.py +44 -0
  177. euroeval-15.2.0/tests/test_enums.py +19 -0
  178. euroeval-15.2.0/tests/test_exceptions.py +17 -0
  179. euroeval-15.2.0/tests/test_finetuning.py +190 -0
  180. euroeval-15.2.0/tests/test_generation.py +19 -0
  181. euroeval-15.2.0/tests/test_human_evaluation.py +8 -0
  182. euroeval-15.2.0/tests/test_languages.py +41 -0
  183. euroeval-15.2.0/tests/test_model_cache.py +46 -0
  184. euroeval-15.2.0/tests/test_model_config.py +38 -0
  185. euroeval-15.2.0/tests/test_model_loading.py +58 -0
  186. euroeval-15.2.0/tests/test_scores.py +98 -0
  187. euroeval-15.2.0/tests/test_speed_benchmark.py +61 -0
  188. euroeval-15.2.0/tests/test_task_utils/__init__.py +1 -0
  189. euroeval-15.2.0/tests/test_task_utils/test_question_answering.py +1 -0
  190. euroeval-15.2.0/tests/test_task_utils/test_sequence_classification.py +1 -0
  191. euroeval-15.2.0/tests/test_task_utils/test_text_to_text.py +1 -0
  192. euroeval-15.2.0/tests/test_task_utils/test_token_classification.py +1 -0
  193. euroeval-15.2.0/tests/test_tasks.py +44 -0
  194. euroeval-15.2.0/tests/test_types.py +50 -0
  195. euroeval-15.2.0/tests/test_utils.py +163 -0
  196. euroeval-15.2.0/uv.lock +4777 -0
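The listing is dominated by the Python package itself (`src/euroeval/`), the dataset-creation scripts (`src/scripts/`), the test suite (`tests/`), and the documentation (`docs/`). For orientation, here is a minimal usage sketch, assuming the `Benchmarker` class implemented in `src/euroeval/benchmarker.py` is exported at package level (as the listing's `src/euroeval/__init__.py` suggests); the model ID and keyword arguments are illustrative, so check `docs/python-package.md` in this release for the exact signature.

```python
# Minimal usage sketch; the exact signature lives in
# docs/python-package.md and src/euroeval/benchmarker.py.
from euroeval import Benchmarker

# Evaluate a single Hugging Face model on Danish datasets. Results are
# appended to euroeval_benchmark_results.jsonl, the file the release's
# .gitignore also anticipates.
benchmarker = Benchmarker()
benchmarker.benchmark(model="<huggingface-model-id>", language="da")
```

The contents of the first seven files in the listing follow.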
euroeval-15.2.0/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml
@@ -0,0 +1,44 @@
+ name: 📚 Benchmark Dataset Request
+ description: Do you think a particular benchmark dataset is missing in ScandEval?
+ title: "[BENCHMARK DATASET REQUEST] <dataset-name>"
+ labels: "benchmark dataset request"
+
+ body:
+   - type: input
+     attributes:
+       label: Dataset name
+       description: What is the name of the dataset?
+     validations:
+       required: true
+   - type: input
+     attributes:
+       label: Dataset link
+       description: Please give a link to where the dataset is hosted (doesn't have to be on the Hugging Face Hub)
+     validations:
+       required: true
+   - type: checkboxes
+     attributes:
+       label: Dataset languages
+       description: What languages is the dataset in?
+       options:
+         - label: Danish
+         - label: Dutch
+         - label: English
+         - label: Faroese
+         - label: French
+         - label: German
+         - label: Icelandic
+         - label: Norwegian (Bokmål or Nynorsk)
+         - label: Swedish
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: Describe the dataset
+       description: Describe what the dataset is measuring, and why you think it is important to include it as a benchmark dataset in ScandEval.
+     validations:
+       required: true
+   - type: markdown
+     attributes:
+       value: >
+         Thanks for contributing 🎉!
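Since these issue forms are plain YAML, basic structural mistakes are easy to catch with a script. Below is a small, hypothetical sanity check, assuming PyYAML is installed; the `REQUIRED_TOP_KEYS` set is my own minimal choice and is not part of the release.

```python
# Hypothetical sanity check for GitHub issue-form YAML files like the one
# above. Assumes PyYAML (pip install pyyaml); not part of EuroEval itself.
import sys

import yaml

REQUIRED_TOP_KEYS = {"name", "description", "body"}  # illustrative minimum


def check_issue_form(path: str) -> list[str]:
    """Return a list of problems found in the issue form at `path`."""
    with open(path, encoding="utf-8") as f:
        form = yaml.safe_load(f) or {}
    problems = [f"missing key: {key}" for key in sorted(REQUIRED_TOP_KEYS - form.keys())]
    for idx, element in enumerate(form.get("body", [])):
        if "type" not in element:
            problems.append(f"body[{idx}] has no `type` field")
    return problems


if __name__ == "__main__":
    for path in sys.argv[1:]:
        for problem in check_issue_form(path):
            print(f"{path}: {problem}")
```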
euroeval-15.2.0/.github/ISSUE_TEMPLATE/bug.yaml
@@ -0,0 +1,63 @@
+ name: 🐛 Bug Report
+ description: Have you experienced a bug using the `scandeval` package?
+ title: "[BUG] <name-of-bug>"
+ labels: bug
+
+ body:
+   - type: markdown
+     attributes:
+       value: >
+         #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/Scandeval/ScandEval/issues?q=is%3Aissue).
+   - type: textarea
+     attributes:
+       label: 🐛 Describe the bug
+       description: |
+         Please provide a clear and concise description of what the bug is. If relevant, add a minimal example so that we can reproduce the error by running the code.
+     validations:
+       required: true
+   - type: dropdown
+     attributes:
+       label: Operating System
+       description: What operating system are you on?
+       options:
+         - Linux
+         - MacOS
+         - Windows
+         - Other
+     validations:
+       required: true
+   - type: dropdown
+     attributes:
+       label: Device
+       description: What hardware device do you use?
+       options:
+         - CUDA GPU
+         - AMD GPU
+         - Mac GPU
+         - CPU
+         - Other
+     validations:
+       required: true
+   - type: dropdown
+     attributes:
+       label: Python version
+       description: What Python version are you using?
+       options:
+         - 3.10.x
+         - 3.11.x
+         - 3.12.x
+         - Older than 3.10.x
+         - Newer than 3.12.x
+     validations:
+       required: true
+   - type: input
+     attributes:
+       label: ScandEval version
+       description: What version of ScandEval are you using?
+       placeholder: Output of `pip list | grep ScandEval`
+     validations:
+       required: true
+   - type: markdown
+     attributes:
+       value: >
+         Thanks for contributing 🎉!
euroeval-15.2.0/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -0,0 +1,27 @@
+ name: 🚀 Feature Request
+ description: Is the ScandEval benchmark missing a feature?
+ title: "[FEATURE REQUEST] <name-of-feature>"
+ labels: enhancement
+
+ body:
+   - type: textarea
+     attributes:
+       label: 🚀 The feature, motivation and pitch
+       description: >
+         A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*.
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: Alternatives
+       description: >
+         A description of any alternative solutions or features you've considered, if any.
+   - type: textarea
+     attributes:
+       label: Additional context
+       description: >
+         Add any other context or screenshots about the feature request.
+   - type: markdown
+     attributes:
+       value: >
+         Thanks for contributing 🎉!
euroeval-15.2.0/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml
@@ -0,0 +1,53 @@
+ name: 📊 Model Evaluation Request
+ description: Would you like to have a particular model included in the leaderboards?
+ title: "[MODEL EVALUATION REQUEST] <model-name>"
+ labels: "model evaluation request"
+
+ body:
+   - type: input
+     attributes:
+       label: Model ID
+       description: What is the Hugging Face model ID?
+     validations:
+       required: true
+   - type: dropdown
+     attributes:
+       label: Model type
+       description: What is the architecture of the model?
+       options:
+         - Decoder model (e.g., GPT)
+         - Encoder model (e.g., BERT)
+         - Sequence-to-sequence model (e.g., T5)
+     validations:
+       required: true
+   - type: checkboxes
+     attributes:
+       label: Evaluation languages
+       description: >
+         What languages should this model be evaluated on? Tick all that apply. If the
+         model is multilingual (e.g., Mistral, Llama), then tick all the languages.
+       options:
+         - label: Danish
+         - label: Dutch
+         - label: English
+         - label: Faroese
+         - label: French
+         - label: German
+         - label: Icelandic
+         - label: Norwegian (Bokmål or Nynorsk)
+         - label: Swedish
+     validations:
+       required: true
+   - type: dropdown
+     attributes:
+       label: Merged model
+       description: Is the model a merge of other models, or built on top of a merged model?
+       options:
+         - Not a merged model
+         - Merged model
+     validations:
+       required: true
+   - type: markdown
+     attributes:
+       value: >
+         Thanks for contributing 🎉!
euroeval-15.2.0/.github/workflows/ci.yaml
@@ -0,0 +1,90 @@
+ name: CI
+
+ on:
+   pull_request_target:
+     types:
+       - opened
+       - synchronize
+       - reopened
+       - ready_for_review
+     branches:
+       - main
+
+ jobs:
+   code-check:
+     if: github.event.pull_request.draft == false
+     permissions:
+       contents: read
+       pull-requests: write
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           persist-credentials: false
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.11"
+       - uses: pre-commit/action@v3.0.1
+
+   pytest-linux:
+     if: github.event.pull_request.draft == false
+     permissions:
+       contents: read
+       pull-requests: write
+     strategy:
+       matrix:
+         python-version: ["3.10", "3.11", "3.12"]
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           persist-credentials: false
+
+       - name: Install uv and set up Python
+         uses: astral-sh/setup-uv@v4
+         with:
+           enable-cache: true
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install Dependencies
+         run: uv sync --no-dev --extra test
+
+       - name: Test with pytest
+         run: uv run pytest
+         env:
+           HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
+           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+       - name: Delete ScandEval cache
+         run: rm -rf .scandeval_cache
+
+   pytest-macos:
+     if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'macos')
+     permissions:
+       contents: read
+       pull-requests: write
+     runs-on: macos-latest
+     steps:
+       - uses: actions/checkout@v4
+
+       - name: Install uv and set up Python
+         uses: astral-sh/setup-uv@v4
+         with:
+           enable-cache: true
+           python-version: ${{ matrix.python-version }}
+
+       - name: Install Dependencies
+         run: uv sync --no-dev --extra test
+
+       - name: Test with pytest
+         run: uv run pytest
+         env:
+           HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+           HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
+           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+       - name: Delete ScandEval cache
+         run: rm -rf .scandeval_cache
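Two details of this workflow are worth flagging. First, `HF_TOKEN` is deliberately fed from the same `HUGGINGFACE_API_KEY` secret. Second, the `pytest-macos` job references `${{ matrix.python-version }}` without defining a `strategy.matrix`, so that expression appears to expand to an empty string and the job falls back to setup-uv's default Python. When running the suite locally without these secrets, a common pattern is to skip the affected tests rather than fail; the sketch below shows one way to do that with pytest. The marker and test names are illustrative and not taken from the EuroEval test suite.

```python
# Illustrative local-testing guard; not from the EuroEval test suite.
# Skips a test when a secret that the CI workflow injects is unset.
import os

import pytest

requires_openai = pytest.mark.skipif(
    "OPENAI_API_KEY" not in os.environ,
    reason="OPENAI_API_KEY not set (CI injects it from repository secrets)",
)


@requires_openai
def test_openai_model_evaluation() -> None:
    # Placeholder body; a real test would exercise the LiteLLM module.
    assert os.environ["OPENAI_API_KEY"]
```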
euroeval-15.2.0/.gitignore
@@ -0,0 +1,117 @@
+ # Byte-compiled / optimized / DLL files
+ **/__pycache__/
+ *.py[cod]
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ env/
+ .venv
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Log files
+ *.log
+
+ # PyBuilder
+ target/
+
+ # DotEnv configuration
+ .env
+
+ # Database
+ *.db
+ *.rdb
+
+ # Pycharm
+ .idea
+
+ # VS Code
+ .vscode/
+
+ # Spyder
+ .spyproject/
+
+ # Jupyter NB Checkpoints
+ .ipynb_checkpoints/
+
+ # Mac OS-specific storage files
+ .DS_Store
+
+ # vim
+ *.swp
+ *.swo
+
+ # Mypy cache
+ .mypy_cache/
+
+ # pytest cache
+ .pytest_cache/
+
+ # Ruff cache
+ .ruff_cache/
+
+ # Checkpoints
+ checkpoint-*
+
+ # Cache
+ .scandeval_cache/
+ .euroeval_cache/
+
+ # Result log
+ scandeval_benchmark_results.jsonl
+ euroeval_benchmark_results.jsonl
+
+ # Experiments
+ gai-experiments.xlsx
+ ~$gai-experiments.xlsx
+
+ # Upload of leaderboard
+ src/upload/
+
+ # Model outputs in debugging mode
+ *.json
+
+ # Mkdocs
+ site/
+
+ # Helper files for docs
+ docs/datasets/dataset_example_commands.txt
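Two entries here are broader than they look: `*.json` ignores every JSON file in the repository (a deliberate catch-all for model outputs written in debugging mode, per its comment), and `checkpoint-*` presumably catches Hugging Face `Trainer` checkpoint directories produced during finetuning. As a rough illustration of how such glob patterns match, here is a sketch using Python's `fnmatch`, which approximates but does not fully implement gitignore semantics (no negation or directory-only rules):

```python
# Rough illustration of glob matching for patterns seen above. fnmatch
# approximates gitignore: it has no negation (!) or directory-only (/)
# semantics, so treat this as a sketch rather than a faithful matcher.
from fnmatch import fnmatch

patterns = ["*.json", "checkpoint-*", "*.swp"]
paths = ["outputs/run1.json", "checkpoint-500", "notes.txt", ".main.py.swp"]

for path in paths:
    # gitignore matches individual path components, so test the basename.
    name = path.rsplit("/", 1)[-1]
    if any(fnmatch(name, pattern) for pattern in patterns):
        print(f"ignored: {path}")
```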
euroeval-15.2.0/.pre-commit-config.yaml
@@ -0,0 +1,41 @@
+ repos:
+   - repo: https://github.com/pre-commit/pygrep-hooks
+     rev: v1.10.0
+     hooks:
+       - id: python-use-type-annotations
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v5.0.0
+     hooks:
+       - id: end-of-file-fixer
+       - id: trailing-whitespace
+       - id: debug-statements
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     rev: v0.9.8
+     hooks:
+       - id: ruff
+         args:
+           - --fix
+           - --exit-non-zero-on-fix
+         types_or:
+           - python
+           - pyi
+           - jupyter
+       - id: ruff-format
+         types_or:
+           - python
+           - pyi
+           - jupyter
+   - repo: https://github.com/kynan/nbstripout
+     rev: 0.8.1
+     hooks:
+       - id: nbstripout
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: v1.15.0
+     hooks:
+       - id: mypy
+         args:
+           - --install-types
+           - --non-interactive
+           - --ignore-missing-imports
+           - --show-error-codes
+           - --check-untyped-defs
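To run this same hook suite locally, the usual commands are `pre-commit install` (to attach the hooks to git) followed by `pre-commit run --all-files`. A minimal Python wrapper, assuming the `pre-commit` package is installed and offered purely as a convenience sketch rather than anything shipped in this release, might look like:

```python
# Convenience sketch for running the hooks configured above. Assumes
# `pre-commit` is installed (pip install pre-commit); not part of the
# EuroEval release itself.
import subprocess
import sys


def run_hooks(all_files: bool = True) -> int:
    """Run the repository's pre-commit hooks and return the exit code."""
    cmd = ["pre-commit", "run"]
    if all_files:
        cmd.append("--all-files")
    return subprocess.run(cmd, check=False).returncode


if __name__ == "__main__":
    sys.exit(run_hooks())
```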