euroeval-15.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval-15.2.0/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +44 -0
- euroeval-15.2.0/.github/ISSUE_TEMPLATE/bug.yaml +63 -0
- euroeval-15.2.0/.github/ISSUE_TEMPLATE/feature_request.yaml +27 -0
- euroeval-15.2.0/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +53 -0
- euroeval-15.2.0/.github/workflows/ci.yaml +90 -0
- euroeval-15.2.0/.gitignore +117 -0
- euroeval-15.2.0/.pre-commit-config.yaml +41 -0
- euroeval-15.2.0/CHANGELOG.md +2250 -0
- euroeval-15.2.0/CITATION.cff +33 -0
- euroeval-15.2.0/CODE_OF_CONDUCT.md +123 -0
- euroeval-15.2.0/CONTRIBUTING.md +90 -0
- euroeval-15.2.0/Dockerfile.cuda +25 -0
- euroeval-15.2.0/LICENSE +21 -0
- euroeval-15.2.0/PKG-INFO +234 -0
- euroeval-15.2.0/README.md +162 -0
- euroeval-15.2.0/docs/CNAME +1 -0
- euroeval-15.2.0/docs/README.md +37 -0
- euroeval-15.2.0/docs/datasets/README.md +8 -0
- euroeval-15.2.0/docs/datasets/danish.md +753 -0
- euroeval-15.2.0/docs/datasets/dutch.md +659 -0
- euroeval-15.2.0/docs/datasets/english.md +565 -0
- euroeval-15.2.0/docs/datasets/faroese.md +353 -0
- euroeval-15.2.0/docs/datasets/french.md +502 -0
- euroeval-15.2.0/docs/datasets/german.md +545 -0
- euroeval-15.2.0/docs/datasets/icelandic.md +904 -0
- euroeval-15.2.0/docs/datasets/norwegian.md +1023 -0
- euroeval-15.2.0/docs/datasets/swedish.md +625 -0
- euroeval-15.2.0/docs/extras/radial_plotter.md +16 -0
- euroeval-15.2.0/docs/faq.md +19 -0
- euroeval-15.2.0/docs/gfx/favicon.png +0 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/danish.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/dutch.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/english.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/faroese.md +20 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/french.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/german.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/icelandic.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/norwegian.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Monolingual/swedish.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Multilingual/european.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Multilingual/germanic.md +15 -0
- euroeval-15.2.0/docs/leaderboards/Multilingual/mainland-scandinavian.md +15 -0
- euroeval-15.2.0/docs/leaderboards/README.md +49 -0
- euroeval-15.2.0/docs/methodology.md +163 -0
- euroeval-15.2.0/docs/python-package.md +132 -0
- euroeval-15.2.0/docs/tasks/README.md +48 -0
- euroeval-15.2.0/docs/tasks/common-sense-reasoning.md +34 -0
- euroeval-15.2.0/docs/tasks/knowledge.md +33 -0
- euroeval-15.2.0/docs/tasks/linguistic-acceptability.md +33 -0
- euroeval-15.2.0/docs/tasks/named-entity-recognition.md +36 -0
- euroeval-15.2.0/docs/tasks/reading-comprehension.md +33 -0
- euroeval-15.2.0/docs/tasks/sentiment-classification.md +32 -0
- euroeval-15.2.0/docs/tasks/speed.md +37 -0
- euroeval-15.2.0/docs/tasks/summarization.md +40 -0
- euroeval-15.2.0/gfx/euroeval-no-bg.png +0 -0
- euroeval-15.2.0/gfx/euroeval-orig.png +0 -0
- euroeval-15.2.0/gfx/euroeval.png +0 -0
- euroeval-15.2.0/gfx/euroeval.xcf +0 -0
- euroeval-15.2.0/gfx/scandeval.png +0 -0
- euroeval-15.2.0/makefile +150 -0
- euroeval-15.2.0/mkdocs.yaml +55 -0
- euroeval-15.2.0/pyproject.toml +180 -0
- euroeval-15.2.0/src/euroeval/__init__.py +72 -0
- euroeval-15.2.0/src/euroeval/benchmark_config_factory.py +358 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/__init__.py +7 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/base.py +354 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/fresh.py +286 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/hf.py +1185 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/litellm.py +905 -0
- euroeval-15.2.0/src/euroeval/benchmark_modules/vllm.py +1171 -0
- euroeval-15.2.0/src/euroeval/benchmarker.py +1074 -0
- euroeval-15.2.0/src/euroeval/callbacks.py +72 -0
- euroeval-15.2.0/src/euroeval/cli.py +281 -0
- euroeval-15.2.0/src/euroeval/constants.py +50 -0
- euroeval-15.2.0/src/euroeval/data_loading.py +96 -0
- euroeval-15.2.0/src/euroeval/data_models.py +474 -0
- euroeval-15.2.0/src/euroeval/dataset_configs.py +2001 -0
- euroeval-15.2.0/src/euroeval/enums.py +144 -0
- euroeval-15.2.0/src/euroeval/exceptions.py +191 -0
- euroeval-15.2.0/src/euroeval/finetuning.py +324 -0
- euroeval-15.2.0/src/euroeval/generation.py +296 -0
- euroeval-15.2.0/src/euroeval/human_evaluation.py +737 -0
- euroeval-15.2.0/src/euroeval/languages.py +200 -0
- euroeval-15.2.0/src/euroeval/model_cache.py +253 -0
- euroeval-15.2.0/src/euroeval/model_config.py +77 -0
- euroeval-15.2.0/src/euroeval/model_loading.py +78 -0
- euroeval-15.2.0/src/euroeval/scores.py +90 -0
- euroeval-15.2.0/src/euroeval/speed_benchmark.py +124 -0
- euroeval-15.2.0/src/euroeval/task_utils/__init__.py +1 -0
- euroeval-15.2.0/src/euroeval/task_utils/multiple_choice_classification.py +176 -0
- euroeval-15.2.0/src/euroeval/task_utils/question_answering.py +698 -0
- euroeval-15.2.0/src/euroeval/task_utils/sequence_classification.py +237 -0
- euroeval-15.2.0/src/euroeval/task_utils/text_to_text.py +150 -0
- euroeval-15.2.0/src/euroeval/task_utils/token_classification.py +464 -0
- euroeval-15.2.0/src/euroeval/tasks.py +202 -0
- euroeval-15.2.0/src/euroeval/types.py +97 -0
- euroeval-15.2.0/src/euroeval/utils.py +574 -0
- euroeval-15.2.0/src/scripts/constants.py +25 -0
- euroeval-15.2.0/src/scripts/create_allocine.py +76 -0
- euroeval-15.2.0/src/scripts/create_angry_tweets.py +1 -0
- euroeval-15.2.0/src/scripts/create_arc.py +171 -0
- euroeval-15.2.0/src/scripts/create_arc_is.py +155 -0
- euroeval-15.2.0/src/scripts/create_belebele.py +195 -0
- euroeval-15.2.0/src/scripts/create_cnn_dailymail.py +75 -0
- euroeval-15.2.0/src/scripts/create_conll_en.py +105 -0
- euroeval-15.2.0/src/scripts/create_conll_nl.py +105 -0
- euroeval-15.2.0/src/scripts/create_dane.py +111 -0
- euroeval-15.2.0/src/scripts/create_danish_citizen_tests.py +95 -0
- euroeval-15.2.0/src/scripts/create_dansk.py +181 -0
- euroeval-15.2.0/src/scripts/create_danske_talemaader.py +106 -0
- euroeval-15.2.0/src/scripts/create_danske_talemaader_old.py +132 -0
- euroeval-15.2.0/src/scripts/create_dbrd.py +71 -0
- euroeval-15.2.0/src/scripts/create_dutch_cola.py +107 -0
- euroeval-15.2.0/src/scripts/create_dutch_social.py +84 -0
- euroeval-15.2.0/src/scripts/create_eltec.py +210 -0
- euroeval-15.2.0/src/scripts/create_fone.py +143 -0
- euroeval-15.2.0/src/scripts/create_foqa.py +50 -0
- euroeval-15.2.0/src/scripts/create_fosent.py +212 -0
- euroeval-15.2.0/src/scripts/create_fquad.py +157 -0
- euroeval-15.2.0/src/scripts/create_germanquad.py +98 -0
- euroeval-15.2.0/src/scripts/create_germeval.py +121 -0
- euroeval-15.2.0/src/scripts/create_hellaswag.py +161 -0
- euroeval-15.2.0/src/scripts/create_hotter_and_colder_sentiment.py +215 -0
- euroeval-15.2.0/src/scripts/create_ice_linguistic.py +99 -0
- euroeval-15.2.0/src/scripts/create_icelandic_error_corpus.py +140 -0
- euroeval-15.2.0/src/scripts/create_icelandic_knowledge.py +189 -0
- euroeval-15.2.0/src/scripts/create_icelandic_qa.py +173 -0
- euroeval-15.2.0/src/scripts/create_icesum.py +80 -0
- euroeval-15.2.0/src/scripts/create_jentoft.py +187 -0
- euroeval-15.2.0/src/scripts/create_mim_gold_ner.py +3 -0
- euroeval-15.2.0/src/scripts/create_mlsum.py +77 -0
- euroeval-15.2.0/src/scripts/create_mmlu.py +166 -0
- euroeval-15.2.0/src/scripts/create_no_sammendrag.py +70 -0
- euroeval-15.2.0/src/scripts/create_nordjylland_news.py +73 -0
- euroeval-15.2.0/src/scripts/create_norec.py +1 -0
- euroeval-15.2.0/src/scripts/create_norglm_multiqa.py +214 -0
- euroeval-15.2.0/src/scripts/create_norglm_multisum.py +80 -0
- euroeval-15.2.0/src/scripts/create_norne.py +145 -0
- euroeval-15.2.0/src/scripts/create_norquad.py +125 -0
- euroeval-15.2.0/src/scripts/create_nqii.py +99 -0
- euroeval-15.2.0/src/scripts/create_orange_sum.py +76 -0
- euroeval-15.2.0/src/scripts/create_personal_sum.py +83 -0
- euroeval-15.2.0/src/scripts/create_rrn.py +72 -0
- euroeval-15.2.0/src/scripts/create_sb10k.py +76 -0
- euroeval-15.2.0/src/scripts/create_scala.py +394 -0
- euroeval-15.2.0/src/scripts/create_scandiqa.py +101 -0
- euroeval-15.2.0/src/scripts/create_schibsted.py +101 -0
- euroeval-15.2.0/src/scripts/create_squad.py +108 -0
- euroeval-15.2.0/src/scripts/create_squad_nl.py +125 -0
- euroeval-15.2.0/src/scripts/create_squad_nl_old.py +127 -0
- euroeval-15.2.0/src/scripts/create_sst5.py +83 -0
- euroeval-15.2.0/src/scripts/create_suc3.py +130 -0
- euroeval-15.2.0/src/scripts/create_swedn.py +76 -0
- euroeval-15.2.0/src/scripts/create_swerec.py +153 -0
- euroeval-15.2.0/src/scripts/create_wiki_lingua_nl.py +76 -0
- euroeval-15.2.0/src/scripts/create_wikiann_fo.py +1 -0
- euroeval-15.2.0/src/scripts/create_winogrande_is.py +126 -0
- euroeval-15.2.0/src/scripts/fix_dot_env_file.py +46 -0
- euroeval-15.2.0/src/scripts/load_ud_pos.py +291 -0
- euroeval-15.2.0/src/scripts/versioning.py +136 -0
- euroeval-15.2.0/tests/__init__.py +1 -0
- euroeval-15.2.0/tests/conftest.py +171 -0
- euroeval-15.2.0/tests/test_benchmark_config_factory.py +238 -0
- euroeval-15.2.0/tests/test_benchmark_modules/__init__.py +1 -0
- euroeval-15.2.0/tests/test_benchmark_modules/test_base.py +1 -0
- euroeval-15.2.0/tests/test_benchmark_modules/test_fresh.py +1 -0
- euroeval-15.2.0/tests/test_benchmark_modules/test_hf.py +81 -0
- euroeval-15.2.0/tests/test_benchmark_modules/test_litellm.py +1 -0
- euroeval-15.2.0/tests/test_benchmark_modules/test_vllm.py +1 -0
- euroeval-15.2.0/tests/test_benchmarker.py +431 -0
- euroeval-15.2.0/tests/test_callbacks.py +76 -0
- euroeval-15.2.0/tests/test_cli.py +78 -0
- euroeval-15.2.0/tests/test_constants.py +14 -0
- euroeval-15.2.0/tests/test_data_loading.py +51 -0
- euroeval-15.2.0/tests/test_data_models.py +285 -0
- euroeval-15.2.0/tests/test_dataset_configs.py +44 -0
- euroeval-15.2.0/tests/test_enums.py +19 -0
- euroeval-15.2.0/tests/test_exceptions.py +17 -0
- euroeval-15.2.0/tests/test_finetuning.py +190 -0
- euroeval-15.2.0/tests/test_generation.py +19 -0
- euroeval-15.2.0/tests/test_human_evaluation.py +8 -0
- euroeval-15.2.0/tests/test_languages.py +41 -0
- euroeval-15.2.0/tests/test_model_cache.py +46 -0
- euroeval-15.2.0/tests/test_model_config.py +38 -0
- euroeval-15.2.0/tests/test_model_loading.py +58 -0
- euroeval-15.2.0/tests/test_scores.py +98 -0
- euroeval-15.2.0/tests/test_speed_benchmark.py +61 -0
- euroeval-15.2.0/tests/test_task_utils/__init__.py +1 -0
- euroeval-15.2.0/tests/test_task_utils/test_question_answering.py +1 -0
- euroeval-15.2.0/tests/test_task_utils/test_sequence_classification.py +1 -0
- euroeval-15.2.0/tests/test_task_utils/test_text_to_text.py +1 -0
- euroeval-15.2.0/tests/test_task_utils/test_token_classification.py +1 -0
- euroeval-15.2.0/tests/test_tasks.py +44 -0
- euroeval-15.2.0/tests/test_types.py +50 -0
- euroeval-15.2.0/tests/test_utils.py +163 -0
- euroeval-15.2.0/uv.lock +4777 -0
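
The bulk of the release is the Python package under src/euroeval/ plus dataset-creation scripts under src/scripts/. For orientation, here is a minimal sketch of how the package is typically invoked, assuming the `Benchmarker` API described in docs/python-package.md (the class lives in src/euroeval/benchmarker.py; the exact keyword names are not verified against this release):

```python
# Minimal sketch, assuming the Benchmarker API from docs/python-package.md.
# The model ID is a placeholder, and keyword names may differ in this release.
from euroeval import Benchmarker

benchmark = Benchmarker()      # default benchmark configuration per the docs
benchmark(model="<model-id>")  # evaluate a model, e.g. a Hugging Face model ID
```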
euroeval-15.2.0/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml
@@ -0,0 +1,44 @@
+name: 📚 Benchmark Dataset Request
+description: Do you think a particular benchmark dataset is missing in ScandEval?
+title: "[BENCHMARK DATASET REQUEST] <dataset-name>"
+labels: "benchmark dataset request"
+
+body:
+  - type: input
+    attributes:
+      label: Dataset name
+      description: What is the name of the dataset?
+    validations:
+      required: true
+  - type: input
+    attributes:
+      label: Dataset link
+      description: Please give a link to where the dataset is hosted (doesn't have to be on the Hugging Face Hub)
+    validations:
+      required: true
+  - type: checkboxes
+    attributes:
+      label: Dataset languages
+      description: What languages is the dataset in?
+      options:
+        - label: Danish
+        - label: Dutch
+        - label: English
+        - label: Faroese
+        - label: French
+        - label: German
+        - label: Icelandic
+        - label: Norwegian (Bokmål or Nynorsk)
+        - label: Swedish
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Describe the dataset
+      description: Describe what the dataset is measuring, and why you think it is important to include it as a benchmark dataset in ScandEval.
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!

euroeval-15.2.0/.github/ISSUE_TEMPLATE/bug.yaml
@@ -0,0 +1,63 @@
+name: 🐛 Bug Report
+description: Have you experienced a bug using the `scandeval` package?
+title: "[BUG] <name-of-bug>"
+labels: bug
+
+body:
+  - type: markdown
+    attributes:
+      value: >
+        #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/Scandeval/ScandEval/issues?q=is%3Aissue).
+  - type: textarea
+    attributes:
+      label: 🐛 Describe the bug
+      description: |
+        Please provide a clear and concise description of what the bug is. If relevant, add a minimal example so that we can reproduce the error by running the code.
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Operating System
+      description: What operating system are you on?
+      options:
+        - Linux
+        - MacOS
+        - Windows
+        - Other
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Device
+      description: What hardware device do you use?
+      options:
+        - CUDA GPU
+        - AMD GPU
+        - Mac GPU
+        - CPU
+        - Other
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Python version
+      description: What Python version are you using?
+      options:
+        - 3.10.x
+        - 3.11.x
+        - 3.12.x
+        - Older than 3.10.x
+        - Newer than 3.12.x
+    validations:
+      required: true
+  - type: input
+    attributes:
+      label: ScandEval version
+      description: What version of ScandEval are you using?
+      placeholder: Output of `pip list | grep ScandEval`
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!

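The bug template's "ScandEval version" field suggests `pip list | grep ScandEval`. An equivalent, purely standard-library sketch (trying both distribution names is an assumption, since the project appears mid-rename from ScandEval to EuroEval):

```python
# Print the installed version for the bug report's "ScandEval version" field.
from importlib.metadata import PackageNotFoundError, version

for dist in ("scandeval", "euroeval"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        pass  # this distribution is not installed
```
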
euroeval-15.2.0/.github/ISSUE_TEMPLATE/feature_request.yaml
@@ -0,0 +1,27 @@
+name: 🚀 Feature Request
+description: Is the ScandEval benchmark missing a feature?
+title: "[FEATURE REQUEST] <name-of-feature>"
+labels: enhancement
+
+body:
+  - type: textarea
+    attributes:
+      label: 🚀 The feature, motivation and pitch
+      description: >
+        A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Alternatives
+      description: >
+        A description of any alternative solutions or features you've considered, if any.
+  - type: textarea
+    attributes:
+      label: Additional context
+      description: >
+        Add any other context or screenshots about the feature request.
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!

euroeval-15.2.0/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml
@@ -0,0 +1,53 @@
+name: 📊 Model Evaluation Request
+description: Would you like to have a particular model included in the leaderboards?
+title: "[MODEL EVALUATION REQUEST] <model-name>"
+labels: "model evaluation request"
+
+body:
+  - type: input
+    attributes:
+      label: Model ID
+      description: What is the Hugging Face model ID?
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Model type
+      description: What is the architecture of the model?
+      options:
+        - Decoder model (e.g., GPT)
+        - Encoder model (e.g., BERT)
+        - Sequence-to-sequence model (e.g., T5)
+    validations:
+      required: true
+  - type: checkboxes
+    attributes:
+      label: Evaluation languages
+      description: >
+        What languages should this model be evaluated on? Tick all that apply. If the
+        model is multilingual (e.g., Mistral, Llama), then tick all the languages.
+      options:
+        - label: Danish
+        - label: Dutch
+        - label: English
+        - label: Faroese
+        - label: French
+        - label: German
+        - label: Icelandic
+        - label: Norwegian (Bokmål or Nynorsk)
+        - label: Swedish
+    validations:
+      required: true
+  - type: dropdown
+    attributes:
+      label: Merged model
+      description: Is the model a merge of other models, or built on top of a merged model?
+      options:
+        - Not a merged model
+        - Merged model
+    validations:
+      required: true
+  - type: markdown
+    attributes:
+      value: >
+        Thanks for contributing 🎉!

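The "Model ID" field expects a Hugging Face model ID. A sketch for checking that an ID resolves before filing a request, assuming the `huggingface_hub` client (the model ID is a placeholder):

```python
# Verify a Hugging Face model ID exists before filing an evaluation request
# (assumes the huggingface_hub package is installed; raises if the repo is unknown).
from huggingface_hub import model_info

info = model_info("<model-id>")
print(info.id, info.pipeline_tag)
```
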
euroeval-15.2.0/.github/workflows/ci.yaml
@@ -0,0 +1,90 @@
+name: CI
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - ready_for_review
+    branches:
+      - main
+
+jobs:
+  code-check:
+    if: github.event.pull_request.draft == false
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - uses: pre-commit/action@v3.0.1
+
+  pytest-linux:
+    if: github.event.pull_request.draft == false
+    permissions:
+      contents: read
+      pull-requests: write
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Dependencies
+        run: uv sync --no-dev --extra test
+
+      - name: Test with pytest
+        run: uv run pytest
+        env:
+          HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+      - name: Delete ScandEval cache
+        run: rm -rf .scandeval_cache
+
+  pytest-macos:
+    if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'macos')
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv and set up Python
+        uses: astral-sh/setup-uv@v4
+        with:
+          enable-cache: true
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Dependencies
+        run: uv sync --no-dev --extra test
+
+      - name: Test with pytest
+        run: uv run pytest
+        env:
+          HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }}
+          HF_TOKEN: ${{ secrets.HUGGINGFACE_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+
+      - name: Delete ScandEval cache
+        run: rm -rf .scandeval_cache

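Both pytest jobs reduce to the same two commands. A sketch that mirrors the workflow's `run:` steps locally (assumes `uv` is on PATH and the API keys above are exported in the environment):

```python
# Mirror the workflow's test steps locally. Assumes `uv` is installed and any
# required API keys (HF_TOKEN, OPENAI_API_KEY, ...) are set in the environment.
import subprocess

subprocess.run(["uv", "sync", "--no-dev", "--extra", "test"], check=True)
subprocess.run(["uv", "run", "pytest"], check=True)
```
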
euroeval-15.2.0/.gitignore
@@ -0,0 +1,117 @@
+# Byte-compiled / optimized / DLL files
+**/__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+.venv
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Log files
+*.log
+
+# PyBuilder
+target/
+
+# DotEnv configuration
+.env
+
+# Database
+*.db
+*.rdb
+
+# Pycharm
+.idea
+
+# VS Code
+.vscode/
+
+# Spyder
+.spyproject/
+
+# Jupyter NB Checkpoints
+.ipynb_checkpoints/
+
+# Mac OS-specific storage files
+.DS_Store
+
+# vim
+*.swp
+*.swo
+
+# Mypy cache
+.mypy_cache/
+
+# pytest cache
+.pytest_cache/
+
+# Ruff cache
+.ruff_cache/
+
+# Checkpoints
+checkpoint-*
+
+# Cache
+.scandeval_cache/
+.euroeval_cache/
+
+# Result log
+scandeval_benchmark_results.jsonl
+euroeval_benchmark_results.jsonl
+
+# Experiments
+gai-experiments.xlsx
+~$gai-experiments.xlsx
+
+# Upload of leaderboard
+src/upload/
+
+# Model outputs in debugging mode
+*.json
+
+# Mkdocs
+site/
+
+# Helper files for docs
+docs/datasets/dataset_example_commands.txt

euroeval-15.2.0/.pre-commit-config.yaml
@@ -0,0 +1,41 @@
+repos:
+  - repo: https://github.com/pre-commit/pygrep-hooks
+    rev: v1.10.0
+    hooks:
+      - id: python-use-type-annotations
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: debug-statements
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.8
+    hooks:
+      - id: ruff
+        args:
+          - --fix
+          - --exit-non-zero-on-fix
+        types_or:
+          - python
+          - pyi
+          - jupyter
+      - id: ruff-format
+        types_or:
+          - python
+          - pyi
+          - jupyter
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.15.0
+    hooks:
+      - id: mypy
+        args:
+          - --install-types
+          - --non-interactive
+          - --ignore-missing-imports
+          - --show-error-codes
+          - --check-untyped-defs