euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of EuroEval has been flagged as potentially problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval-15.2.0.dist-info/METADATA
@@ -0,0 +1,234 @@
+ Metadata-Version: 2.4
+ Name: EuroEval
+ Version: 15.2.0
+ Summary: The robust European language model benchmark.
+ Project-URL: Repository, https://github.com/EuroEval/EuroEval
+ Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
+ Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
+ Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>
+ License: MIT License
+
+ Copyright (c) 2022-2024 Dan Saattrup Nielsen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ License-File: LICENSE
+ Requires-Python: <4.0,>=3.10
+ Requires-Dist: accelerate>=0.34.2
+ Requires-Dist: bert-score>=0.3.13
+ Requires-Dist: click>=8.1.3
+ Requires-Dist: datasets>=2.15.0
+ Requires-Dist: evaluate>=0.4.1
+ Requires-Dist: huggingface-hub>=0.24.0
+ Requires-Dist: levenshtein>=0.24.0
+ Requires-Dist: litellm>=1.61.13
+ Requires-Dist: more-itertools>=10.5.0
+ Requires-Dist: numpy<2.0.0,>=1.23.0
+ Requires-Dist: pandas>=2.2.0
+ Requires-Dist: protobuf~=3.20.0
+ Requires-Dist: pydantic>=2.6.0
+ Requires-Dist: pyinfer>=0.0.3
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: rouge-score>=0.1.2
+ Requires-Dist: sacremoses>=0.1.1
+ Requires-Dist: scikit-learn<1.6.0
+ Requires-Dist: sentencepiece>=0.1.96
+ Requires-Dist: seqeval>=1.2.2
+ Requires-Dist: tenacity>=9.0.0
+ Requires-Dist: termcolor>=2.0.0
+ Requires-Dist: torch>=2.3.0
+ Requires-Dist: transformers>=4.47.0
+ Provides-Extra: all
+ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: demjson3>=3.0.6; extra == 'all'
+ Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: gradio>=4.26.0; extra == 'all'
+ Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'all'
+ Provides-Extra: generative
+ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: demjson3>=3.0.6; extra == 'generative'
+ Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'generative'
+ Provides-Extra: human-evaluation
+ Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
+ Provides-Extra: test
+ Requires-Dist: gradio>=4.26.0; extra == 'test'
+ Description-Content-Type: text/markdown
+
+ <div align='center'>
+ <img src="https://raw.githubusercontent.com/EuroEval/EuroEval/main/gfx/euroeval.png" height="500" width="372">
+ </div>
+
+ ### The robust European language model benchmark.
+
+ ______________________________________________________________________
+ [![Documentation](https://img.shields.io/badge/docs-passing-green)](https://euroeval.com)
+ [![PyPI Status](https://badge.fury.io/py/euroeval.svg)](https://pypi.org/project/euroeval/)
+ [![First paper](https://img.shields.io/badge/arXiv-2304.00906-b31b1b.svg)](https://arxiv.org/abs/2304.00906)
+ [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
+ [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
+ [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
+ [![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+ [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
+
+
+ ## Maintainers
+
+ - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
+   dan.nielsen@alexandra.dk)
+ - Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
+   kenneth.enevoldsen@cas.au.dk)
+
+
+ ## Installation
+ To install the package, simply run the following command in your favorite terminal:
+ ```
+ $ pip install euroeval[all]
+ ```
+
+ This will install the EuroEval package with all extras. You can also install the
+ minimal version by leaving out the `[all]`, in which case the package will let you
+ know when an evaluation requires a certain extra dependency, and how to install it.
+
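+ The available extras are `all`, `generative`, `human-evaluation` and `test`, as
+ listed in the package requirements above. For instance, to install only the
+ dependencies needed for evaluating generative models locally, something like the
+ following should work:
+ ```
+ $ pip install euroeval[generative]
+ ```
+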
+ ## Quickstart
+ ### Benchmarking from the Command Line
+ The easiest way to benchmark pretrained models is via the command-line interface.
+ After installing the package, you can benchmark your favorite model like so:
+ ```
+ $ euroeval --model <model-id>
+ ```
+
+ Here `<model-id>` is the Hugging Face model ID, which can be found on the [Hugging
+ Face Hub](https://huggingface.co/models). By default this will benchmark the model
+ on all available tasks. If you want to benchmark on a particular task, use the
+ `--task` argument:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification
+ ```
+
+ We can also narrow down which languages to benchmark on by setting the `--language`
+ argument. Here we benchmark the model on the Danish sentiment classification task:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da
+ ```
+
+ Multiple models, datasets and/or languages can be specified by simply repeating the
+ corresponding arguments. Here is an example with two models:
+ ```
+ $ euroeval --model <model-id1> --model <model-id2>
+ ```
+
+ The specific model version/revision to use can also be specified by appending it to
+ the model ID after an '@':
+ ```
+ $ euroeval --model <model-id>@<commit>
+ ```
+
+ This can be a branch name, a tag name, or a commit ID. It defaults to 'main', i.e.
+ the latest revision.
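+
+ For instance, to pin the evaluation to a specific tag (the tag name here is purely
+ illustrative):
+ ```
+ $ euroeval --model <model-id>@v1.0.0
+ ```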
+
+ See all the arguments and options available for the `euroeval` command by typing
+ ```
+ $ euroeval --help
+ ```
+
+ ### Benchmarking from a Script
+ In a script, the syntax is similar to the command-line interface. You simply
+ initialise an object of the `Benchmarker` class and call it with your favorite
+ model:
+ ```
+ >>> from euroeval import Benchmarker
+ >>> benchmark = Benchmarker()
+ >>> benchmark(model="<model>")
+ ```
+
+ To benchmark on a specific task and/or language, you simply specify the `task` or
+ `language` arguments, shown here with the same example as above:
+ ```
+ >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ ```
+
+ If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
+ simply leave out the `model` argument. In this example, we benchmark all Danish
+ models on the Danish sentiment classification task:
+ ```
+ >>> benchmark(task="sentiment-classification", language="da")
+ ```
+
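+ The same calls can also be collected into a standalone script (the model ID remains
+ a placeholder):
+ ```
+ from euroeval import Benchmarker
+
+ benchmark = Benchmarker()
+
+ # Benchmark a single model on Danish sentiment classification
+ benchmark(model="<model>", task="sentiment-classification", language="da")
+ ```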
+
+ ### Benchmarking from Docker
+ A Dockerfile is provided in the repo, which can be downloaded and run without
+ needing to clone the repo and install from source. It can be fetched by running the
+ following:
+ ```
+ $ wget https://raw.githubusercontent.com/EuroEval/EuroEval/main/Dockerfile.cuda
+ ```
+
+ Next, to build the Docker image, first ensure that the NVIDIA Container Toolkit is
+ [installed](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation)
+ and
+ [configured](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker),
+ and that the CUDA version stated at the top of the Dockerfile matches the CUDA
+ version installed on your machine (which you can check using `nvidia-smi`). After
+ that, build the image as follows:
+ ```
+ $ docker build --pull -t euroeval -f Dockerfile.cuda .
+ ```
+
+ With the Docker image built, we can now evaluate any model as follows:
+ ```
+ $ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
+ ```
+
+ Here `<euroeval-arguments>` consists of the arguments you would otherwise pass to
+ the `euroeval` CLI, for instance `--model <model-id> --task
+ sentiment-classification`.
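+
+ Putting that together, a concrete run (the model ID is still a placeholder) would
+ look like:
+ ```
+ $ docker run -e args="--model <model-id> --task sentiment-classification" --gpus 1 --name euroeval --rm euroeval
+ ```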
+
+
+ ## Special Thanks :pray:
+ - Thanks to [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the
+   larger models on the leaderboards.
+ - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of
+   their [Researcher Access Program](https://openai.com/form/researcher-access-program/).
+ - Thanks to [UWV](https://www.uwv.nl/) and [KU
+   Leuven](https://www.arts.kuleuven.be/ling/ccl) for sponsoring the Azure OpenAI
+   credits used to evaluate GPT-4-turbo in Dutch.
+ - Thanks to [Miðeind](https://mideind.is/english.html) for sponsoring the OpenAI
+   credits used to evaluate GPT-4-turbo in Icelandic and Faroese.
+ - Thanks to [CHC](https://chc.au.dk/) for sponsoring the OpenAI credits used to
+   evaluate GPT-4-turbo in German.
+
+
+ ## Citing EuroEval
+ If you want to cite the framework, feel free to use this:
+
+ ```
+ @article{nielsen2024encoder,
+   title={Encoder vs Decoder: Comparative Analysis of Encoder and Decoder Language Models on Multilingual NLU Tasks},
+   author={Nielsen, Dan Saattrup and Enevoldsen, Kenneth and Schneider-Kamp, Peter},
+   journal={arXiv preprint arXiv:2406.13469},
+   year={2024}
+ }
+ @inproceedings{nielsen2023scandeval,
+   author = {Nielsen, Dan Saattrup},
+   booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
+   month = may,
+   pages = {185--201},
+   title = {{ScandEval: A Benchmark for Scandinavian Natural Language Processing}},
+   year = {2023}
+ }
+ ```
euroeval-15.2.0.dist-info/RECORD
@@ -0,0 +1,40 @@
+ euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
+ euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
+ euroeval/benchmarker.py,sha256=EjORG5haUio9LgfGH7ruWEFutvJN0QGasoknFH_yGHs,46705
+ euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
+ euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
+ euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
+ euroeval/data_loading.py,sha256=IHd1H4OCAtOyiro7YnJsGbbT7PTwiMUB02gh1g6Nlhg,3116
+ euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
+ euroeval/dataset_configs.py,sha256=2t0S6MqLjVLH1T7qQCpkPkAAev2KBZVAlqWVJ-K53ls,75351
+ euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
+ euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
+ euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
+ euroeval/generation.py,sha256=UZ9nmKl4rbNBhW41iwpgw_tqfsEfe1UhOnjGudz9GWs,10382
+ euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
+ euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
+ euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
+ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
+ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,2719
+ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
+ euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
+ euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
+ euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
+ euroeval/utils.py,sha256=lbiLcVPVPkvp7lLHUJqhAb6X0y8S_sqSrzXAqmfzFe0,18707
+ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
+ euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
+ euroeval/benchmark_modules/fresh.py,sha256=3R2k3Vp7J4YY8Nw5osbDIyayPtLLa2mItJGJFyyYNkY,9599
+ euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
+ euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
+ euroeval/benchmark_modules/vllm.py,sha256=enLKALixXvz2qvfblGEfRwU7wb-X-7HkOdjcYpdA3xM,43341
+ euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
+ euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
+ euroeval/task_utils/question_answering.py,sha256=NYl3g7r84e9uaEObj_-fTFKof-WkkCQ_H_VSJ3UDS1M,27112
+ euroeval/task_utils/sequence_classification.py,sha256=JyGLIfMvF98emmnsfckomdzJWluVj1EeAzSLZmJFpOk,8203
+ euroeval/task_utils/text_to_text.py,sha256=-9iz5nR9Ib-9xOolDQM0-QJ7k4iSjDP3togE1wgxsDw,5374
+ euroeval/task_utils/token_classification.py,sha256=7BSBTBL7GBYOJQlK4se3h6C6HdjMec1gGgquJNXYlaI,17738
+ euroeval-15.2.0.dist-info/METADATA,sha256=C3bNw5fBxAFG_aOLRg6tqXsL-cb4uRoq0qsTBmRmf50,10196
+ euroeval-15.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.2.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.2.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.2.0.dist-info/RECORD,,
euroeval-15.2.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
euroeval-15.2.0.dist-info/entry_points.txt
@@ -0,0 +1,4 @@
+ [console_scripts]
+ euroeval = euroeval.cli:benchmark
+ human_evaluate = euroeval.human_evaluation:main
+ scandeval = euroeval.cli:benchmark
euroeval-15.2.0.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022-2024 Dan Saattrup Nielsen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.