euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of EuroEval has been flagged as potentially problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval-15.2.0.dist-info/METADATA
@@ -0,0 +1,234 @@
+ Metadata-Version: 2.4
+ Name: EuroEval
+ Version: 15.2.0
+ Summary: The robust European language model benchmark.
+ Project-URL: Repository, https://github.com/EuroEval/EuroEval
+ Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
+ Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
+ Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>
+ License: MIT License
+
+ Copyright (c) 2022-2024 Dan Saattrup Nielsen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ License-File: LICENSE
+ Requires-Python: <4.0,>=3.10
+ Requires-Dist: accelerate>=0.34.2
+ Requires-Dist: bert-score>=0.3.13
+ Requires-Dist: click>=8.1.3
+ Requires-Dist: datasets>=2.15.0
+ Requires-Dist: evaluate>=0.4.1
+ Requires-Dist: huggingface-hub>=0.24.0
+ Requires-Dist: levenshtein>=0.24.0
+ Requires-Dist: litellm>=1.61.13
+ Requires-Dist: more-itertools>=10.5.0
+ Requires-Dist: numpy<2.0.0,>=1.23.0
+ Requires-Dist: pandas>=2.2.0
+ Requires-Dist: protobuf~=3.20.0
+ Requires-Dist: pydantic>=2.6.0
+ Requires-Dist: pyinfer>=0.0.3
+ Requires-Dist: python-dotenv>=1.0.1
+ Requires-Dist: rouge-score>=0.1.2
+ Requires-Dist: sacremoses>=0.1.1
+ Requires-Dist: scikit-learn<1.6.0
+ Requires-Dist: sentencepiece>=0.1.96
+ Requires-Dist: seqeval>=1.2.2
+ Requires-Dist: tenacity>=9.0.0
+ Requires-Dist: termcolor>=2.0.0
+ Requires-Dist: torch>=2.3.0
+ Requires-Dist: transformers>=4.47.0
+ Provides-Extra: all
+ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: demjson3>=3.0.6; extra == 'all'
+ Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: gradio>=4.26.0; extra == 'all'
+ Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'all'
+ Provides-Extra: generative
+ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: demjson3>=3.0.6; extra == 'generative'
+ Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'generative'
+ Provides-Extra: human-evaluation
+ Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
+ Provides-Extra: test
+ Requires-Dist: gradio>=4.26.0; extra == 'test'
+ Description-Content-Type: text/markdown
+
+ <div align='center'>
+ <img src="https://raw.githubusercontent.com/EuroEval/EuroEval/main/gfx/euroeval.png" height="500" width="372">
+ </div>
+
+ ### The robust European language model benchmark.
+
+ ______________________________________________________________________
+ [![Documentation](https://img.shields.io/badge/docs-passing-green)](https://euroeval.com)
+ [![PyPI Status](https://badge.fury.io/py/euroeval.svg)](https://pypi.org/project/euroeval/)
+ [![First paper](https://img.shields.io/badge/arXiv-2304.00906-b31b1b.svg)](https://arxiv.org/abs/2304.00906)
+ [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
+ [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
+ [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
+ [![Code Coverage](https://img.shields.io/badge/Coverage-65%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+ [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
+
+
+ ## Maintainers
+
+ - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
+   dan.nielsen@alexandra.dk)
+ - Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
+   kenneth.enevoldsen@cas.au.dk)
+
+
+ ## Installation
+ To install the package, simply run the following command in your favorite terminal:
+ ```
+ $ pip install euroeval[all]
+ ```
+
+ This will install the EuroEval package with all extras. You can also install the
+ minimal version by leaving out the `[all]`, in which case the package will let you
+ know when an evaluation requires a certain extra dependency, and how to install it.
+
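+ The available extras are `all`, `generative`, `human-evaluation` and `test`, as
+ listed in the package requirements above. For instance, to install only the
+ dependencies needed for evaluating generative models locally, something like the
+ following should work:
+ ```
+ $ pip install euroeval[generative]
+ ```
+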
+ ## Quickstart
+ ### Benchmarking from the Command Line
+ The easiest way to benchmark pretrained models is via the command-line interface.
+ After installing the package, you can benchmark your favorite model like so:
+ ```
+ $ euroeval --model <model-id>
+ ```
+
+ Here `<model-id>` is the Hugging Face model ID, which can be found on the [Hugging
+ Face Hub](https://huggingface.co/models). By default this will benchmark the model
+ on all available tasks. If you want to benchmark on a particular task, use the
+ `--task` argument:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification
+ ```
+
+ We can also narrow down which languages to benchmark on by setting the `--language`
+ argument. Here we benchmark the model on the Danish sentiment classification task:
+ ```
+ $ euroeval --model <model-id> --task sentiment-classification --language da
+ ```
+
+ Multiple models, datasets and/or languages can be specified by simply repeating the
+ corresponding arguments. Here is an example with two models:
+ ```
+ $ euroeval --model <model-id1> --model <model-id2>
+ ```
+
+ The specific model version/revision to use can also be specified by appending it to
+ the model ID after an '@':
+ ```
+ $ euroeval --model <model-id>@<commit>
+ ```
+
+ This can be a branch name, a tag name, or a commit ID. It defaults to 'main', i.e.
+ the latest revision.
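+
+ For instance, to pin the evaluation to a specific tag (the tag name here is purely
+ illustrative):
+ ```
+ $ euroeval --model <model-id>@v1.0.0
+ ```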
+
+ See all the arguments and options available for the `euroeval` command by typing
+ ```
+ $ euroeval --help
+ ```
+
+ ### Benchmarking from a Script
+ In a script, the syntax is similar to the command-line interface. You simply
+ initialise an object of the `Benchmarker` class and call it with your favorite
+ model:
+ ```
+ >>> from euroeval import Benchmarker
+ >>> benchmark = Benchmarker()
+ >>> benchmark(model="<model>")
+ ```
+
+ To benchmark on a specific task and/or language, you simply specify the `task` or
+ `language` arguments, shown here with the same example as above:
+ ```
+ >>> benchmark(model="<model>", task="sentiment-classification", language="da")
+ ```
+
+ If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
+ simply leave out the `model` argument. In this example, we benchmark all Danish
+ models on the Danish sentiment classification task:
+ ```
+ >>> benchmark(task="sentiment-classification", language="da")
+ ```
+
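+ The same calls can also be collected into a standalone script (the model ID remains
+ a placeholder):
+ ```
+ from euroeval import Benchmarker
+
+ benchmark = Benchmarker()
+
+ # Benchmark a single model on Danish sentiment classification
+ benchmark(model="<model>", task="sentiment-classification", language="da")
+ ```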
+
+ ### Benchmarking from Docker
+ A Dockerfile is provided in the repo, which can be downloaded and run without
+ needing to clone the repo and install from source. It can be fetched by running the
+ following:
+ ```
+ $ wget https://raw.githubusercontent.com/EuroEval/EuroEval/main/Dockerfile.cuda
+ ```
+
+ Next, to build the Docker image, first ensure that the NVIDIA Container Toolkit is
+ [installed](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installation)
+ and
+ [configured](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#configuring-docker),
+ and that the CUDA version stated at the top of the Dockerfile matches the CUDA
+ version installed on your machine (which you can check using `nvidia-smi`). After
+ that, build the image as follows:
+ ```
+ $ docker build --pull -t euroeval -f Dockerfile.cuda .
+ ```
+
+ With the Docker image built, we can now evaluate any model as follows:
+ ```
+ $ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
+ ```
+
+ Here `<euroeval-arguments>` consists of the arguments you would otherwise pass to
+ the `euroeval` CLI, for instance `--model <model-id> --task
+ sentiment-classification`.
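+
+ Putting that together, a concrete run (the model ID is still a placeholder) would
+ look like:
+ ```
+ $ docker run -e args="--model <model-id> --task sentiment-classification" --gpus 1 --name euroeval --rm euroeval
+ ```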
+
+
+ ## Special Thanks :pray:
+ - Thanks to [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the
+   larger models on the leaderboards.
+ - Thanks to [OpenAI](https://openai.com/) for sponsoring OpenAI credits as part of
+   their [Researcher Access Program](https://openai.com/form/researcher-access-program/).
+ - Thanks to [UWV](https://www.uwv.nl/) and [KU
+   Leuven](https://www.arts.kuleuven.be/ling/ccl) for sponsoring the Azure OpenAI
+   credits used to evaluate GPT-4-turbo in Dutch.
+ - Thanks to [Miðeind](https://mideind.is/english.html) for sponsoring the OpenAI
+   credits used to evaluate GPT-4-turbo in Icelandic and Faroese.
+ - Thanks to [CHC](https://chc.au.dk/) for sponsoring the OpenAI credits used to
+   evaluate GPT-4-turbo in German.
+
+
+ ## Citing EuroEval
+ If you want to cite the framework, feel free to use this:
+
+ ```
+ @article{nielsen2024encoder,
+   title={Encoder vs Decoder: Comparative Analysis of Encoder and Decoder Language Models on Multilingual NLU Tasks},
+   author={Nielsen, Dan Saattrup and Enevoldsen, Kenneth and Schneider-Kamp, Peter},
+   journal={arXiv preprint arXiv:2406.13469},
+   year={2024}
+ }
+ @inproceedings{nielsen2023scandeval,
+   author = {Nielsen, Dan Saattrup},
+   booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
+   month = may,
+   pages = {185--201},
+   title = {{ScandEval: A Benchmark for Scandinavian Natural Language Processing}},
+   year = {2023}
+ }
+ ```
euroeval-15.2.0.dist-info/RECORD
@@ -0,0 +1,40 @@
+ euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
+ euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
+ euroeval/benchmarker.py,sha256=EjORG5haUio9LgfGH7ruWEFutvJN0QGasoknFH_yGHs,46705
+ euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
+ euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
+ euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
+ euroeval/data_loading.py,sha256=IHd1H4OCAtOyiro7YnJsGbbT7PTwiMUB02gh1g6Nlhg,3116
+ euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
+ euroeval/dataset_configs.py,sha256=2t0S6MqLjVLH1T7qQCpkPkAAev2KBZVAlqWVJ-K53ls,75351
+ euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
+ euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
+ euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
+ euroeval/generation.py,sha256=UZ9nmKl4rbNBhW41iwpgw_tqfsEfe1UhOnjGudz9GWs,10382
+ euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
+ euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
+ euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
+ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
+ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,2719
+ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
+ euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
+ euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
+ euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
+ euroeval/utils.py,sha256=lbiLcVPVPkvp7lLHUJqhAb6X0y8S_sqSrzXAqmfzFe0,18707
+ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
+ euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
+ euroeval/benchmark_modules/fresh.py,sha256=3R2k3Vp7J4YY8Nw5osbDIyayPtLLa2mItJGJFyyYNkY,9599
+ euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
+ euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
+ euroeval/benchmark_modules/vllm.py,sha256=enLKALixXvz2qvfblGEfRwU7wb-X-7HkOdjcYpdA3xM,43341
+ euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
+ euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
+ euroeval/task_utils/question_answering.py,sha256=NYl3g7r84e9uaEObj_-fTFKof-WkkCQ_H_VSJ3UDS1M,27112
+ euroeval/task_utils/sequence_classification.py,sha256=JyGLIfMvF98emmnsfckomdzJWluVj1EeAzSLZmJFpOk,8203
+ euroeval/task_utils/text_to_text.py,sha256=-9iz5nR9Ib-9xOolDQM0-QJ7k4iSjDP3togE1wgxsDw,5374
+ euroeval/task_utils/token_classification.py,sha256=7BSBTBL7GBYOJQlK4se3h6C6HdjMec1gGgquJNXYlaI,17738
+ euroeval-15.2.0.dist-info/METADATA,sha256=C3bNw5fBxAFG_aOLRg6tqXsL-cb4uRoq0qsTBmRmf50,10196
+ euroeval-15.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.2.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.2.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.2.0.dist-info/RECORD,,
euroeval-15.2.0.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
euroeval-15.2.0.dist-info/entry_points.txt
@@ -0,0 +1,4 @@
+ [console_scripts]
+ euroeval = euroeval.cli:benchmark
+ human_evaluate = euroeval.human_evaluation:main
+ scandeval = euroeval.cli:benchmark
euroeval-15.2.0.dist-info/licenses/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022-2024 Dan Saattrup Nielsen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.