ScandEval 16.10.0-py3-none-any.whl → 16.11.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/benchmark_modules/hf.py +14 -1
- scandeval/benchmark_modules/litellm.py +111 -22
- scandeval/benchmark_modules/vllm.py +116 -60
- scandeval/benchmarker.py +13 -6
- scandeval/data_models.py +2 -2
- scandeval/dataset_configs/dutch.py +8 -9
- scandeval/dataset_configs/norwegian.py +3 -3
- scandeval/logging_utils.py +1 -1
- scandeval/metrics/huggingface.py +3 -2
- scandeval/metrics/llm_as_a_judge.py +79 -15
- scandeval/model_loading.py +2 -1
- scandeval/task_group_utils/sequence_classification.py +12 -3
- scandeval/types.py +39 -0
- scandeval/utils.py +29 -4
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/METADATA +27 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/RECORD +19 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/licenses/LICENSE +1 -1
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/WHEEL +0 -0
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/entry_points.txt +0 -0
scandeval/dataset_configs/dutch.py
CHANGED
@@ -74,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[DUTCH],
 )
 
+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+)
+
 VALEU_NL_CONFIG = DatasetConfig(
     name="valeu-nl",
     pretty_name="VaLEU-nl",
@@ -161,12 +169,3 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     _labels=["a", "b"],
     unofficial=True,
 )
-
-DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
-    name="duidelijke-taal",
-    pretty_name="Duidelijke Taal",
-    source="EuroEval/duidelijke-taal",
-    task=SIMPL,
-    languages=[DUTCH],
-    unofficial=True,
-)

scandeval/dataset_configs/norwegian.py
CHANGED
@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
     pretty_name="ScaLA-nn",
     source="EuroEval/scala-nn",
     task=LA,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORNE_NB_CONFIG = DatasetConfig(
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
     pretty_name="NorNE-nn",
     source="EuroEval/norne-nn-mini",
     task=NER,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 
 NORQUAD_CONFIG = DatasetConfig(
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     pretty_name="MultiWikiQA-nn",
     source="EuroEval/multi-wiki-qa-nn-mini",
     task=RC,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
     unofficial=True,
 )
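In short: the Duidelijke Taal simplification dataset moves above WINOGRANDE and drops its `unofficial=True` flag, making it an official Dutch dataset, and the three Nynorsk datasets now also count towards Norwegian. A minimal sketch of running the promoted dataset (assuming the `Benchmarker` API from the README, and assuming a dataset `name` string is accepted like the CLI's `--dataset` flag):

```python
from euroeval import Benchmarker

# Evaluate a model on the newly promoted Dutch simplification dataset.
# "duidelijke-taal" is the `name` field of DUIDELIJKE_TAAL_NL_CONFIG above.
benchmarker = Benchmarker()
benchmarker.benchmark(model="<model-id-or-path>", dataset="duidelijke-taal")
```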
scandeval/logging_utils.py
CHANGED
@@ -87,7 +87,7 @@ def log(message: str, level: int, colour: str | None = None) -> None:
 
 
 @cache_arguments("message")
-def log_once(message: str, level: int
+def log_once(message: str, level: int, prefix: str = "") -> None:
     """Log a message once.
 
     This is ensured by caching the "message" argument and only logging it the first time
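The `log_once` signature gains an optional `prefix` parameter, and `level` appears to no longer carry a default (the `model_loading.py` call site below now passes `level=logging.INFO` explicitly). A hedged usage sketch (the messages and the prefix behaviour are assumptions, not from the source):

```python
import logging

from scandeval.logging_utils import log_once

# Logged at most once per unique message, since the "message" argument
# is cached by the @cache_arguments("message") decorator.
log_once("Falling back to eager attention.", level=logging.INFO)

# Hypothetical use of the new `prefix` parameter; presumably it is
# prepended to the message without becoming part of the cache key.
log_once("Falling back to eager attention.", level=logging.INFO, prefix="[vllm] ")
```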
scandeval/metrics/huggingface.py
CHANGED
@@ -1,6 +1,7 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
+import os
 import typing as t
 from pathlib import Path
 
@@ -130,7 +131,7 @@ class HuggingFaceMetric(Metric):
             "__call__ method."
         )
 
-        with no_terminal_output(disable=
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
             results = self.metric.compute(
                 predictions=predictions, references=references, **self.compute_kwargs
             )
@@ -196,7 +197,7 @@ class SourceBasedMetric(HuggingFaceMetric):
             f"instead."
         )
 
-        with no_terminal_output(disable=
+        with no_terminal_output(disable=os.getenv("FULL_LOG", "0") == "1"):
             results = self.metric.compute(
                 sources=sources,
                 predictions=predictions,
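Both call sites now tie the suppression of terminal output to a `FULL_LOG` environment variable, so the underlying `evaluate` metric's own logging can be surfaced for debugging. A minimal sketch (set before any metric is computed):

```python
import os

# Show the full terminal output of Hugging Face metric computations.
# Any value other than the string "1" keeps the suppression in place,
# per the `os.getenv("FULL_LOG", "0") == "1"` check above.
os.environ["FULL_LOG"] = "1"
```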
scandeval/metrics/llm_as_a_judge.py
CHANGED
@@ -5,7 +5,7 @@ import logging
 import typing as t
 from pathlib import Path
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ValidationError
 
 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log
@@ -17,6 +17,8 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
+    from ..types import BatchScoringFunction, ScoringFunction
+
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -29,7 +31,8 @@ class LLMAsAJudgeMetric(Metric):
         judge_kwargs: dict[str, t.Any],
         user_prompt: str,
         response_format: t.Type[BaseModel],
-        scoring_fn:
+        scoring_fn: ScoringFunction | None = None,
+        batch_scoring_fn: BatchScoringFunction | None = None,
         condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
         system_prompt: str | None = None,
     ) -> None:
@@ -57,6 +60,8 @@
                 response.
             scoring_fn:
                 A function that takes the judge's response and returns a score.
+            batch_scoring_fn:
+                A function that takes all judge responses and returns a score.
             condition_formatting_fn (optional):
                 A function to format the condition string before it is included in the
                 user prompt. Defaults to a no-op function that returns the input
@@ -70,7 +75,9 @@
         self.judge_kwargs = judge_kwargs
         self.user_prompt = user_prompt
         self.response_format = response_format
-        self.
+        self.batch_scoring_fn = self._get_batch_scoring_fn(
+            scoring_fn=scoring_fn, batch_scoring_fn=batch_scoring_fn
+        )
         self.condition_formatting_fn = condition_formatting_fn
         self.system_prompt = system_prompt
 
@@ -181,22 +188,36 @@
         json_dicts = [
             extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
         ]
-
-
-        if json_dict is
-
-
-
+        outputs_raw: list[BaseModel | None] = []
+        for json_dict in json_dicts:
+            if json_dict is None:
+                outputs_raw.append(None)
+                continue
+            try:
+                outputs_raw.append(self.response_format.model_validate(obj=json_dict))
+            except ValidationError:
+                outputs_raw.append(None)
+
+        num_none: int = sum(output is None for output in outputs_raw)
+        if num_none:
+            log(
+                f"Could not parse/validate {num_none:,} of {len(outputs_raw):,} judge "
+                f"outputs for metric {self.pretty_name!r}. These will be ignored.",
+                level=logging.DEBUG,
+            )
 
-
-
-
+        outputs: list[BaseModel] = [
+            output for output in outputs_raw if output is not None
+        ]
+        if not outputs:
             log(
-                f"No
+                f"No valid judge outputs were produced for metric "
+                f"{self.pretty_name!r}.",
                 level=logging.WARNING,
             )
             return None
-
+
+        return self.batch_scoring_fn(outputs=outputs, dataset=dataset)
 
     def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
         """Apply the user prompt to the prediction and condition.
@@ -227,6 +248,49 @@
         )
         return self.user_prompt.format(prediction=prediction)
 
+    def _get_batch_scoring_fn(
+        self,
+        scoring_fn: ScoringFunction | None,
+        batch_scoring_fn: BatchScoringFunction | None,
+    ) -> BatchScoringFunction:
+        """Get the batch scoring function.
+
+        Args:
+            scoring_fn:
+                The scoring function to use.
+            batch_scoring_fn:
+                The batch scoring function to use.
+
+        Returns:
+            The batch scoring function.
+
+        Raises:
+            InvalidBenchmark:
+                If both or neither of the scoring functions are provided.
+        """
+        if scoring_fn is not None and batch_scoring_fn is not None:
+            raise InvalidBenchmark(
+                "Both `scoring_fn` and `batch_scoring_fn` are provided. Please "
+                "provide only one of them."
+            )
+        if scoring_fn is not None:
+            scoring_fn_nonnull = scoring_fn
+
+            def batch_fn(
+                outputs: list[BaseModel], dataset: "Dataset | None" = None
+            ) -> float:
+                return sum(scoring_fn_nonnull(output) for output in outputs) / len(
+                    outputs
+                )
+
+            return batch_fn
+        if batch_scoring_fn is not None:
+            return batch_scoring_fn
+        raise InvalidBenchmark(
+            "Neither `scoring_fn` nor `batch_scoring_fn` are provided. Please "
+            "provide one of them."
+        )
+
 
 ### Fluency metric ###
 
@@ -257,5 +321,5 @@ fluency_metric = LLMAsAJudgeMetric(
     "Text: {prediction!r}\n\n"
     "Output your rating as a JSON object with a single key 'fluency'.",
     response_format=Fluency,
-    scoring_fn=lambda output: (output.fluency - 1) / 4.0
+    scoring_fn=lambda output: (output.fluency - 1) / 4.0,
 )
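Taken together with the `types.py` changes below, the constructor now accepts exactly one of `scoring_fn` (scored per output, then averaged) or `batch_scoring_fn` (scored over all outputs at once, with optional access to the evaluation dataset); passing both or neither raises `InvalidBenchmark`. A hedged sketch of a corpus-level judge scorer (the `Accepted` response format and `acceptance_rate` function are illustrative, not from the source):

```python
from pydantic import BaseModel


class Accepted(BaseModel):
    """Illustrative judge response format with a single boolean field."""

    accepted: bool


def acceptance_rate(outputs: list[BaseModel], dataset=None) -> float:
    """Corpus-level score: the fraction of judge outputs marked as accepted."""
    return sum(output.accepted for output in outputs) / len(outputs)


# Sanity check of the batch function on its own:
outputs = [Accepted(accepted=True), Accepted(accepted=True), Accepted(accepted=False)]
assert acceptance_rate(outputs) == 2 / 3
```

Existing metrics such as `fluency_metric` keep working unchanged, since they pass only `scoring_fn`, which gets wrapped into an averaging batch function.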
scandeval/model_loading.py
CHANGED
@@ -1,5 +1,6 @@
 """Functions related to the loading of models."""
 
+import logging
 import typing as t
 
 from .benchmark_modules import (
@@ -35,7 +36,7 @@ def load_model(
     Returns:
         The model.
     """
-    log_once(f"\nLoading the model {model_config.model_id}...")
+    log_once(f"\nLoading the model {model_config.model_id}...", level=logging.INFO)
 
     # The order matters; the first model type that matches will be used. For this
     # reason, they have been ordered in terms of the most common model types.
scandeval/task_group_utils/sequence_classification.py
CHANGED
@@ -180,6 +180,17 @@ def extract_labels_from_generation(
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)
 
+        # If the prediction starts with one of the candidate labels (case-insensitive)
+        # then use that one
+        prefix_candidate_labels = [
+            candidate_label
+            for candidate_label in sample_candidate_labels[idx]
+            if predicted_label.lower().startswith(candidate_label.lower())
+        ]
+        if prefix_candidate_labels:
+            new_predicted_labels.append(prefix_candidate_labels[0])
+            continue
+
         # We set the word edit distance weights such that we heavily penalise insertions
         # and substitutions, so that we don't just insert the correct label, but that we
         # want the model to have included the correct label in its output.
@@ -235,9 +246,7 @@
             f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
             "of the samples. This likely means that the model were completely "
             "off in these cases. Since this task does not allow invalid model "
-            "outputs, we have to abort the evaluation.
-            "evaluation with the `--debug` flag (or `debug=True` if you're using "
-            "the `Benchmarker` API) to see the precise model outputs."
+            "outputs, we have to abort the evaluation."
         )
 
     return new_predicted_labels
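The added prefix check short-circuits the word-edit-distance fallback: if a generation starts with a candidate label, case-insensitively, that label is used directly. An illustrative sketch of the rule in isolation (a hypothetical helper, not the module's API):

```python
def match_by_prefix(predicted: str, candidate_labels: list[str]) -> str | None:
    """Return the first candidate label the prediction starts with, if any."""
    matches = [
        label
        for label in candidate_labels
        if predicted.lower().startswith(label.lower())
    ]
    return matches[0] if matches else None


# "Positive, because the review praises the product" now maps straight to
# "positive" instead of going through the edit-distance fallback.
assert match_by_prefix(
    "Positive, because the review praises the product",
    ["positive", "negative"],
) == "positive"
```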
scandeval/types.py
CHANGED
@@ -13,9 +13,11 @@ except ImportError:
         MistralCommonBackend as MistralCommonTokenizer,
     )
 
+
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
+    from pydantic import BaseModel
 
     from .data_models import BenchmarkConfig, GenerativeModelOutput
 
@@ -73,6 +75,43 @@ class ExtractLabelsFunction(t.Protocol):
         ...
 
 
+class ScoringFunction(t.Protocol):
+    """A function used to compute a score from a single model output."""
+
+    def __call__(self, output: "BaseModel") -> float:
+        """Compute a score from a model output.
+
+        Args:
+            output:
+                A model output (Pydantic model) from the judge.
+
+        Returns:
+            A float score computed from the output.
+        """
+        ...
+
+
+class BatchScoringFunction(t.Protocol):
+    """A function used to compute batch scores from model outputs."""
+
+    def __call__(
+        self, outputs: list["BaseModel"], dataset: "Dataset | None" = None
+    ) -> float:
+        """Compute a batch score from model outputs.
+
+        Args:
+            outputs:
+                List of model outputs (Pydantic models) from the judge.
+            dataset:
+                Optional dataset used for evaluation. Can be used for additional
+                context when computing the score.
+
+        Returns:
+            A float score computed from the batch of outputs.
+        """
+        ...
+
+
 def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.
 
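Since these are `typing.Protocol` classes, any callable with a matching signature satisfies them structurally; no subclassing or registration is needed. A brief sketch (the `Fluency` model mirrors the one used by `fluency_metric` in `llm_as_a_judge.py`; the function names are illustrative):

```python
from pydantic import BaseModel


class Fluency(BaseModel):
    fluency: int


# Satisfies ScoringFunction: one judge output in, one float out. This mirrors
# the lambda passed as `scoring_fn` to `fluency_metric`.
def fluency_score(output: BaseModel) -> float:
    return (output.fluency - 1) / 4.0


# Satisfies BatchScoringFunction: all outputs (plus, optionally, the
# evaluation dataset) in, one corpus-level float out.
def mean_fluency(outputs: list[BaseModel], dataset=None) -> float:
    return sum(fluency_score(output) for output in outputs) / len(outputs)
```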
scandeval/utils.py
CHANGED
@@ -21,6 +21,7 @@ import huggingface_hub as hf_hub
 import numpy as np
 import torch
 from huggingface_hub.errors import LocalTokenNotFoundError
+from requests.exceptions import RequestException
 
 from .caching_utils import cache_arguments
 from .constants import T
@@ -44,10 +45,25 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     Returns:
         The path to the cache directory.
     """
-    #
-
-
-
+    # If the model ID is a path, we just use that as the cache dir
+    if Path(model_id).is_dir():
+        log_once(
+            f"Since the model {model_id!r} is a local model, we will use the model "
+            "directory directly as the model cache directory.",
+            level=logging.DEBUG,
+        )
+        return model_id
+
+    # Otherwise, we create a cache dir based on the model ID
+    model_cache_dir = Path(
+        cache_dir, "model_cache", model_id.replace("/", "--")
+    ).as_posix()
+    log_once(
+        f"Using the model cache directory {model_cache_dir!r} for the model "
+        f"{model_id!r}.",
+        level=logging.DEBUG,
+    )
+    return model_cache_dir
 
 
 def resolve_model_path(download_dir: str) -> str:
@@ -65,8 +81,10 @@ def resolve_model_path(download_dir: str) -> str:
         If the model path is not valid, or if required files are missing.
     """
     model_path = Path(download_dir)
+
     # Get the 'path safe' version of the model id, which is the last dir in the path
     model_id_path = model_path.name
+
     # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
     model_path = model_path / f"models--{model_id_path}" / "snapshots"
     if not model_path.exists():
@@ -423,6 +441,13 @@ def get_hf_token(api_key: str | None) -> str | bool:
             level=logging.DEBUG,
         )
         return False
+    except RequestException:
+        log_once(
+            "No Hugging Face API key was set and the connection to Hugging Face "
+            "failed, so no token will be used.",
+            level=logging.DEBUG,
+        )
+        return False
 
 
 def extract_multiple_choice_labels(
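The rewritten `create_model_cache_dir` has two branches: an existing local directory is returned unchanged as its own cache dir, while a Hub-style model ID is mapped into `<cache_dir>/model_cache/` with `/` replaced by `--`. A sketch of the expected results (the inputs and paths are illustrative):

```python
from scandeval.utils import create_model_cache_dir

# A Hub model ID is turned into a subdirectory of the cache dir, e.g.
# ".euroeval_cache/model_cache/mistralai--Mistral-7B-v0.1".
create_model_cache_dir(
    cache_dir=".euroeval_cache", model_id="mistralai/Mistral-7B-v0.1"
)

# A local model directory (one that actually exists on disk) is returned
# unchanged, so the files already present are used directly as the cache.
create_model_cache_dir(cache_dir=".euroeval_cache", model_id="/models/my-model")
```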
{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ScandEval
-Version: 16.10.0
+Version: 16.11.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -8,7 +8,7 @@ Author-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 Maintainer-email: Dan Saattrup Smart <dan.smart@alexandra.dk>
 License: MIT License
 
-Copyright (c) 2022-
+Copyright (c) 2022-2026 Dan Saattrup Smart
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -123,16 +123,17 @@ The easiest way to benchmark pretrained models is via the command line interface
 having installed the package, you can benchmark your favorite model like so:
 
 ```bash
-euroeval --model <model-id>
+euroeval --model <model-id-or-path>
 ```
 
-Here `model` is the HuggingFace model ID, which can be found on the [HuggingFace
-Hub](https://huggingface.co/models)
-the
-
+Here `model` is either the HuggingFace model ID, which can be found on the [HuggingFace
+Hub](https://huggingface.co/models), or a local path to a model directory (containing
+the model files as well as the `config.json` file). By default this will benchmark the
+model on all the tasks available. If you want to benchmark on a particular task, then
+use the `--task` argument:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification
+euroeval --model <model-id-or-path> --task sentiment-classification
 ```
@@ -140,20 +141,20 @@ by setting the `--language` argument. Here we thus benchmark the model on the Da
 sentiment classification task:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da
+euroeval --model <model-id-or-path> --task sentiment-classification --language da
 ```
 
 Multiple models, datasets and/or languages can be specified by just attaching multiple
 arguments. Here is an example with two models:
 
 ```bash
-euroeval --model <model-id-1> --model <model-id-2>
+euroeval --model <model-id-or-path-1> --model <model-id-or-path-2>
 ```
 
 The specific model version/revision to use can also be added after the suffix '@':
 
 ```bash
-euroeval --model <model-id>@<commit>
+euroeval --model <model-id-or-path>@<commit>
 ```
 
 This can be a branch name, a tag name, or a commit id. It defaults to 'main' for latest.
@@ -173,7 +174,7 @@ model:
 ```python
 >>> from euroeval import Benchmarker
 >>> benchmarker = Benchmarker()
->>> benchmarker.benchmark(model="<model-id>")
+>>> benchmarker.benchmark(model="<model-id-or-path>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
@@ -181,7 +182,7 @@ To benchmark on a specific task and/or language, you simply specify the `task` o
 
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ... )
@@ -225,7 +226,7 @@ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
 ```
 
 Here `<euroeval-arguments>` consists of the arguments added to the `euroeval` CLI
-argument. This could for instance be `--model <model-id> --task
+argument. This could for instance be `--model <model-id-or-path> --task
 sentiment-classification`.
 
 ## Benchmarking custom inference APIs
@@ -291,14 +292,14 @@ script. For example to download the model you want and all of the Danish sentime
 classification datasets:
 
 ```bash
-euroeval --model <model-id> --task sentiment-classification --language da --download-only
+euroeval --model <model-id-or-path> --task sentiment-classification --language da --download-only
 ```
 
 Or from a script:
 
 ```python
 >>> benchmarker.benchmark(
-...     model="<model-id>",
+...     model="<model-id-or-path>",
 ...     task="sentiment-classification",
 ...     language="da",
 ...     download_only=True,
@@ -346,7 +347,7 @@ MY_CONFIG = DatasetConfig(
 You can then benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-dataset --model <model-id>
+euroeval --dataset my-dataset --model <model-id-or-path>
 ```
 
 You can also run the benchmark from a Python script, by simply providing your custom
@@ -356,7 +357,7 @@ dataset configuration directly into the `benchmark` method:
 from euroeval import Benchmarker
 
 benchmarker = Benchmarker()
-benchmarker.benchmark(model="<model-id>", dataset=MY_CONFIG)
+benchmarker.benchmark(model="<model-id-or-path>", dataset=MY_CONFIG)
 ```
 
 We have included three convenience tasks to make it easier to set up custom datasets:
@@ -436,7 +437,7 @@ MY_SQL_DATASET = DatasetConfig(
 Again, with this you can benchmark your custom dataset by simply running
 
 ```bash
-euroeval --dataset my-sql-dataset --model <model-id>
+euroeval --dataset my-sql-dataset --model <model-id-or-path>
 ```
 
 ## Reproducing the evaluation datasets
@@ -592,6 +593,13 @@ A huge thank you to all the contributors who have helped make this project a suc
       alt="Contributor avatar for tvosch"
     />
   </a>
+  <a href="https://github.com/Touzen">
+    <img
+      src="https://avatars.githubusercontent.com/u/1416265"
+      width=50
+      alt="Contributor avatar for Touzen"
+    />
+  </a>
 
 ### Contribute to EuroEval
 
{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/RECORD
CHANGED
@@ -1,34 +1,34 @@
 scandeval/__init__.py,sha256=w4oYw-lbj5ZZ4pv-bHrgZNJ6dlu-WcAWg2e--_UMmeE,4244
 scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh3vB41zV4,8745
-scandeval/benchmarker.py,sha256=
+scandeval/benchmarker.py,sha256=Enf3IGYPl2q8j4ViXi5M8_ZaftpCAemTi0Z9HGMv7wc,53841
 scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
 scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
 scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
 scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
 scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
-scandeval/data_models.py,sha256=
+scandeval/data_models.py,sha256=btAafgRktlRhcOXDIFNp4y0RiR2n5-C_rRmgZCyxmCE,30562
 scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
 scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
 scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
 scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
 scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
-scandeval/logging_utils.py,sha256=
+scandeval/logging_utils.py,sha256=Qnni11ngHrjCf_fgkk6lp6gs-tGSgUS3d5zRR83y6ec,9507
 scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
 scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
-scandeval/model_loading.py,sha256=
+scandeval/model_loading.py,sha256=DsX7et18Epcv8kHATZgwPJnwH17GHmh3JCzrSoI3GAE,2377
 scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
 scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
 scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
 scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
-scandeval/types.py,sha256
-scandeval/utils.py,sha256=
+scandeval/types.py,sha256=CHQjLzqKYDXPCyZas7rKg6wD1pNiYuaOFMWimrj5H64,4374
+scandeval/utils.py,sha256=E3HQ-8cecJh6NMHF7Ji2YBx6x4tiVKeESglkBeQ0CKg,19167
 scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
 scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
-scandeval/benchmark_modules/hf.py,sha256=
-scandeval/benchmark_modules/litellm.py,sha256=
-scandeval/benchmark_modules/vllm.py,sha256=
+scandeval/benchmark_modules/hf.py,sha256=bfaPCCBWtRB36TAfJU82WhK_KtdWSuFbSVE81JU1uEY,47900
+scandeval/benchmark_modules/litellm.py,sha256=LPYwCkqpMOMiJzBHQ6mepa94tQZ2POWIpgciVszbOyE,75061
+scandeval/benchmark_modules/vllm.py,sha256=DbGM-_ExTKAhETibb5GOlvG0MguG0JZZHD3cXYP65LM,59754
 scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
 scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
 scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
@@ -37,7 +37,7 @@ scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZO
 scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
 scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
 scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
-scandeval/dataset_configs/dutch.py,sha256=
+scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
 scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
 scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
 scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
@@ -50,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
 scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
 scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
 scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
-scandeval/dataset_configs/norwegian.py,sha256=
+scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
 scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
 scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
 scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
@@ -62,8 +62,8 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
 scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
 scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
 scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
-scandeval/metrics/huggingface.py,sha256=
-scandeval/metrics/llm_as_a_judge.py,sha256=
+scandeval/metrics/huggingface.py,sha256=W4ktwFSYq0Dy6thSmCRpxztvXDDYZtCWC0xKD6_Tcik,9521
+scandeval/metrics/llm_as_a_judge.py,sha256=UUFk3aL2BZqJ-u9-dzexsoArTxPJTMmHRqb1eWxexaI,12133
 scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
 scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
 scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
@@ -79,11 +79,11 @@ scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-
 scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
 scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tfQVS6rKN8_ifNwis-auw,29064
-scandeval/task_group_utils/sequence_classification.py,sha256=
+scandeval/task_group_utils/sequence_classification.py,sha256=1YAaKn5bY8j9ONPfJZODjaGKVMkA9fQcl51fvBcjeF8,16829
 scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
 scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
-scandeval-16.
-scandeval-16.
-scandeval-16.
-scandeval-16.
-scandeval-16.
+scandeval-16.11.0.dist-info/METADATA,sha256=Tf9a-KP53zFhJMuSHkskNm66jNyVzFFb-STy69ur3FQ,23838
+scandeval-16.11.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+scandeval-16.11.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+scandeval-16.11.0.dist-info/licenses/LICENSE,sha256=vb2c84xITVnhnVFsBS8AWXl-4S-KpxN6VMxTqqYlV3s,1080
+scandeval-16.11.0.dist-info/RECORD,,

{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/WHEEL
File without changes

{scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/entry_points.txt
File without changes