PyPI - eval-framework - Versions diffs - 0.2.10__tar.gz → 0.2.12__tar.gz - Mend

eval-framework 0.2.10.tar.gz → 0.2.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170)

{eval_framework-0.2.10 → eval_framework-0.2.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.2.10
+Version: 0.2.12
 Summary: Evalulation Framework
 Author: Aleph Alpha Research
 License:                                  Apache License
@@ -214,7 +214,7 @@ Classifier: Typing :: Typed
 Requires-Dist: pyyaml>=6.0.1,<7
 Requires-Dist: xmltodict>=0.13.0,<0.16
 Requires-Dist: pydantic>=2.7,<3
-Requires-Dist: datasets>=2.19.1,<4
+Requires-Dist: datasets>=4.0.0,<5
 Requires-Dist: sacrebleu>=2.4.3,<3
 Requires-Dist: pycountry>=24.6.1,<25
 Requires-Dist: nltk>=3.9.1,<4
@@ -238,15 +238,15 @@ Requires-Dist: numpy>=1.26.4
 Requires-Dist: antlr4-python3-runtime==4.11.0
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
-Requires-Dist: aleph-alpha-client>=10,<11 ; extra == 'api'
+Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
 Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
 Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
 Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
 Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
 Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
 Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
-Requires-Dist: openai>=1.62,<2.8 ; extra == 'openai'
-Requires-Dist: tiktoken>=0.9,<0.10 ; extra == 'openai'
+Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
+Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'

{eval_framework-0.2.10 → eval_framework-0.2.12}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "eval-framework"
-version = "0.2.10"
+version = "0.2.12"
 description = "Evalulation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -21,7 +21,7 @@ dependencies = [
   "pyyaml>=6.0.1,<7",
   "xmltodict>=0.13.0,<0.16",
   "pydantic>=2.7,<3",
-  "datasets>=2.19.1,<4", # dataset v4 has breaking changes we'd need to adapt to
+  "datasets>=4.0.0,<5",
   "sacrebleu>=2.4.3,<3",
   "pycountry>=24.6.1,<25",
   "nltk>=3.9.1,<4",
@@ -53,10 +53,10 @@ determined = [
   "determined>=0.38,<0.39",
   "tensorboard==2.19.0"
 ]
-api = ["aleph-alpha-client>=10,<11"]
+api = ["aleph-alpha-client>=11.5.1"]
 openai = [
-  "openai>=1.62,<2.8",
-  "tiktoken>=0.9,<0.10",
+  "openai>=1.62,<3",
+  "tiktoken>=0.9,<1",
   "transformers>=4.45.2,<5",
 ]
 transformers = [

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/aleph_alpha.py RENAMED Viewed

@@ -3,16 +3,12 @@ import json
 import logging
 import math
 import os
-import random
 import re
-import time
 import traceback
 from collections.abc import Callable, Sequence
-import aiohttp
 from aleph_alpha_client import (
     AsyncClient,
-    BusyError,
     Client,
     CompletionRequest,
     CompletionResponse,
@@ -49,11 +45,11 @@ class AlephAlphaAPIModel(BaseLLM):
         formatter: BaseFormatter | None = None,
         checkpoint_name: str | None = None,
         temperature: float | None = None,
+        top_p: float | None = None,
         # Please see README.md for tips if adapting the following parameters.
         max_retries: int = 100,
         max_async_concurrent_requests: int = 32,
         request_timeout_seconds: int = 30 * 60 + 5,
-        queue_full_timeout_seconds: int = 30 * 60 + 5,
         bytes_per_token: float | None = None,
         token: str = os.getenv("AA_TOKEN", "dummy"),
         base_url: str = os.getenv("AA_INFERENCE_ENDPOINT", "dummy_endpoint"),
@@ -67,10 +63,10 @@ class AlephAlphaAPIModel(BaseLLM):
             self._formatter = formatter
         self._llm_name = checkpoint_name or self.LLM_NAME
         self._temperature = temperature if temperature is not None else 0.0
+        self._top_p = top_p if top_p is not None else 0.0
         self.max_async_concurrent_requests = max_async_concurrent_requests
         self.max_retries = max_retries
         self.request_timeout_seconds = request_timeout_seconds
-        self.queue_full_timeout_seconds = queue_full_timeout_seconds
         self.token = token
         self.base_url = base_url
         self._validate_model_availability(base_url, token)
@@ -101,56 +97,6 @@ class AlephAlphaAPIModel(BaseLLM):
         except Exception as e:
             raise RuntimeError(f"Model '{self._llm_name}' is not available: {e}")
-    async def _request_with_backoff(
-        self, client: AsyncClient, request: CompletionRequest, id: int
-    ) -> CompletionResponse:
-        """
-        Query Aleph-Alpha API with complete. Retry with back-off until it responds.
-        """
-        num_attempts = 0
-        start_time: float | None = None
-        while True:
-            try:
-                return await client.complete(request, model=self._llm_name)
-            except (TimeoutError, BusyError, RuntimeError, aiohttp.ClientError) as e:
-                status_code: str = safe_json_loads(e.args[1]).get("code", "") if len(e.args) >= 2 else ""
-                str_e = str(e)
-                if status_code == "QUEUE_FULL":
-                    # Worker not available or missed a heartbeat (inference longer than scheduler's
-                    # API_MODEL_AVAILABLE_TIMEOUT_DURATION_MILLIS) or the scheduler is overloaded.
-                    if start_time is None:
-                        start_time = time.time()
-                    elapsed = time.time() - start_time
-                    if elapsed <= self.queue_full_timeout_seconds:
-                        logger.info(
-                            f"Request {id}: {status_code or str_e[:256]} - retrying: attempt"
-                            f" {num_attempts}/{self.max_retries}, elapsed {elapsed:.1f} sec"
-                        )
-                        # don't count as retry (request returns immediately, so just wait a bit not to DoS the server)
-                        await asyncio.sleep(random.randint(5, 30))
-                        continue
-                elif (
-                    status_code == "TIMEOUT_TASK"
-                    or isinstance(e, TimeoutError)
-                    or "502 Bad Gateway" in str_e
-                    or "504 Gateway Time-out" in str_e
-                    or isinstance(e, aiohttp.ClientError)
-                ):
-                    # client timeout, either because task too long in a queue or inference too long
-                    # (scheduler's API_CLIENT_TIMEOUT_DURATION_MILLIS). Retrying for the "inference too long"
-                    # case makes no sense but we unfortunately don't know which case has happened.
-                    num_attempts += 1
-                    start_time = None
-                    if num_attempts < self.max_retries:
-                        logger.info(f"Request {id}: TIMEOUT_TASK - retrying: attempt {num_attempts}/{self.max_retries}")
-                        await asyncio.sleep(random.randint(5, 30))
-                        continue
-                raise e
     def _error_from_exception(self, e: Exception) -> Error:
         """Convert an exception to an Error object."""
         if len(e.args) >= 2:
@@ -171,39 +117,36 @@ class AlephAlphaAPIModel(BaseLLM):
     async def _process_request_with_client(
         self,
         client: AsyncClient,
-        semaphore: asyncio.Semaphore,
         request: CompletionRequest,
         id: int,
     ) -> tuple[CompletionRequest, CompletionResponse | Error]:
         """Process a single request, returning the request and either a response or error."""
-        async with semaphore:
-            try:
-                response = await self._request_with_backoff(client=client, request=request, id=id)
-                logger.info(f"Request {id}: Success")
-                return (request, response)
-            except Exception as e:
-                if raise_errors():
-                    raise e
-                logger.info(f"Request {id}: Failure: {str(e)[:256]}")
-                return (request, self._error_from_exception(e))
+        try:
+            response = await client.complete(request, model=self._llm_name)
+            logger.info(f"Request {id}: Success")
+            return (request, response)
+        except Exception as e:
+            if raise_errors():
+                raise e
+            logger.info(f"Request {id}: Failure: {str(e)[:256]}")
+            return (request, self._error_from_exception(e))
     async def _process_requests(
         self,
         requests: list[CompletionRequest],
     ) -> list[tuple[CompletionRequest, CompletionResponse | Error]]:
         """Process multiple requests concurrently, returning request/response pairs."""
-        semaphore = asyncio.Semaphore(self.max_async_concurrent_requests)
         async with AsyncClient(
             host=self.base_url,
             nice=True,
             request_timeout_seconds=self.request_timeout_seconds,
             token=self.token,
-            total_retries=0,  # we have a custom retry policy in _request_with_backoff()
+            total_retries=self.max_retries,
+            limit=self.max_async_concurrent_requests,
         ) as client:
             tasks = (
                 self._process_request_with_client(
                     client,
-                    semaphore,
                     request,
                     i,
                 )
@@ -272,6 +215,7 @@ class AlephAlphaAPIModel(BaseLLM):
                     maximum_tokens=scaled_max_tokens,
                     stop_sequences=stop_sequences,
                     temperature=effective_temperature,
+                    top_p=self._top_p,
                 )
             )

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/base.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from collections.abc import Sequence
+from collections.abc import Generator, Sequence
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any
@@ -112,21 +113,22 @@ class BaseLLM(ABC):
         """
         pass
+    @contextmanager
     def _get_final_checkpoint(
         self, checkpoint_path: str | Path | None = None, model_name: str | None = None, artifact_name: str | None = None
-    ) -> tuple[str | Path | None, str | None]:
+    ) -> Generator[tuple[str | Path | None, str | None], None, None]:
         if (num_provided := sum(x is not None for x in [checkpoint_path, model_name, artifact_name])) == 0:
             if not getattr(self, "LLM_NAME", ""):
                 raise ValueError("Either LLM_NAME, checkpoint_path, model_name, or artifact_name must be provided.")
-            return None, None  # no argument given, so will use the LLM_NAME of the class
+            yield None, None  # no argument given, so will use the LLM_NAME of the class
         elif num_provided > 1:
             raise ValueError("At most one of `checkpoint_path`, `model_name`, or `artifact_name` must be provided.")
         elif checkpoint_path is not None:
-            return checkpoint_path, str(checkpoint_path)
+            yield checkpoint_path, str(checkpoint_path)
         elif model_name is not None:
-            return model_name, model_name
+            yield model_name, model_name
         else:
             from eval_framework.utils.file_ops import WandbFs
@@ -139,7 +141,7 @@ class BaseLLM(ABC):
                 file_root = wandb_fs.find_hf_checkpoint_root_from_path_list()
                 if file_root is None:
                     raise ValueError(f"Could not find HuggingFace checkpoint in artifact {artifact_base}:{version}")
-                return file_root, artifact_name
+                yield file_root, artifact_name
     def _get_final_formatter(
         self,

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/huggingface.py RENAMED Viewed

@@ -322,22 +322,21 @@ class HFLLM(BaseHFLLM):
         bytes_per_token: float | None = None,
         **kwargs: Any,
     ) -> None:
-        final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
+        with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
+            self.checkpoint_name = checkpoint_name
+            if self.checkpoint_name is None and possible_name is not None:
+                self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_")  # sanitize pathname
-        self.checkpoint_name = checkpoint_name
-        if self.checkpoint_name is None and possible_name is not None:
-            self.checkpoint_name = possible_name.replace("/", "_").replace(":", "_").strip("_")  # sanitize pathname
+            if final_path:
+                self.LLM_NAME = str(final_path)
-        if final_path:
-            self.LLM_NAME = str(final_path)
+            final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
-        final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
-        super().__init__(
-            formatter=final_formatter,
-            bytes_per_token=bytes_per_token,
-            **kwargs,
-        )
+            super().__init__(
+                formatter=final_formatter,
+                bytes_per_token=bytes_per_token,
+                **kwargs,
+            )
     @property
     def name(self) -> str:

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/llm/vllm.py RENAMED Viewed

@@ -137,10 +137,12 @@ class BaseVLLMModel(BaseLLM):
         device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
         self.batch_size = batch_size
-        self._tokenizer: None | VLLMTokenizerAPI = None
         self.model = LLM(**model_args, device=device)
+        self._tokenizer: None | VLLMTokenizerAPI = None
+        _ = self.tokenizer  # make sure tokenizer is initialized
         self.sampling_params: SamplingParams = self._process_sampling_params(sampling_params)
         logger.info(
@@ -481,28 +483,27 @@ class VLLMModel(BaseVLLMModel):
         sampling_params: SamplingParams | dict[str, Any] | None = None,
         **kwargs: Any,
     ) -> None:
-        final_path, possible_name = self._get_final_checkpoint(checkpoint_path, model_name, artifact_name)
-        if final_path:
-            self.LLM_NAME = str(final_path)
-        final_name = checkpoint_name
-        if final_name is None and possible_name is not None:
-            final_name = possible_name.replace("/", "_").replace(":", "_").strip("_")  # sanitize pathname
-        final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
-        super().__init__(
-            formatter=final_formatter,
-            checkpoint_path=final_path,
-            checkpoint_name=final_name,
-            max_model_len=max_model_len,
-            tensor_parallel_size=tensor_parallel_size,
-            gpu_memory_utilization=gpu_memory_utilization,
-            batch_size=batch_size,
-            sampling_params=sampling_params,
-            **kwargs,
-        )
+        with self._get_final_checkpoint(checkpoint_path, model_name, artifact_name) as (final_path, possible_name):
+            if final_path:
+                self.LLM_NAME = str(final_path)
+            final_name = checkpoint_name
+            if final_name is None and possible_name is not None:
+                final_name = possible_name.replace("/", "_").replace(":", "_").strip("_")  # sanitize pathname
+            final_formatter = self._get_final_formatter(formatter, formatter_name, formatter_kwargs)
+            super().__init__(
+                formatter=final_formatter,
+                checkpoint_path=final_path,
+                checkpoint_name=final_name,
+                max_model_len=max_model_len,
+                tensor_parallel_size=tensor_parallel_size,
+                gpu_memory_utilization=gpu_memory_utilization,
+                batch_size=batch_size,
+                sampling_params=sampling_params,
+                **kwargs,
+            )
 class VLLMRegistryModel(VLLMModel):  # deprecated

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/base.py RENAMED Viewed

@@ -171,7 +171,6 @@ class BaseTask[SubjectType](ABC):
             return load_dataset(
                 **kwargs,
                 revision=self.HF_REVISION,
-                trust_remote_code=True,
                 cache_dir=cache_dir,
                 download_config=download_config,
             )
@@ -179,7 +178,6 @@ class BaseTask[SubjectType](ABC):
             return load_dataset(
                 **kwargs,
                 revision=self.HF_REVISION,
-                trust_remote_code=True,
                 cache_dir=f"{Path.home()}/.cache/eval-framework",
             )

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/duc.py RENAMED Viewed

@@ -12,11 +12,12 @@ class DUC(BaseTask[str], ABC):
     """https://huggingface.co/datasets/midas/duc2001"""
     DATASET_PATH: str = "midas/duc2001"
-    SAMPLE_SPLIT: str = "test"
-    FEWSHOT_SPLIT: str = "test"
+    HF_REVISION: str = "77d6dedcbce421695a12f24c8802e8847a129d92"
+    SAMPLE_SPLIT: str = "train"
+    FEWSHOT_SPLIT: str = "train"
     RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
     METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
-    SUBJECTS: list[str] = ["raw"]
+    SUBJECTS: list[str] = ["default"]
     PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
     LANGUAGE = Language.ENG
@@ -33,6 +34,10 @@ class DUC(BaseTask[str], ABC):
         completion_text = completion_text.strip()
         return completion_text
+    def _load_dataset(self, subject: str) -> None:
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
+        self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
     def _get_instruction_text(self, item: dict[str, Any]) -> str:
         instruction_text = " ".join(item["document"])
         instruction_text = re.sub(r"\s+([.,!?;:])", r"\1", instruction_text)
@@ -47,7 +52,7 @@ class DUC(BaseTask[str], ABC):
 class DUC_EXTRACTIVE(DUC):
     NAME = "DUC Extractive"
-    SUBJECTS: list[str] = ["raw"]
+    SUBJECTS: list[str] = ["default"]
     def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
         return item["extractive_keyphrases"]
@@ -61,14 +66,13 @@ class DUC_EXTRACTIVE(DUC):
 class DUC_ABSTRACTIVE(DUC):
     NAME = "DUC Abstractive"
-    SUBJECTS: list[str] = ["raw"]
+    SUBJECTS: list[str] = ["default"]
     def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
         return item["abstractive_keyphrases"]
     def _load_dataset(self, subject: str) -> None:
-        # not all samples have abstractive keyphrases
-        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=subject, data_files="raw/test/0000.parquet")
         self.dataset = {}
         for split, data in hf_dataset.items():

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/flores200.py RENAMED Viewed

@@ -25,6 +25,7 @@ class Flores200(BaseTask[str]):
     NAME = "FLoRes-200"
     DATASET_PATH = "facebook/flores"
+    HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
     SAMPLE_SPLIT = "devtest"
     FEWSHOT_SPLIT = "dev"
     RESPONSE_TYPE = ResponseType.COMPLETION
@@ -66,7 +67,6 @@ class Flores200(BaseTask[str]):
                 split=kwargs.get("split"),
                 data_files=None,  # Let it auto-discover parquet files
                 revision=self.HF_REVISION,
-                trust_remote_code=False,  # Disable the loading script!
                 cache_dir=cache_dir,
                 download_config=download_config,
             )
@@ -79,7 +79,6 @@ class Flores200(BaseTask[str]):
             dataset = load_dataset(
                 **kwargs,
                 revision=self.HF_REVISION,
-                trust_remote_code=True,
                 cache_dir=cache_dir,
                 download_config=download_config,
             )

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/gsm8k.py RENAMED Viewed

@@ -92,6 +92,7 @@ class GSM8KEvalHarness(BaseTask[str]):
     NAME = "GSM8KEvalHarness"
     DATASET_PATH = "openai/gsm8k"
+    HF_REVISION = "main"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     RESPONSE_TYPE = ResponseType.COMPLETION

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/infinitebench.py RENAMED Viewed

@@ -39,13 +39,10 @@ class InfiniteBench(BaseTask[str], ABC):
             }
         )
         try:
-            return load_dataset(
-                **kwargs, trust_remote_code=True, cache_dir=cache_dir, download_config=download_config, features=ft
-            )
+            return load_dataset(**kwargs, cache_dir=cache_dir, download_config=download_config, features=ft)
         except Exception:
             return load_dataset(
                 **kwargs,
-                trust_remote_code=True,
                 cache_dir=f"{Path.home()}/.cache/eval-framework",
                 features=ft,
             )

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/mmlu_de.py RENAMED Viewed

@@ -73,6 +73,7 @@ class MMLU_DE(BaseTask[str]):
     NAME = "MMLU_DE"
     DATASET_PATH = "LeoLM/MMLU_de"
+    HF_REVISION = "11433b408001dd26444c7e666cc536e0b8907ca5"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "validation"
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/opengptx_eu20.py RENAMED Viewed

@@ -27,6 +27,7 @@ class ARC_EU20_DE(ARC):
     NAME = "ARC_EU20_DE"
     DATASET_PATH = "openGPT-X/arcx"
+    HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     SUBJECTS = ["challenge_DE", "easy_DE"]
@@ -36,6 +37,7 @@ class ARC_EU20_DE(ARC):
 class ARC_EU20_FR(ARC):
     NAME = "ARC_EU20_FR"
     DATASET_PATH = "openGPT-X/arcx"
+    HF_REVISION = "e4c31fa077b82832cc21e614832701603a8ad319"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
     SUBJECTS = ["challenge_FR", "easy_FR"]
@@ -51,6 +53,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
     """  # noqa: E501
     NAME = "GSM8K_EU20_DE"
+    HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
     DATASET_PATH = "openGPT-X/gsm8kx"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
@@ -60,6 +63,7 @@ class GSM8K_EU20_DE(GSM8KEvalHarness):
 class GSM8K_EU20_FR(GSM8KEvalHarness):
     NAME = "GSM8K_EU20_FR"
+    HF_REVISION = "3ed0f81d31a9013e05d16644aabcc36db50078a9"
     DATASET_PATH = "openGPT-X/gsm8kx"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "train"
@@ -77,6 +81,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
     NAME = "HellaSwag_EU20_DE"
     DATASET_PATH = "openGPT-X/hellaswagx"
+    HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
     SAMPLE_SPLIT = "train"
     FEWSHOT_SPLIT = "validation"
     SUBJECTS = ["DE"]
@@ -86,6 +91,7 @@ class HELLASWAG_EU20_DE(HELLASWAG):
 class HELLASWAG_EU20_FR(HELLASWAG):
     NAME = "HellaSwag_EU20_FR"
     DATASET_PATH = "openGPT-X/hellaswagx"
+    HF_REVISION = "7c30407f4f11fa4fada74bd4384ed0fe572ae8f2"
     SAMPLE_SPLIT = "train"
     FEWSHOT_SPLIT = "validation"
     SUBJECTS = ["FR"]
@@ -128,6 +134,7 @@ class TRUTHFULQA_EU20_DE(TRUTHFULQA):
     NAME = "TruthfulQA_EU20_DE"
     DATASET_PATH = "openGPT-X/truthfulqax"
+    HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
     LANGUAGE = Language.DEU
     def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
@@ -182,6 +189,7 @@ TRUTHFULQA_EU20_FR_FEWSHOT_ITEMS = [
 class TRUTHFULQA_EU20_FR(TRUTHFULQA):
     NAME = "TruthfulQA_EU20_FR"
     DATASET_PATH = "openGPT-X/truthfulqax"
+    HF_REVISION = "cff042da87dfb8885c357cb1c83194fa6aaf1d49"
     LANGUAGE = Language.FRA
     def _load_dataset(self, subject: SubjectType) -> None:
@@ -214,6 +222,7 @@ class MMLU_EU20_DE(MMLU):
     NAME = "MMLU_EU20_DE"
     DATASET_PATH = "openGPT-X/mmlux"
+    HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "dev"  # one could merge dev and validation to have a larger pool of fewshot examples
     SUBJECTS = [i + "_DE" for i in MMLU_SUBJECTS]
@@ -321,6 +330,7 @@ MMLU_SUBJECTS_TRANSLATION_FR = {
 class MMLU_EU20_FR(MMLU):
     NAME = "MMLU_EU20_FR"
     DATASET_PATH = "openGPT-X/mmlux"
+    HF_REVISION = "6412d5d5d03a7b31d02f4ba34b787c2e7939a800"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "dev"
     SUBJECTS = [i + "_FR" for i in MMLU_SUBJECTS]

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/piqa.py RENAMED Viewed

@@ -15,6 +15,7 @@ class PIQA(BaseTask[str]):
     NAME = "PIQA"
     DATASET_PATH = "ybisk/piqa"
+    HF_REVISION = "6b3aceb3276e5ab7e51895d73151a718690af38c"
     SAMPLE_SPLIT = "validation"  # 1838 examples (same split as lm-eval)
     FEWSHOT_SPLIT = "test"  # 3084 examples
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/squad.py RENAMED Viewed

@@ -75,7 +75,6 @@ class SQUAD2(BaseTask[str]):
         return load_dataset(
             **kwargs,
             revision=self.HF_REVISION,
-            trust_remote_code=True,
             cache_dir=cache_dir,
             download_config=download_config,
         )

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/winox.py RENAMED Viewed

@@ -1,5 +1,11 @@
+import os
+from pathlib import Path
 from typing import Any
+from datasets import DownloadConfig, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import RevisionNotFoundError
 from eval_framework.tasks.base import Language
 from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
@@ -19,6 +25,7 @@ class WINOX(WINOGRANDE):
     """
     DATASET_PATH = "demelin/wino_x"
+    HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
     SAMPLE_SPLIT = "test"
     FEWSHOT_SPLIT = "test"
     LANGUAGE_SHORT_CODE = ""
@@ -42,6 +49,31 @@ class WINOX(WINOGRANDE):
         ]
         return choices
+    def _load_hf_dataset(self, **kwargs: Any) -> Any:
+        """Override to load WinoX from auto-discovered parquet files after checking that HF_REVISION exists."""
+        # Check if the HF_REVISION is valid before loading the dataset
+        if self.HF_REVISION:
+            try:
+                _ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
+            except Exception as e:
+                if isinstance(e, RevisionNotFoundError):
+                    raise e
+        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
+        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
+        dataset = load_dataset(
+            kwargs.get("path", self.DATASET_PATH),
+            name=kwargs.get("name"),
+            split=kwargs.get("split"),
+            data_files=None,  # Let it auto-discover parquet files
+            revision=self.HF_REVISION,
+            cache_dir=cache_dir,
+            download_config=download_config,
+        )
+        return dataset
 class WINOX_DE(WINOX):
     NAME = "WINOX_DE"

{eval_framework-0.2.10 → eval_framework-0.2.12}/src/eval_framework/tasks/benchmarks/zero_scrolls.py RENAMED Viewed

@@ -16,6 +16,7 @@ class ZERO_SCROLLS_QUALITY(BaseTask[str]):
     NAME = "ZeroSCROLLS QuALITY"
     DATASET_PATH = "tau/zero_scrolls"
+    HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
     SAMPLE_SPLIT = "validation"
     FEWSHOT_SPLIT = "validation"
     RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
@@ -48,6 +49,7 @@ class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
     """ZeroSCROLLS dataset: https://huggingface.co/datasets/tau/zero_scrolls"""
     DATASET_PATH = "tau/zero_scrolls"
+    HF_REVISION = "3ee203cfad81b1a4fab8f2351c12679fbe95b179"
     SAMPLE_SPLIT = "validation"
     FEWSHOT_SPLIT = "validation"
     RESPONSE_TYPE = ResponseType.COMPLETION

eval-framework 0.2.10__tar.gz → 0.2.12__tar.gz

eval-framework 0.2.10.tar.gz → 0.2.12.tar.gz