PyPI - eval-framework - Versions diffs - 0.3.7__tar.gz → 0.3.8__tar.gz - Mend

eval-framework 0.3.7__tar.gz → 0.3.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194) hide show

{eval_framework-0.3.7 → eval_framework-0.3.8}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.3
 Name: eval-framework
-Version: 0.3.7
-Summary: Evalulation Framework
+Version: 0.3.8
+Summary: Evaluation Framework
 Author: Aleph Alpha Research
 License:                                  Apache License
                                     Version 2.0, January 2004
@@ -211,49 +211,49 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Software Development :: Libraries
 Classifier: Typing :: Typed
-Requires-Dist: pyyaml>=6.0.1,<7
-Requires-Dist: xmltodict>=0.13.0,<0.16
-Requires-Dist: pydantic>=2.7,<3
-Requires-Dist: datasets>=4.0.0,<5
-Requires-Dist: sacrebleu>=2.4.3,<3
+Requires-Dist: pyyaml>=6.0.3,<7
+Requires-Dist: xmltodict>=0.15.1,<0.16
+Requires-Dist: pydantic>=2.13.4,<3
+Requires-Dist: datasets>=4.8.5,<5
+Requires-Dist: sacrebleu>=2.6.0,<3
 Requires-Dist: pycountry>=24.6.1,<25
-Requires-Dist: nltk>=3.9.1,<4
-Requires-Dist: python-dotenv>=1.0.1,<2
-Requires-Dist: lingua-language-detector>=2.0.2,<3
-Requires-Dist: google-crc32c>=1.5.0,<2
+Requires-Dist: nltk>=3.9.4,<4
+Requires-Dist: python-dotenv>=1.2.2,<2
+Requires-Dist: lingua-language-detector>=2.2.0,<3
+Requires-Dist: google-crc32c>=1.8.0,<2
 Requires-Dist: kubernetes>=31.0.0,<32
 Requires-Dist: langdetect>=1.0.9,<2
-Requires-Dist: spacy>=3.8.3,<4
-Requires-Dist: jsonschema>=4.23.0,<5
-Requires-Dist: mysql-connector-python>=9.0.0,<10
-Requires-Dist: psycopg2-binary>=2.9.9,<3
+Requires-Dist: spacy>=3.8.14,<4
+Requires-Dist: jsonschema>=4.26.0,<5
+Requires-Dist: mysql-connector-python>=9.7.0,<10
+Requires-Dist: psycopg2-binary>=2.9.12,<3
 Requires-Dist: sympy>=1.13.1,<2
-Requires-Dist: llm-sandbox[docker]==0.3.37
+Requires-Dist: llm-sandbox[docker]==0.3.39
 Requires-Dist: jsonlines>=4,<5
-Requires-Dist: lxml>=6,<7
-Requires-Dist: python-iso639>=2025.2.18
-Requires-Dist: wandb>=0.23.0,<1
-Requires-Dist: boto3>=1.40.54,<2
+Requires-Dist: lxml>=6.1.1,<7
+Requires-Dist: python-iso639>=2026.4.20
+Requires-Dist: wandb>=0.27.0,<1
+Requires-Dist: boto3>=1.43.18,<2
 Requires-Dist: numpy>=1.26.4
 Requires-Dist: antlr4-python3-runtime==4.11.0
-Requires-Dist: scipy>=1.14.0,<2
+Requires-Dist: scipy>=1.17.1,<2
 Requires-Dist: accelerate ; extra == 'accelerate'
 Requires-Dist: eval-framework[determined,api,openai,transformers,accelerate,vllm,comet,optional,mistral] ; extra == 'all'
 Requires-Dist: aleph-alpha-client>=11.5.1 ; extra == 'api'
-Requires-Dist: unbabel-comet>=2.2.6,<3 ; extra == 'comet'
-Requires-Dist: determined>=0.38,<0.39 ; extra == 'determined'
-Requires-Dist: tensorboard==2.19.0 ; extra == 'determined'
-Requires-Dist: mistral-common>=1.7,<2 ; extra == 'mistral'
-Requires-Dist: huggingface-hub>=0.33.2,<0.34 ; extra == 'mistral'
+Requires-Dist: unbabel-comet>=2.2.7,<3 ; extra == 'comet'
+Requires-Dist: determined>=0.38.1,<0.39 ; extra == 'determined'
+Requires-Dist: tensorboard==2.20.0 ; extra == 'determined'
+Requires-Dist: mistral-common>=1.11.2,<2 ; extra == 'mistral'
+Requires-Dist: huggingface-hub>=0.36.2,<0.37 ; extra == 'mistral'
 Requires-Dist: eval-framework[vllm] ; extra == 'mistral'
 Requires-Dist: openai>=1.62,<3 ; extra == 'openai'
-Requires-Dist: tiktoken>=0.9,<1 ; extra == 'openai'
+Requires-Dist: tiktoken>=0.13.0,<1 ; extra == 'openai'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'openai'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'optional'
 Requires-Dist: jinja2>=3.1.6,<4 ; extra == 'optional'
 Requires-Dist: transformers>=4.45.2,<5 ; extra == 'transformers'
 Requires-Dist: torch>=2.5,<3 ; extra == 'transformers'
-Requires-Dist: accelerate>=0.30.0,<1 ; extra == 'transformers'
+Requires-Dist: accelerate>=0.34.2,<1 ; extra == 'transformers'
 Requires-Dist: vllm>=0.8.5,<0.9 ; extra == 'vllm'
 Requires-Dist: torch>=2.5,<3 ; extra == 'vllm'
 Requires-Python: >=3.12, <3.13

{eval_framework-0.3.7 → eval_framework-0.3.8}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "eval-framework"
-version = "0.3.7"
-description = "Evalulation Framework"
+version = "0.3.8"
+description = "Evaluation Framework"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.12,<3.13"
@@ -18,53 +18,53 @@ classifiers = [
   "Typing :: Typed",
 ]
 dependencies = [
-  "pyyaml>=6.0.1,<7",
-  "xmltodict>=0.13.0,<0.16",
-  "pydantic>=2.7,<3",
-  "datasets>=4.0.0,<5",
-  "sacrebleu>=2.4.3,<3",
+  "pyyaml>=6.0.3,<7",
+  "xmltodict>=0.15.1,<0.16",
+  "pydantic>=2.13.4,<3",
+  "datasets>=4.8.5,<5",
+  "sacrebleu>=2.6.0,<3",
   "pycountry>=24.6.1,<25",
-  "nltk>=3.9.1,<4",
-  "python-dotenv>=1.0.1,<2",
-  "lingua-language-detector>=2.0.2,<3",
-  "google-crc32c>=1.5.0,<2",
+  "nltk>=3.9.4,<4",
+  "python-dotenv>=1.2.2,<2",
+  "lingua-language-detector>=2.2.0,<3",
+  "google-crc32c>=1.8.0,<2",
   "kubernetes>=31.0.0,<32", # required by llm-sandbox though actually not needed
   "langdetect>=1.0.9,<2", # required by the original ifeval implementation
-  "spacy>=3.8.3,<4",
-  "jsonschema>=4.23.0,<5",
-  "mysql-connector-python>=9.0.0,<10", # required for sql-related tasks
-  "psycopg2-binary>=2.9.9,<3", # required for sql-related tasks
+  "spacy>=3.8.14,<4",
+  "jsonschema>=4.26.0,<5",
+  "mysql-connector-python>=9.7.0,<10", # required for sql-related tasks
+  "psycopg2-binary>=2.9.12,<3", # required for sql-related tasks
   "sympy>=1.13.1,<2",
-  "llm-sandbox[docker]==0.3.37",
+  "llm-sandbox[docker]==0.3.39",
   "jsonlines>=4,<5",
-  "lxml>=6,<7",
-  "python-iso639>=2025.2.18",
-  "wandb>=0.23.0,<1",
-  "boto3>=1.40.54,<2",
+  "lxml>=6.1.1,<7",
+  "python-iso639>=2026.4.20",
+  "wandb>=0.27.0,<1",
+  "boto3>=1.43.18,<2",
   "numpy>=1.26.4",
   # is a dependency of sympy, but not explicitly listed in the requirements.txt
   # https://github.com/sympy/sympy/blob/0204fa34e8f6f6f8ccb4de01209be9a2345c9d6e/doc/src/contributing/dependencies.md?plain=1#L125
   "antlr4-python3-runtime==4.11.0",
-  "scipy>=1.14.0,<2",  # required for the aggregation of pass@k metrics
+  "scipy>=1.17.1,<2",  # required for the aggregation of pass@k metrics
 ]
 [project.optional-dependencies]
 # Model-specific extras
 determined = [
-  "determined>=0.38,<0.39",
-  "tensorboard==2.19.0"
+  "determined>=0.38.1,<0.39",
+  "tensorboard==2.20.0"
 ]
 api = ["aleph-alpha-client>=11.5.1"]
 openai = [
   "openai>=1.62,<3",
-  "tiktoken>=0.9,<1",
+  "tiktoken>=0.13.0,<1",
   "transformers>=4.45.2,<5",
 ]
 transformers = [
   "transformers>=4.45.2,<5",
   "torch>=2.5,<3",
-  "accelerate>=0.30.0,<1",
+  "accelerate>=0.34.2,<1",
 ]
 accelerate = ["accelerate"]
 vllm = [
@@ -72,13 +72,13 @@ vllm = [
   "torch>=2.5,<3"
 ]
 mistral = [
-  "mistral-common>=1.7,<2",
-  "huggingface-hub>=0.33.2,<0.34",
+  "mistral-common>=1.11.2,<2",
+  "huggingface-hub>=0.36.2,<0.37",
   "eval_framework[vllm]",
 ]
 # Benchmark/metric specific extras
 comet = [
-  "unbabel-comet>=2.2.6,<3",
+  "unbabel-comet>=2.2.7,<3",
 ]
 # from template-formatting
 optional = [
@@ -97,24 +97,25 @@ eval_framework = "eval_framework.run:run"
 [dependency-groups]
 dev = [
-  "mypy>=1.10,<2",
-  "pytest>=8.3.3,<9",
-  "pytest-mock>=3.14.1",
-  "pytest-xdist>=3.6.1,<4",
+  "mypy>=2.1.0,<3",
+  "pytest>=9.0.3,<10",
+  "pytest-mock>=3.15.1",
+  "pytest-xdist>=3.8.0,<4",
   "pytest-sugar>1.1,<2",
-  "types-pyyaml>=6.0.12.20240917,<7",
-  "types-python-dateutil>=2.9.0.20241206,<3",
-  "types-requests>=2.32.0.20250328,<3",
+  "types-pyyaml>=6.0.12.20260518,<7",
+  "types-python-dateutil>=2.9.0.20260518,<3",
+  "types-requests>=2.33.0.20260518,<3",
   "plotly>=5.24.1,<6",
-  "ruff>=0.12.8",
+  "ruff>=0.15.15",
+  "pip-licenses>=5.5.5",
 ]
 flash-attn = [
-  "flash-attn>=2.7.2.post1,<2.8",
+  "flash-attn>=2.8.3,<2.9",
   "torch"
 ]
 [build-system]
-requires = ["uv_build>=0.9.0,<0.10.0"]
+requires = ["uv_build>=0.11.17,<0.11.18"]
 build-backend = "uv_build"
 [tool.uv.build-backend]

{eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/evaluation_generator.py RENAMED Viewed

@@ -80,6 +80,7 @@ class EvaluationGenerator:
                 )
             else:
                 metric = metric_class()
+            metric.fail_on_error = self.config.fail_on_error
             logger.info(f"Starting calculation of {metric.NAME}")
             safe_tqdm_write(f"INFO: Calculating {metric.NAME}")

{eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/llm/openai.py RENAMED Viewed

@@ -91,8 +91,7 @@ class OpenAIModel(BaseLLM):
             base_url=base_url,
         )
-        # Initialize tokenizer for the model
-        self._encoder = self._get_encoder()
+        self._encoder: tiktoken.Encoding | Tokenizer | None = self._get_encoder_or_none()
         # set bytes_per_token_scalar for non-standard models
         if bytes_per_token is not None and bytes_per_token <= 0:
@@ -101,9 +100,23 @@ class OpenAIModel(BaseLLM):
             4.0 / bytes_per_token if bytes_per_token is not None else 4.0 / self.BYTES_PER_TOKEN
         )
-    def _get_encoder(self) -> tiktoken.Encoding:
+    def _get_encoder_or_none(self) -> tiktoken.Encoding | None:
         assert self._model_name is not None
-        return tiktoken.encoding_for_model(self._model_name)
+        try:
+            return tiktoken.encoding_for_model(self._model_name)
+        except KeyError:
+            logger.info(
+                "tiktoken could not map model_name=%r. Disabling token counting for this model.",
+                self._model_name,
+            )
+            return None
+        except Exception as e:
+            logger.warning(
+                "Failed to initialize tiktoken encoder for model_name=%r (%s). Disabling token counting.",
+                self._model_name,
+                e.__class__.__name__,
+            )
+            return None
     def _count_tokens(self, text: str) -> int:
         """
@@ -115,6 +128,8 @@ class OpenAIModel(BaseLLM):
         Returns:
             Number of tokens.
         """
+        if self._encoder is None:
+            raise RuntimeError("Token counting is not available (no encoder configured).")
         return len(self._encoder.encode(text))
     def generate_from_messages(
@@ -166,14 +181,31 @@ class OpenAIModel(BaseLLM):
                     stop=stop_sequences,
                 )
                 completion = response.choices[0].text
+                usage = getattr(response, "usage", None)
+                prompt_tokens = getattr(usage, "prompt_tokens", None) if usage is not None else None
+                completion_tokens = getattr(usage, "completion_tokens", None) if usage is not None else None
                 return RawCompletion(
                     prompt=prompt,
-                    prompt_sequence_positions=self._count_tokens(prompt),
-                    concat_compression=ConcatCompression.calculate(
-                        single_messages, count_tokens=self._count_tokens, completion=completion
+                    prompt_sequence_positions=(
+                        prompt_tokens
+                        if prompt_tokens is not None
+                        else (self._count_tokens(prompt) if self._encoder is not None else None)
+                    ),
+                    concat_compression=(
+                        ConcatCompression.calculate(
+                            single_messages,
+                            count_tokens=self._count_tokens,
+                            completion=completion,
+                        )
+                        if self._encoder is not None
+                        else None
                     ),
                     completion=completion,
-                    completion_sequence_positions=self._count_tokens(completion),
+                    completion_sequence_positions=(
+                        completion_tokens
+                        if completion_tokens is not None
+                        else (self._count_tokens(completion) if self._encoder is not None else None)
+                    ),
                 )
             else:
@@ -190,15 +222,26 @@ class OpenAIModel(BaseLLM):
                 )
                 prompt = "\n".join([f"{m.get('role', '')}: {m.get('content', '')}" for m in chat_messages])
                 prompt_tokens = getattr(chat_response.usage, "prompt_tokens", None)
+                completion_tokens = getattr(chat_response.usage, "completion_tokens", None)
                 completion = chat_response.choices[0].message.content or ""
                 return RawCompletion(
                     prompt=prompt,
                     prompt_sequence_positions=prompt_tokens,
-                    concat_compression=ConcatCompression.calculate(
-                        single_messages, count_tokens=self._count_tokens, completion=completion
+                    concat_compression=(
+                        ConcatCompression.calculate(
+                            single_messages,
+                            count_tokens=self._count_tokens,
+                            completion=completion,
+                        )
+                        if self._encoder is not None
+                        else None
                     ),
                     completion=completion,
-                    completion_sequence_positions=self._count_tokens(completion),
+                    completion_sequence_positions=(
+                        completion_tokens
+                        if completion_tokens is not None
+                        else (self._count_tokens(completion) if self._encoder is not None else None)
+                    ),
                 )
         with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -218,6 +261,10 @@ class OpenAIModel(BaseLLM):
         Note:
             Uses the OpenAI completions API with echo=True; chat logprobs are not supported.
         """
+        if self._encoder is None:
+            raise NotImplementedError(
+                "OpenAIModel.logprobs() requires a local tokenizer/encoder, but none is available."
+            )
         assert self._model_name in ["babbage-002", "davinci-002"], (
             "Log-probs for prompt tokens are only supported for a limited set of models."
         )
@@ -383,12 +430,16 @@ class DeepseekModel(OpenAIModel):
             base_url="https://api.deepseek.com/beta",
         )
         self._tokenizer_name = tokenizer_name if tokenizer_name is not None else "deepseek-ai/DeepSeek-V3.2-Exp"
+        # DeepSeek uses HF tokenization; override the base encoder (which may be None).
+        self._encoder = self._get_encoder()
     def _get_encoder(self) -> Tokenizer:
         return AutoTokenizer.from_pretrained(self._tokenizer_name)
     def _count_tokens(self, text: str) -> int:
-        return len(self._encoder.encode(text))
+        encoder = self._encoder
+        assert encoder is not None
+        return len(encoder.encode(text))  # type: ignore[union-attr]
 ### Model Aliases ###

eval_framework-0.3.8/src/eval_framework/llm/vllm_local_server.py ADDED Viewed

@@ -0,0 +1,217 @@
+from __future__ import annotations
+import atexit
+import logging
+import os
+import signal
+import socket
+import subprocess
+import time
+import urllib.error
+import urllib.request
+from collections.abc import Sequence
+from eval_framework.llm.base import BaseLLM
+from eval_framework.llm.openai import OpenAIModel
+from eval_framework.shared.types import RawCompletion, RawLoglikelihood
+from eval_framework.tasks.base import Sample
+from template_formatting.formatter import BaseFormatter, Message
+logger = logging.getLogger(__name__)
+def _pick_free_port(host: str) -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind((host, 0))
+        return int(s.getsockname()[1])
+def _wait_for_http_ready(url: str, *, timeout_s: float) -> None:
+    deadline = time.time() + timeout_s
+    last_err: Exception | None = None
+    while time.time() < deadline:
+        try:
+            with urllib.request.urlopen(url, timeout=2) as resp:
+                if 200 <= resp.status < 500:
+                    return
+        except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
+            last_err = e
+            time.sleep(0.25)
+    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
+def _wait_for_http_ready_or_proc_exit(url: str, *, timeout_s: float, proc: subprocess.Popen[str]) -> None:
+    """
+    Like `_wait_for_http_ready`, but fail fast if the server process exits.
+    This avoids long timeouts that hide the real root cause (e.g. invalid CLI flags,
+    missing dependencies, CUDA issues).
+    """
+    deadline = time.time() + timeout_s
+    last_err: Exception | None = None
+    while time.time() < deadline:
+        if proc.poll() is not None:
+            out = ""
+            try:
+                if proc.stdout is not None:
+                    out = proc.stdout.read() or ""
+            except Exception:
+                out = ""
+            tail = out.strip()
+            if len(tail) > 8000:
+                tail = tail[-8000:]
+            raise RuntimeError(
+                f"vLLM server process exited before becoming ready. exit_code={proc.returncode}. Output (tail):\n{tail}"
+            )
+        try:
+            with urllib.request.urlopen(url, timeout=2) as resp:
+                if 200 <= resp.status < 500:
+                    return
+        except (urllib.error.URLError, TimeoutError, ConnectionError, OSError) as e:
+            last_err = e
+            time.sleep(0.25)
+    raise RuntimeError(f"Timed out waiting for server readiness at {url}. Last error: {last_err}")
+class VLLMLocalServerModel(BaseLLM):
+    """
+    Provider-style model: start a local vLLM OpenAI-compatible server, then talk to it via `OpenAIModel(base_url=...)`.
+    This gives you a stable HTTP boundary (good for VCR cassettes) while keeping "local vLLM" as a selectable backend.
+    Notes:
+    - The server is started in a subprocess using `vllm serve`.
+    - Cleanup is best-effort (SIGTERM then SIGKILL).
+    - Not all OpenAI API features are guaranteed to be supported by the local server (e.g. logprobs).
+    """
+    def __init__(
+        self,
+        *,
+        model_name: str,
+        host: str = "127.0.0.1",
+        port: int | None = None,
+        startup_timeout_s: float = 120.0,
+        # `OpenAIModel` parameters:
+        formatter: BaseFormatter | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+        api_key: str | None = None,
+        bytes_per_token: float | None = None,
+        # vLLM "serve" parameters (subset, passed through):
+        tensor_parallel_size: int | None = None,
+        dtype: str | None = None,
+        max_model_len: int | None = None,
+        gpu_memory_utilization: float | None = None,
+        enforce_eager: bool | None = None,
+        # Escape hatch:
+        vllm_command: str | None = None,
+        vllm_extra_args: list[str] | None = None,
+        env: dict[str, str] | None = None,
+    ) -> None:
+        self._model_name = model_name
+        self._host = host
+        self._port = port if port is not None else _pick_free_port(host)
+        self._startup_timeout_s = float(startup_timeout_s)
+        self._proc: subprocess.Popen[str] | None = None
+        self._server_url = f"http://{self._host}:{self._port}/v1"
+        cmd = [vllm_command or "vllm", "serve", self._model_name, "--host", self._host, "--port", str(self._port)]
+        # A small, intentionally conservative subset of flags.
+        if tensor_parallel_size is not None:
+            cmd += ["--tensor-parallel-size", str(tensor_parallel_size)]
+        if dtype is not None:
+            cmd += ["--dtype", str(dtype)]
+        if max_model_len is not None:
+            cmd += ["--max-model-len", str(max_model_len)]
+        if gpu_memory_utilization is not None:
+            cmd += ["--gpu-memory-utilization", str(gpu_memory_utilization)]
+        if enforce_eager is not None:
+            # vLLM exposes this as a boolean flag; passing a value breaks CLI parsing.
+            if enforce_eager:
+                cmd += ["--enforce-eager"]
+        if vllm_extra_args:
+            cmd += list(vllm_extra_args)
+        merged_env = os.environ.copy()
+        if env:
+            merged_env.update(env)
+        logger.info("Starting local vLLM server: %s", " ".join(cmd))
+        self._proc = subprocess.Popen(
+            cmd,
+            env=merged_env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+        # Ensure we don't leave it around if the process exits abruptly.
+        atexit.register(self._cleanup)
+        # Wait until the OpenAI-compatible endpoints respond.
+        if self._proc is None:
+            raise RuntimeError("Failed to start vLLM server process.")
+        _wait_for_http_ready_or_proc_exit(
+            f"{self._server_url}/models",
+            timeout_s=self._startup_timeout_s,
+            proc=self._proc,
+        )
+        # Configure client to talk to the local server.
+        # For local servers, any non-empty API key typically works; allow explicit override.
+        effective_api_key = api_key if api_key is not None else os.getenv("OPENAI_API_KEY") or "local-vllm"
+        self._client = OpenAIModel(
+            model_name=self._model_name,
+            formatter=formatter,
+            temperature=temperature,
+            top_p=top_p,
+            api_key=effective_api_key,
+            base_url=self._server_url,
+            bytes_per_token=bytes_per_token,
+        )
+    @property
+    def name(self) -> str:
+        return f"vllm_local::{self._model_name}"
+    def generate_from_messages(
+        self,
+        messages: list[Sequence[Message]],
+        stop_sequences: list[str] | None = None,
+        max_tokens: int | None = None,
+        temperature: float | None = None,
+        top_p: float | None = None,
+    ) -> list[RawCompletion]:
+        return self._client.generate_from_messages(messages, stop_sequences, max_tokens, temperature, top_p)
+    def logprobs(self, samples: list[Sample]) -> list[RawLoglikelihood]:
+        return self._client.logprobs(samples)
+    def _cleanup(self) -> None:
+        proc = self._proc
+        self._proc = None
+        if proc is None:
+            return
+        if proc.poll() is not None:
+            return
+        try:
+            proc.terminate()
+            proc.wait(timeout=10)
+        except Exception:
+            try:
+                proc.send_signal(signal.SIGKILL)
+            except Exception:
+                pass
+    def __del__(self) -> None:
+        self._cleanup()

{eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/base.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import traceback
 from abc import ABC, abstractmethod
 from typing import Any
@@ -5,6 +6,7 @@ from pydantic import BaseModel, ConfigDict
 from eval_framework.metrics.aggregators.aggregators import Aggregator
 from eval_framework.shared.types import Error
+from eval_framework.tasks.utils import raise_errors
 class MetricResult(BaseModel):
@@ -33,6 +35,8 @@ class BaseMetric[Response](ABC):
     # sample over multiple runs (LLM calls). We default to averaging and thus making
     # macro averaging the overall computation default.
     AGGREGATORS: list[Aggregator] = []
+    # Set by the evaluation generator before calculate(); controls how infra failures are handled.
+    fail_on_error: bool = False
     @classproperty
     def NAMES(cls) -> list[str]:
@@ -43,3 +47,17 @@ class BaseMetric[Response](ABC):
     @abstractmethod
     def calculate(self, response: Response) -> list[MetricResult]:
         raise NotImplementedError
+    def _record_or_raise(self, exc: Exception) -> list[MetricResult]:
+        """Infra failure (e.g. a Docker image-pull rate limit): abort when fail_on_error is set,
+        otherwise record a per-sample error so the run continues."""
+        if raise_errors() or self.fail_on_error:
+            raise exc
+        return [
+            MetricResult(
+                metric_name=self.NAME,
+                value=None,
+                higher_is_better=True,
+                error=Error(error_class=exc.__class__.__name__, message=str(exc), traceback=traceback.format_exc()),
+            )
+        ]

{eval_framework-0.3.7 → eval_framework-0.3.8}/src/eval_framework/metrics/completion/code_assertion.py RENAMED Viewed

@@ -1,3 +1,5 @@
+from llm_sandbox.exceptions import SandboxTimeoutError
 from eval_framework.metrics.base import BaseMetric, MetricResult
 from eval_framework.shared.types import Completion, Error
 from eval_framework.tasks.utils import run_python_code
@@ -14,7 +16,9 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
         code = response.completion
         try:
             output = run_python_code(code, image="python:3.12-slim")
-        except Exception as e:
+        except SandboxTimeoutError as e:
+            # The submitted code timed out (e.g. an infinite loop) -- a failing sample, not an infra
+            # problem.
             import traceback
             return [
@@ -25,6 +29,9 @@ class CodeCompletionAssertion(BaseMetric[Completion]):
                     error=Error(error_class=e.__class__.__name__, message=str(e), traceback=traceback.format_exc()),
                 )
             ]
+        except Exception as e:
+            # Any other sandbox/Docker error (e.g. an image pull rate limit) is an infra failure.
+            return self._record_or_raise(e)
         # Split and filter out empty strings
         output_parts = [part for part in output.split() if part.strip()]

eval-framework 0.3.7__tar.gz → 0.3.8__tar.gz

eval-framework 0.3.7__tar.gz → 0.3.8__tar.gz