PyPI - evalscope - Versions diffs - 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl - Mend

evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (100)

evalscope/api/benchmark/__init__.py +8 -1
evalscope/api/benchmark/adapters/__init__.py +1 -0
evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
evalscope/api/benchmark/benchmark.py +14 -0
evalscope/api/dataset/dataset.py +21 -0
evalscope/api/dataset/loader.py +6 -2
evalscope/api/mixin/sandbox_mixin.py +32 -54
evalscope/api/model/generate_config.py +6 -0
evalscope/benchmarks/aa_lcr/__init__.py +0 -0
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
evalscope/benchmarks/math_verse/__init__.py +0 -0
evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
evalscope/benchmarks/math_vision/__init__.py +0 -0
evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
evalscope/benchmarks/ner/__init__.py +0 -0
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
evalscope/benchmarks/ner/copious_adapter.py +85 -0
evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
evalscope/benchmarks/poly_math/__init__.py +0 -0
evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
evalscope/benchmarks/pope/__init__.py +0 -0
evalscope/benchmarks/pope/pope_adapter.py +111 -0
evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
evalscope/benchmarks/simple_vqa/__init__.py +0 -0
evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
evalscope/benchmarks/visu_logic/__init__.py +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
evalscope/benchmarks/zerobench/__init__.py +0 -0
evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
evalscope/constants.py +4 -0
evalscope/evaluator/evaluator.py +72 -79
evalscope/metrics/math_parser.py +14 -0
evalscope/metrics/metric.py +1 -1
evalscope/models/utils/openai.py +4 -0
evalscope/perf/arguments.py +24 -4
evalscope/perf/benchmark.py +74 -89
evalscope/perf/http_client.py +31 -16
evalscope/perf/main.py +15 -2
evalscope/perf/plugin/api/base.py +9 -7
evalscope/perf/plugin/api/custom_api.py +13 -58
evalscope/perf/plugin/api/default_api.py +179 -79
evalscope/perf/plugin/api/openai_api.py +4 -3
evalscope/perf/plugin/datasets/base.py +21 -0
evalscope/perf/plugin/datasets/custom.py +2 -3
evalscope/perf/plugin/datasets/line_by_line.py +2 -3
evalscope/perf/plugin/datasets/longalpaca.py +2 -3
evalscope/perf/plugin/datasets/openqa.py +2 -4
evalscope/perf/plugin/datasets/random_dataset.py +1 -3
evalscope/perf/utils/benchmark_util.py +36 -22
evalscope/perf/utils/db_util.py +14 -19
evalscope/perf/utils/local_server.py +0 -44
evalscope/perf/utils/log_utils.py +21 -6
evalscope/report/__init__.py +2 -1
evalscope/run.py +4 -0
evalscope/utils/function_utils.py +195 -12
evalscope/utils/io_utils.py +74 -0
evalscope/utils/logger.py +49 -17
evalscope/utils/ner.py +377 -0
evalscope/version.py +2 -2
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
{evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0

evalscope/api/benchmark/__init__.py CHANGED Viewed

@@ -1,3 +1,10 @@
-from .adapters import DefaultDataAdapter, ImageEditAdapter, MultiChoiceAdapter, Text2ImageAdapter, VisionLanguageAdapter
+from .adapters import (
+    DefaultDataAdapter,
+    ImageEditAdapter,
+    MultiChoiceAdapter,
+    NERAdapter,
+    Text2ImageAdapter,
+    VisionLanguageAdapter,
+)
 from .benchmark import DataAdapter
 from .meta import BenchmarkMeta

evalscope/api/benchmark/adapters/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from .default_data_adapter import DefaultDataAdapter
 from .image_edit_adapter import ImageEditAdapter
 from .multi_choice_adapter import MultiChoiceAdapter
+from .ner_adapter import NERAdapter
 from .text2image_adapter import Text2ImageAdapter
 from .vision_language_adapter import VisionLanguageAdapter

evalscope/api/benchmark/adapters/ner_adapter.py ADDED Viewed

@@ -0,0 +1,212 @@
+from typing import Any, Dict, List, Set, Tuple
+from evalscope.api.dataset import Sample
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from evalscope.utils.ner import (
+    DEFAULT_TAG_FIX_PATTERNS,
+    calculate_bio_metrics,
+    clean_prediction,
+    create_target_text,
+    extract_entities_from_text,
+    extract_spans_from_bio,
+    xml_to_bio_tags,
+)
+from .default_data_adapter import DefaultDataAdapter
+logger = get_logger()
+class NERAdapter(DefaultDataAdapter):
+    """
+    Base adapter class for Named Entity Recognition (NER) tasks.
+    This adapter handles converting between BIO tagging schemes and XML-style entity markup,
+    and provides evaluation metrics using seqeval.
+    Subclasses should define their entity types and register the benchmark.
+    """
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Define mapping from BIO tags to user-friendly tag names
+        self.entity_type_map = {}
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+        # These will be initialized in setup_entity_mappings
+        self.reverse_entity_map = {}
+        self.entity_list = []
+        self.entities_description = ''
+        # Define common error patterns to handle
+        self.tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+        check_import('seqeval', 'seqeval', raise_error=True, feature_name='NER metrics')
+        # Note: setup_entity_mappings() should be called by subclasses
+        # after they define their entity_type_map and entity_descriptions
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Format a sample as a few-shot example showing original and annotated text.
+        """
+        if not sample.metadata:
+            return ''
+        # Format few-shot examples to match the expected response format
+        return f'Input:\n{sample.input}\n\nOutput:\n{sample.target}'
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate named entity recognition performance using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        try:
+            # Get the original tokens and tags from the reference metadata
+            original_tokens = task_state.metadata['tokens']
+            original_tags = task_state.metadata['ner_tags']
+            if not original_tokens or len(original_tokens) == 0:
+                if hasattr(reference, 'metadata') and reference.metadata:
+                    original_tokens = reference.metadata['tokens']
+                    original_tags = reference.metadata['ner_tags']
+            # Clean and normalize the prediction
+            cleaned_prediction = clean_prediction(filtered_prediction, self.tag_fix_patterns)
+            # Convert XML-style prediction back to BIO tags aligned with original tokens
+            pred_bio_tags = xml_to_bio_tags(cleaned_prediction, original_tokens, self.reverse_entity_map)
+            # Use seqeval to calculate metrics
+            # Note: seqeval expects lists of lists (one per sequence)
+            y_true = [original_tags]
+            y_pred = [pred_bio_tags]
+            precision = precision_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            accuracy = accuracy_score(y_true, y_pred)
+            score.value = {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}
+            # Store tags for aggregation (proper micro-averaging in aggregate_scores)
+            # This way aggregate_scores can compute metrics across all samples at once,
+            # which gives you true micro-averaged scores rather than averaged macro scores.
+            score.metadata = {'y_true': original_tags, 'y_pred': pred_bio_tags}
+        except Exception as e:
+            logger.warning(f'Error evaluating NER prediction: {str(e)}')
+            score.value = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
+        return score
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate metrics across all samples using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+        # Collect all predictions and references
+        y_true_all = []
+        y_pred_all = []
+        for ss in sample_scores:
+            # Extract the BIO tags from metadata if available
+            # You may need to store these during match_score
+            if hasattr(ss.score, 'metadata') and 'y_true' in ss.score.metadata and 'y_pred' in ss.score.metadata:
+                y_true_all.append(ss.score.metadata['y_true'])
+                y_pred_all.append(ss.score.metadata['y_pred'])
+        if not y_true_all:
+            # Fallback: calculate averages from individual scores
+            num_samples = len(sample_scores)
+            avg_precision = sum(ss.score.value.get('precision', 0.0) for ss in sample_scores) / num_samples
+            avg_recall = sum(ss.score.value.get('recall', 0.0) for ss in sample_scores) / num_samples
+            avg_f1 = sum(ss.score.value.get('f1_score', 0.0) for ss in sample_scores) / num_samples
+            avg_accuracy = sum(ss.score.value.get('accuracy', 0.0) for ss in sample_scores) / num_samples
+        else:
+            # Use seqeval for micro-averaged metrics across all samples
+            avg_precision = precision_score(y_true_all, y_pred_all)
+            avg_recall = recall_score(y_true_all, y_pred_all)
+            avg_f1 = f1_score(y_true_all, y_pred_all)
+            avg_accuracy = accuracy_score(y_true_all, y_pred_all)
+        num_samples = len(sample_scores)
+        agg_scores = [
+            AggScore(
+                metric_name='precision',
+                score=avg_precision,
+                num=num_samples,
+                metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(
+                metric_name='recall', score=avg_recall, num=num_samples, metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(metric_name='f1_score', score=avg_f1, num=num_samples, metadata={'type': 'seqeval-micro-average'}),
+            AggScore(
+                metric_name='accuracy', score=avg_accuracy, num=num_samples, metadata={'type': 'seqeval-accuracy'}
+            )
+        ]
+        return agg_scores

evalscope/api/benchmark/benchmark.py CHANGED Viewed

@@ -216,6 +216,13 @@ class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
         """
         return self._benchmark_meta.train_split
+    @train_split.setter
+    def train_split(self, value: str):
+        """
+        Set the train split of the benchmark.
+        """
+        self._benchmark_meta.train_split = value
     @property
     def eval_split(self) -> Optional[str]:
         """
@@ -223,6 +230,13 @@ class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
         """
         return self._benchmark_meta.eval_split
+    @eval_split.setter
+    def eval_split(self, value: str):
+        """
+        Set the eval split of the benchmark.
+        """
+        self._benchmark_meta.eval_split = value
     @property
     def prompt_template(self) -> Optional[str]:
         """

evalscope/api/dataset/dataset.py CHANGED Viewed

@@ -347,3 +347,24 @@ class DatasetDict:
             cur_dataset.reindex(group_size=repeats)
             dataset_dict[key] = cur_dataset
         return cls(dataset_dict)
+    @classmethod
+    def from_dataset_dicts(cls, dataset_dicts: List['DatasetDict']) -> 'DatasetDict':
+        """
+        Create a DatasetDict by merging multiple DatasetDicts.
+        Args:
+            dataset_dicts (List[DatasetDict]): List of DatasetDicts to merge.
+        Returns:
+            DatasetDict: A new DatasetDict containing the merged datasets.
+        """
+        merged_dict = defaultdict(list)
+        for dataset_dict in dataset_dicts:
+            for key, dataset in dataset_dict.items():
+                merged_dict[key].extend(dataset.samples)
+        # Create a MemoryDataset for each subset key
+        final_dict = {}
+        for key, samples in merged_dict.items():
+            final_dict[key] = MemoryDataset(samples, name=key)
+        return cls(final_dict)

evalscope/api/dataset/loader.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import Callable, Dict, List, Optional, Union
 from evalscope.api.dataset.utils import record_to_sample_fn
 from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
 from evalscope.utils import get_logger
-from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename
+from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename, tsv_to_list
 from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
 from .utils import data_to_samples, shuffle_choices_if_requested
@@ -168,7 +168,11 @@ class LocalDataLoader(DataLoader):
         dataset = []
         # Check for JSONL or CSV files in the specified path
-        for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+        for ext, loader in [
+            ('.jsonl', jsonl_to_list),
+            ('.csv', csv_to_list),
+            ('.tsv', tsv_to_list),
+        ]:
             # Check if the file exists with the given extension
             if os.path.isfile(path) and path.endswith(ext):
                 file_paths = [path]

evalscope/api/mixin/sandbox_mixin.py CHANGED Viewed

@@ -1,7 +1,6 @@
-import asyncio
-import threading
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
+from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
 from evalscope.utils.logger import get_logger
 if TYPE_CHECKING:
@@ -24,25 +23,10 @@ class SandboxMixin:
         self._sandbox_id: Optional[str] = None
         """Sandbox ID."""
-        self._loop: Optional[asyncio.AbstractEventLoop] = None
-        """Event loop for async operations."""
-        # Initialize sandbox synchronously by running async methods
-        if self.use_sandbox:
-            self._loop = asyncio.new_event_loop()
-            # Start the loop in a separate thread
-            def run_loop():
-                asyncio.set_event_loop(self._loop)
-                self._loop.run_forever()
-            self._loop_thread = threading.Thread(target=run_loop, daemon=True)
-            self._loop_thread.start()
-            # Wait for initialization
-            future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
-            future.result()
+        # Lazy init state
+        self._initialized: bool = False
+        # NOTE: Initialization is deferred.
         super().__init__()
     async def _async_init(self):
@@ -70,6 +54,25 @@ class SandboxMixin:
         """Get the sandbox ID."""
         return self._sandbox_id
+    @thread_safe
+    def ensure_sandbox_ready(self) -> bool:
+        """
+        Ensure the sandbox loop, manager, and sandbox instance are initialized.
+        This method is thread-safe and idempotent.
+        """
+        if not self.use_sandbox:
+            return False
+        if self._initialized and self._manager and self._sandbox_id:
+            return True
+        # Initialize manager and sandbox using the class-level runner
+        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+        AsyncioLoopRunner.run(self.init_sandbox_async())
+        self._initialized = True
+        return True
     async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
         """Initialize the sandbox manager asynchronously."""
         if self._manager is not None:
@@ -100,13 +103,7 @@ class SandboxMixin:
         if not self.use_sandbox:
             return None
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_manager_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())
     async def init_sandbox_async(self) -> Optional[str]:
         """Initialize the sandbox instance asynchronously."""
@@ -141,17 +138,12 @@ class SandboxMixin:
         if not self.use_sandbox:
             return None
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_async())
     def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
         """Execute code in the sandbox."""
-        if not self._sandbox_id or not self._manager:
+        # Lazy, thread-safe initialization
+        if not self.ensure_sandbox_ready():
             logger.warning('Sandbox is not initialized.')
             return {'error': 'Sandbox is not initialized.'}
@@ -175,30 +167,16 @@ class SandboxMixin:
                 )
             return result
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
-            result = future.result(timeout + 10)  # Add some buffer to the timeout
-        else:
-            # Fallback for cases where no loop is available
-            result = asyncio.run(_execute_async())
+        # Execute in background loop via class-level runner
+        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
         return result.model_dump(exclude_none=True)
     def sandbox_finalize(self, *args, **kwargs):
         """Finalize the sandbox manager."""
         if self._manager:
             try:
-                if self._loop and not self._loop.is_closed():
-                    # Stop the manager using the dedicated loop
-                    future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
-                    future.result(timeout=30)
-                    # Stop the event loop
-                    self._loop.call_soon_threadsafe(self._loop.stop)
-                    if hasattr(self, '_loop_thread'):
-                        self._loop_thread.join(timeout=5)
+                # Stop the manager but keep the shared loop alive
+                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
                 logger.info('Sandbox manager finalized.')
             except Exception as e:
                 logger.warning(f'Error finalizing sandbox manager: {e}')

evalscope/api/model/generate_config.py CHANGED Viewed

@@ -108,6 +108,12 @@ class GenerateConfig(BaseModel):
     extra_body: Optional[Dict[str, Any]] = Field(default=None)
     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+    extra_query: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+    extra_headers: Optional[Dict[str, str]] = Field(default=None)
+    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
     height: Optional[int] = Field(default=None)
     """Image height for image generation model only"""

evalscope/benchmarks/aa_lcr/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py ADDED Viewed

@@ -0,0 +1,205 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa: E501
+import re
+import urllib.request
+import zipfile
+from pathlib import Path
+from typing import Any, Dict
+from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser
+from evalscope.api.metric import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, Tags
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+# Default judge prompt template
+JUDGE_PROMPT = """Assess whether the following CANDIDATE ANSWER is CORRECT or INCORRECT. For the CANDIDATE ANSWER to be correct, it must be consistent with the OFFICIAL ANSWER.
+The question, for reference only: {question}
+The OFFICIAL ANSWER: {correct_answer}
+CANDIDATE ANSWER TO ASSESS: {response}
+Reply only with CORRECT or INCORRECT."""
+PROMPT_TEMPLATE = """
+BEGIN INPUT DOCUMENTS
+{documents_text}
+END INPUT DOCUMENTS
+Answer the following question using the input documents provided above.
+START QUESTION
+{question}
+END QUESTION
+"""
+# New constants for auto-download
+DOWNLOAD_URL: str = (
+    'https://modelscope.cn/datasets/evalscope/AA-LCR/resolve/master/extracted_text/AA-LCR_extracted-text.zip'
+)
+DEFAULT_CACHE_SUBDIR: str = 'aa_lcr'
+DEFAULT_ZIP_NAME: str = 'AA-LCR_extracted-text.zip'
+DEFAULT_EXTRACTED_DIR_NAME: str = 'lcr'
+@register_benchmark(
+    BenchmarkMeta(
+        name='aa_lcr',
+        pretty_name='AA-LCR',
+        tags=[Tags.KNOWLEDGE, Tags.REASONING, Tags.LONG_CONTEXT],
+        description='AA-LCR (Artificial Analysis Long Context Retrieval) is a benchmark for evaluating long-context '
+        'retrieval and reasoning capabilities of language models across multiple documents.',  # noqa: E501
+        dataset_id='evalscope/AA-LCR',
+        metric_list=['acc'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='test',
+        prompt_template=PROMPT_TEMPLATE,
+        extra_params={'text_dir': None}
+    )
+)
+class AALCRAdapter(DefaultDataAdapter):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._use_llm_judge = True
+        # Get extra parameters
+        self.text_dir = self.extra_params.get('text_dir')
+    def load(self):
+        # Auto download and extract when text_dir is not provided
+        if not self.text_dir:
+            self.text_dir = self._ensure_text_dir_downloaded()
+        elif not Path(self.text_dir).exists():
+            raise ValueError(
+                'AA-LCR text_dir does not exist: '
+                f'{self.text_dir}. Please provide a valid directory or omit text_dir to auto-download.'
+            )
+        self.text_dir = Path(self.text_dir)
+        return super().load()
+    def _ensure_text_dir_downloaded(self) -> Path:
+        """Ensure AA-LCR extracted texts are available locally; download and extract if missing."""
+        cache_root = Path(DEFAULT_EVALSCOPE_CACHE_DIR) / DEFAULT_CACHE_SUBDIR
+        extracted_dir = cache_root / DEFAULT_EXTRACTED_DIR_NAME
+        if extracted_dir.exists():
+            logger.info(f'AA-LCR documents found: {extracted_dir}')
+            return extracted_dir
+        cache_root.mkdir(parents=True, exist_ok=True)
+        zip_path = cache_root / DEFAULT_ZIP_NAME
+        try:
+            logger.info(f'Downloading AA-LCR documents from {DOWNLOAD_URL} to {zip_path}...')
+            urllib.request.urlretrieve(DOWNLOAD_URL, zip_path)
+            logger.info(f'Extracting {zip_path} to {cache_root}...')
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                zf.extractall(cache_root)
+            if not extracted_dir.exists():
+                raise ValueError(f'Extraction succeeded but target directory not found: {extracted_dir}')
+            logger.info(f'AA-LCR documents ready at {extracted_dir}')
+            return extracted_dir
+        except Exception as e:
+            raise ValueError(
+                f'Failed to download or extract AA-LCR documents: {e}. '
+                'You can also manually download and set extra_params["text_dir"].'
+            ) from e
+        finally:
+            # Best-effort cleanup of the zip file
+            try:
+                if zip_path.exists():
+                    zip_path.unlink()
+            except Exception:
+                pass
+    def _get_context(self, record: Dict[str, Any]) -> str:
+        doc_folder = self.text_dir / record['document_category'] / record['document_set_id']
+        # Check if the document folder exists
+        if not doc_folder.exists() or not doc_folder.is_dir():
+            logger.warning(f'Document folder not found: {doc_folder}. Returning empty context.')
+            return ''
+        doc_blocks = []
+        try:
+            for file_path in doc_folder.iterdir():
+                if file_path.is_file():
+                    try:
+                        content = file_path.read_text(encoding='utf-8').strip()
+                        if content:
+                            doc_blocks.append(content)
+                    except (IOError, UnicodeDecodeError) as e:
+                        logger.warning(f'Could not read file {file_path}, skipping: {e}')
+        except OSError as e:
+            logger.warning(f'Could not access document folder {doc_folder}: {e}')
+            return f"ERROR: Could not read documents for {record['document_category']}/{record['document_set_id']}"
+        documents_text = '\n\n'.join(
+            f'BEGIN DOCUMENT {i + 1}:\n{doc}\nEND DOCUMENT {i + 1}' for i, doc in enumerate(doc_blocks)
+        )
+        return documents_text
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """Convert a record to a Sample with long-context prompt."""
+        context = self._get_context(record)
+        prompt = self.prompt_template.format(documents_text=context, question=record['question'])
+        return Sample(
+            input=[ChatMessageUser(content=prompt)],
+            target=record['answer'],
+            metadata={
+                'question': record['question'],
+                'data_source_urls': record['data_source_urls'],
+                'input_tokens': record.get('input_tokens', 0),
+            }
+        )
+    def llm_match_score(
+        self,
+        original_prediction: str,
+        filtered_prediction: str,
+        reference: str,
+        task_state: TaskState,
+    ) -> Score:
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+        judge_prompt = JUDGE_PROMPT.format(
+            question=task_state.metadata['question'], correct_answer=reference, response=filtered_prediction
+        )
+        # Request judge and obtain score
+        judge_response = self.llm_judge.judge(prompt=judge_prompt)
+        # Parse judge response to get accuracy score
+        # Use word boundaries to avoid matching "CORRECT" within "INCORRECT"
+        is_correct = bool(re.search(r'\bCORRECT\b', judge_response, re.IGNORECASE))
+        score.value = {
+            'acc': 1.0 if is_correct else 0.0,
+        }
+        score.explanation = f'LLM judge: {judge_response}'
+        score.metadata = {
+            'source': 'llm_judge',
+            'judge_strategy': self.judge_strategy,
+            'model': self.llm_judge.model_id,
+        }
+        score.main_score_name = 'acc'
+        return score

evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED Viewed

@@ -49,7 +49,7 @@ SUBJECT_MAPPING = {
         'functions. Unlike previous evaluations, '
         'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
         'Need to run `pip install bfcl-eval==2025.6.16` before evaluating. '
-        '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)',
+        '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html)',
         dataset_id='AI-ModelScope/bfcl_v3',
         subset_list=list(SUBJECT_MAPPING.keys()),
         metric_list=['acc'],

evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

Potentially problematic release.

evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl