evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff compares the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/arguments.py +2 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/__init__.py +2 -2
- evalscope/benchmarks/aigc/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/t2i/base.py +56 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
- evalscope/benchmarks/aime/aime24_adapter.py +1 -1
- evalscope/benchmarks/aime/aime25_adapter.py +4 -4
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
- evalscope/benchmarks/arc/arc_adapter.py +2 -2
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
- evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
- evalscope/benchmarks/data_adapter.py +21 -10
- evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
- evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
- evalscope/benchmarks/utils.py +7 -16
- evalscope/cli/start_app.py +1 -1
- evalscope/collections/evaluator.py +20 -6
- evalscope/config.py +8 -4
- evalscope/constants.py +11 -0
- evalscope/evaluator/evaluator.py +2 -2
- evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
- evalscope/metrics/__init__.py +49 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/metrics/named_metrics.py +13 -0
- evalscope/metrics/t2v_metrics/__init__.py +66 -0
- evalscope/metrics/t2v_metrics/clipscore.py +14 -0
- evalscope/metrics/t2v_metrics/constants.py +12 -0
- evalscope/metrics/t2v_metrics/itmscore.py +14 -0
- evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
- evalscope/metrics/t2v_metrics/models/model.py +45 -0
- evalscope/metrics/t2v_metrics/models/utils.py +25 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
- evalscope/metrics/t2v_metrics/score.py +78 -0
- evalscope/metrics/t2v_metrics/vqascore.py +14 -0
- evalscope/models/__init__.py +50 -14
- evalscope/models/adapters/__init__.py +17 -0
- evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
- evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
- evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
- evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
- evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
- evalscope/models/adapters/t2i_adapter.py +76 -0
- evalscope/models/custom/__init__.py +2 -1
- evalscope/models/custom/dummy_model.py +11 -13
- evalscope/models/local_model.py +82 -33
- evalscope/models/model.py +2 -42
- evalscope/models/register.py +26 -0
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/flickr8k.py +2 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +14 -8
- evalscope/perf/utils/db_util.py +9 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/__init__.py +1 -0
- evalscope/report/app.py +128 -78
- evalscope/report/app_arguments.py +11 -0
- evalscope/report/generator.py +1 -1
- evalscope/run.py +10 -3
- evalscope/summarizer.py +2 -1
- evalscope/third_party/thinkbench/eval.py +19 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/import_utils.py +66 -0
- evalscope/utils/utils.py +48 -29
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
- tests/aigc/__init__.py +1 -0
- tests/aigc/test_t2i.py +87 -0
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +19 -12
- tests/perf/test_perf.py +3 -3
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- evalscope/metrics/code_metric.py +0 -98
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/aime/aime25_adapter.py
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import OutputType
-from evalscope.metrics
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -11,12 +11,12 @@ logger = get_logger()
 @Benchmark.register(
     name='aime25',
     pretty_name='AIME-2025',
-    dataset_id='
-    subset_list=['
+    dataset_id='opencompass/AIME2025',
+    subset_list=['AIME2025-I', 'AIME2025-II'],
     metric_list=['AveragePass@1'],
     few_shot_num=0,
     train_split=None,
-    eval_split='
+    eval_split='test', # Only train set is available
     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
 )
 class AIME25Adapter(DataAdapter):
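For context, the registration above is how the benchmark is selected at run time. A minimal usage sketch, assuming evalscope's documented TaskConfig/run_task entry points; the model id and limit are placeholders, not part of this diff:

from evalscope import TaskConfig, run_task

# Placeholder model id; 'aime25' is the benchmark name registered above.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['aime25'],
    limit=5,  # evaluate only a few samples while smoke-testing
)
run_task(task_cfg=task_cfg)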

evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -3,8 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/arc/arc_adapter.py
@@ -18,7 +18,7 @@ logger = get_logger()
     name='arc',
     pretty_name='ARC',
     dataset_id='modelscope/ai2_arc',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['ARC-Easy', 'ARC-Challenge'],
     metric_list=['AverageAccuracy'],
@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(text=result)
+            return ResponseParser.parse_first_option(text=result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
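The change above (mirrored in the HellaSwag adapter further down) passes the benchmark's answer choices into the option parser, so free-form generations are matched against the actual label set instead of any letter-like token. A rough standalone illustration of what constraining the parse to known options means; this is a stand-in for clarity, not ResponseParser's real implementation:

import re
from typing import List, Optional

def parse_first_option(text: str, options: List[str]) -> Optional[str]:
    # Match the first standalone occurrence of an allowed label (e.g. 'A'-'D'),
    # rather than the first capital letter found anywhere in the response.
    pattern = r'\b(' + '|'.join(re.escape(opt) for opt in options) + r')\b'
    match = re.search(pattern, text)
    return match.group(1) if match else None

print(parse_first_option('I think the correct answer is (B).', options=['A', 'B', 'C', 'D']))  # -> B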

evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -3,9 +3,7 @@ from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/ceval/ceval_adapter.py
@@ -4,7 +4,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -127,7 +127,7 @@ SUBJECT_MAPPING = {
     name='ceval',
     pretty_name='C-Eval',
     dataset_id='modelscope/ceval-exam',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
@@ -1,10 +1,8 @@
 import re
-from collections import defaultdict
 from typing import Any, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
 from evalscope.utils.logger import get_logger

 # flake8: noqa

evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -104,7 +104,7 @@ SUBJECT_MAPPING = {
     name='cmmlu',
     pretty_name='C-MMLU',
     dataset_id='modelscope/cmmlu',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -6,8 +6,7 @@ import os
 from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.
-from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.metrics import extract_answer, math_equal, strip_answer_string
 from evalscope.utils.logger import get_logger

 # flake8: noqa
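The math adapters (AIME, competition math) now import their helpers from the flat evalscope.metrics namespace rather than evalscope.metrics.math_parser. A hedged usage sketch; the call signatures are assumed (extract_answer pulling the \boxed{} content, strip_answer_string normalizing, math_equal comparing) and are not spelled out in this diff:

from evalscope.metrics import extract_answer, math_equal, strip_answer_string

# Assumed signatures: extract_answer(str) -> str, strip_answer_string(str) -> str,
# math_equal(pred, gold) -> bool.
completion = 'After simplifying, the final answer is \\boxed{\\frac{1}{2}}.'
pred = strip_answer_string(extract_answer(completion))
gold = strip_answer_string('1/2')
print(math_equal(pred, gold))  # expected True if the normalizer treats 1/2 and \frac{1}{2} as equal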

evalscope/benchmarks/data_adapter.py
@@ -3,12 +3,11 @@ import os.path
 import random
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Any, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
-from evalscope.metrics
-from evalscope.metrics.named_metrics import metric_registry
+from evalscope.metrics import LLMJudge, metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger

@@ -24,6 +23,7 @@ class DataAdapter(ABC):
                  subset_list: list,
                  metric_list: List[str],
                  llm_as_a_judge: bool = False,
+                 output_types: Optional[List[str]] = None,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -63,6 +63,7 @@ class DataAdapter(ABC):
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.output_types = output_types or [model_adapter]
         self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
         self.choices = kwargs.get('choices', None)
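This is the mechanism behind the registration changes earlier in the diff: adapters such as ARC, C-Eval, CMMLU and General MCQ now declare model_adapter=OutputType.GENERATION while still listing MULTIPLE_CHOICE in output_types, and when output_types is not given it defaults to the single model_adapter. A minimal sketch of that defaulting rule in isolation (the string values stand in for the OutputType constants); the remaining hunks below continue in data_adapter.py:

# Illustration only: 'generation' / 'multiple_choice' are placeholders for OutputType members.
def resolve_output_types(model_adapter: str, output_types=None):
    # Fall back to the registered model_adapter when no explicit list is provided.
    return output_types or [model_adapter]

print(resolve_output_types('generation'))                                      # ['generation']
print(resolve_output_types('generation', ['multiple_choice', 'generation']))   # explicit list kept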
@@ -190,7 +191,7 @@ class DataAdapter(ABC):
         if self.few_shot_num and self.few_shot_num < 0:
             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

-        logger.info(f'Use
+        logger.info(f'Use settings: '
                     f'> few_shot_num: {self.few_shot_num}, '
                     f'> few_shot_split: {self.train_split}, '
                     f'> target_eval_split: {self.eval_split}')
@@ -245,7 +246,8 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list

-    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+                            **kwargs) -> Dict[str, List[float]]:
         """
         compute weighted mean of the bleu score of all samples

@@ -253,7 +255,7 @@
             review_res_list: [score1, score2, ...]

         Returns:
-            avg_res: List[
+            avg_res: Dict[str, List[float]]

         """
         if isinstance(review_res_list[0], list):
@@ -314,11 +316,20 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)

-    def gen_prompt_data(self,
-
-
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        index: Optional[Union[int, str]] = None,
+                        id: Optional[Union[int, str]] = None,
+                        **kwargs) -> dict:
+        data = [prompt] if not isinstance(prompt, list) else prompt
         prompt_data = PromptData(
-            data=
+            data=data,
+            multi_choices=choices or self.choices,
+            system_prompt=system_prompt or self.system_prompt,
+            index=index or 0,
+            id=id)
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
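The reworked gen_prompt_data above takes the prompt plus optional system_prompt, choices, index and id keywords and wraps them in a PromptData record, falling back to the adapter's own system_prompt and choices. A sketch of how a subclass's gen_prompt might call it after this change; the subclass and the input field names ('question', 'options', 'id') are hypothetical, only the keyword arguments come from the diff:

from typing import Any

from evalscope.benchmarks import DataAdapter

class MyQAAdapter(DataAdapter):  # hypothetical subclass for illustration
    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        prompt = self.prompt_template.format(query=input_d['question'])
        return self.gen_prompt_data(
            prompt,
            choices=input_d.get('options'),  # defaults to self.choices when None
            id=input_d.get('id'),
        )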

evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -48,14 +48,16 @@ class DataCollectionAdapter(DataAdapter):
             if len(dataset) == 0:
                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
         else:
-            from modelscope
+            from modelscope import dataset_snapshot_download

             # Load dataset from remote
             logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')

-
-
-
+            dataset_path = dataset_snapshot_download(
+                dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
+            # find the jsonl file
+            dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
+            dataset = jsonl_to_list(dataset_files[0])

         return dataset

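The collection adapter now downloads only the .jsonl files of a remote dataset via modelscope's dataset_snapshot_download and loads the first one it finds. The same download pattern in isolation; the dataset id and cache directory are placeholders:

import os

from modelscope import dataset_snapshot_download  # call and keyword arguments as used in the hunk above

dataset_path = dataset_snapshot_download(
    'your-namespace/your-collection',  # placeholder ModelScope dataset id
    cache_dir='./outputs',
    allow_file_pattern='*.jsonl',
)
jsonl_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
print(jsonl_files)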

evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -4,7 +4,7 @@ import os

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
-from evalscope.metrics
+from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger

@@ -17,7 +17,7 @@ logger = get_logger()
     name='general_mcq',
     pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=OutputType.
+    model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
             for subset_name in subset_list:
                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
         elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path)
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
             data_file_dict[cur_subset_name] = dataset_name_or_path
         else:
             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
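The one-line change above strips the file extension when a single file is passed, so the subset is named after the file stem rather than the full filename. For example:

import os

path = '/data/my_eval_set.jsonl'
print(os.path.basename(path))                       # my_eval_set.jsonl (old subset name)
print(os.path.splitext(os.path.basename(path))[0])  # my_eval_set (new subset name)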

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))

evalscope/benchmarks/live_code_bench/evaluate_utils.py
@@ -2,7 +2,6 @@ import json
 import multiprocessing
 import numpy as np
 from collections import defaultdict
-from concurrent.futures import ProcessPoolExecutor, as_completed

 from evalscope.utils.logger import get_logger
 from .pass_k_utils import compute_metrics_from_results
@@ -31,7 +30,10 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         args=(sample, generation, debug, result, metadata_list, timeout),
     )
     p.start()
-
+    global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
+    if debug:
+        logger.info(f'global timeout = {global_timeout}')
+    p.join(timeout=global_timeout)
     if p.is_alive():
         p.kill()
     if not result:
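The join timeout is now derived from the number of test inputs in the sample rather than being left open-ended: each test gets the per-test timeout plus one second of slack. A tiny worked example of the formula (the sample payload here is made up):

import json

timeout = 6  # default per-test timeout, matching the adapter's extra_params below
sample = {'input_output': json.dumps({'inputs': ['1 2', '3 4', '5 6'], 'outputs': ['3', '7', '11']})}
global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
print(global_timeout)  # (6 + 1) * 3 == 21 seconds before the worker process is killed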
@@ -39,7 +41,7 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
         # consider that all tests failed
         result = [[-1 for i in range(len(in_outs['inputs']))]]
         if debug:
-            logger.info('global timeout')
+            logger.info('global timeout occured: alarm went off')
     return result[0], metadata_list[0]
@@ -99,7 +101,7 @@ def evaluate_generations(
     samples_list: list,
     generations_list: list[list[str]],
     debug: bool = False,
-    num_process_evaluate: int = 16,
+    num_process_evaluate: int = 16, # This parameter will be unused
     timeout=6,
 ):
     """We take the list of code generations and try to compile them and the run
@@ -117,26 +119,19 @@
     [-2] = compile error, [-1] = runtime error [False] = failed test
     case [True] = passed test case
     """
+    results = {}
+    metadata = {}

-
-
-
-        for index in range(len(generations_list))]
-
-    with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
-        futures = {
-            executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
-            for (problem_generations, sample, debug, timeout), index in inputs
-        }
+    for index in range(len(generations_list)):
+        problem_generations = generations_list[index]
+        sample = samples_list[index]

-
-
-
-            index = futures[future]
-            results[index], metadata[index] = future.result()
+        result, meta = evaluate_generations_by_problem(problem_generations, sample, debug, timeout)
+        results[index] = result
+        metadata[index] = meta

-    assert len(results) == len(
-
+    assert len(results) == len(
+        generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'

     return results, metadata

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -18,8 +18,8 @@ logger = get_logger()
     extra_params={
         'start_date': None,
         'end_date': None,
-        '
-        '
+        'timeout': 6,
+        'debug': False
     },
     system_prompt=
     'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.', # noqa: E501
@@ -33,8 +33,8 @@ class LiveCodeBenchAdapter(DataAdapter):

         extra_params = kwargs.get('extra_params', {})

-        self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
         self.timeout = extra_params.get('timeout', 6)
+        self.debug = extra_params.get('debug', False)
         self.start_date = extra_params.get('start_date')
         self.end_date = extra_params.get('end_date')

@@ -84,7 +84,8 @@ class LiveCodeBenchAdapter(DataAdapter):
             references,
             predictions,
             k_list=[1],
-            num_process_evaluate=
+            num_process_evaluate=1,
             timeout=self.timeout,
+            debug=self.debug,
         )
         return metrics['pass@1'] / 100 # convert to point scale
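Taken together, the LiveCodeBench changes drop the num_process_evaluate knob (evaluation now runs sequentially in evaluate_utils.py) and expose timeout and debug through extra_params; the final score is pass@1 divided by 100, so a pass@1 of 62.5 is reported as 0.625. A hedged sketch of passing these options from a task config; the dataset_args nesting is assumed from evalscope's usual per-dataset configuration pattern and the model id is a placeholder:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',  # placeholder model id
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': None,   # keys match the registration defaults above
                'end_date': None,
                'timeout': 6,
                'debug': False,
            },
        },
    },
)
run_task(task_cfg=task_cfg)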