evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (81)
  1. evalscope/arguments.py +3 -0
  2. evalscope/benchmarks/aime/__init__.py +0 -0
  3. evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  5. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  7. evalscope/benchmarks/benchmark.py +5 -3
  8. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  11. evalscope/benchmarks/data_adapter.py +88 -29
  12. evalscope/benchmarks/data_collection/__init__.py +0 -0
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  15. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  16. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  17. evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  18. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  19. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  20. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  21. evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  22. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  23. evalscope/benchmarks/math_500/__init__.py +0 -0
  24. evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  25. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  26. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. evalscope/benchmarks/race/race_adapter.py +3 -3
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. evalscope/cli/start_app.py +4 -1
  36. evalscope/cli/start_eval.py +4 -3
  37. evalscope/cli/start_perf.py +4 -2
  38. evalscope/collections/evaluator.py +109 -39
  39. evalscope/collections/sampler.py +2 -1
  40. evalscope/collections/schema.py +1 -2
  41. evalscope/config.py +4 -1
  42. evalscope/evaluator/evaluator.py +81 -65
  43. evalscope/metrics/__init__.py +2 -1
  44. evalscope/metrics/math_parser.py +526 -0
  45. evalscope/metrics/metrics.py +39 -3
  46. evalscope/metrics/named_metrics.py +31 -7
  47. evalscope/models/base_adapter.py +7 -1
  48. evalscope/models/chat_adapter.py +69 -49
  49. evalscope/models/choice_adapter.py +52 -45
  50. evalscope/models/custom_adapter.py +2 -2
  51. evalscope/models/local_model.py +7 -2
  52. evalscope/models/server_adapter.py +106 -61
  53. evalscope/perf/__init__.py +0 -1
  54. evalscope/perf/arguments.py +5 -1
  55. evalscope/perf/http_client.py +2 -2
  56. evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope/perf/utils/benchmark_util.py +6 -2
  58. evalscope/report/app.py +42 -23
  59. evalscope/run.py +11 -8
  60. evalscope/third_party/thinkbench/__init__.py +3 -0
  61. evalscope/third_party/thinkbench/eval.py +264 -0
  62. evalscope/third_party/thinkbench/infer.py +100 -0
  63. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  64. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  65. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  66. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  67. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  68. evalscope/utils/chat_service.py +2 -2
  69. evalscope/utils/io_utils.py +1 -1
  70. evalscope/utils/model_utils.py +17 -1
  71. evalscope/utils/utils.py +45 -45
  72. evalscope/version.py +2 -2
  73. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
  74. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
  75. tests/cli/test_run.py +108 -19
  76. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  77. evalscope/metrics/math_accuracy.py +0 -200
  78. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  79. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  80. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  81. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -58,6 +58,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                          choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

      # Cache and working directory arguments
      parser.add_argument('--mem-cache', action='store_true', default=False, help='Deprecated, will be removed in v1.0.0.')  # noqa: E501
@@ -70,6 +71,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
      parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
      parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
+     parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
+     parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.')  # noqa: E501
      # yapf: enable
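For orientation, the new --eval-batch-size, --timeout and --stream arguments can also be exercised from Python. A minimal sketch, assuming the CLI flags map onto same-named TaskConfig fields; the model name, endpoint and dataset choice below are placeholders, not part of this diff:

# Sketch only: assumes the flags above map to same-named TaskConfig fields.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task = TaskConfig(
    model='my-served-model',                              # placeholder model name
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',
    eval_type='service',        # assumption: evaluating a remote API model
    datasets=['aime24'],        # new benchmark registered in this release
    eval_batch_size=8,          # new in 0.12.0: concurrent evaluation requests
    timeout=60,                 # new in 0.12.0: per-request timeout for the API model
    stream=True,                # new in 0.12.0: stream mode
)
run_task(task_cfg=task)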
evalscope/benchmarks/aime/__init__.py ADDED
File without changes
evalscope/benchmarks/aime/aime24_adapter.py ADDED
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='aime24',
+     dataset_id='HuggingFaceH4/aime_2024',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',  # Only train set is available
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME24Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['problem']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
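Both AIME adapters score answers the same way. A minimal sketch of that path, assuming the new math_parser helpers behave as their names suggest (extract_answer pulls the final \boxed{...} expression, strip_answer_string normalizes it, math_equal checks mathematical equivalence); the model output below is invented for illustration:

# Minimal sketch of the AIME scoring path; helper behavior is inferred from the
# adapter code above, not verified here.
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

model_output = 'Step-by-step reasoning ... so the final answer is \\boxed{204}.'
gold = strip_answer_string('204')                         # dataset 'answer' field
pred = strip_answer_string(extract_answer(model_output))  # expected: '204'
score = float(math_equal(pred, gold))                     # expected: 1.0 (AveragePass@1)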
evalscope/benchmarks/aime/aime25_adapter.py ADDED
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='aime25',
+     dataset_id='TIGER-Lab/AIME25',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',  # Only train set is available
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class AIME25Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['question']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
      dataset_id='modelscope/ai2_arc',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=['ARC-Easy', 'ARC-Challenge'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='train',
      eval_split='test',
@@ -112,7 +112,7 @@ class ARCAdapter(DataAdapter):
          # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
          full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -133,11 +133,9 @@ class ARCAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(
-                 text=result, options=self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(
-                 text=result, options=self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,11 +63,11 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
      dataset_id='modelscope/bbh',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=3,
      train_split=None,
      eval_split='test',
-     prompt_template='',
+     prompt_template="Q: {query}\nA: Let's think step by step.",
  )
  class BBHAdapter(DataAdapter):
      """
@@ -119,10 +119,13 @@ class BBHAdapter(DataAdapter):
          {'data': ['xxx']}
          """
          # few_shot_list: should be ['xxxx']
-         cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
-         full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."
+         if len(few_shot_list) > 0:
+             cot_prompts = 'Follow the given examples and answer the question.\n' + few_shot_list[0]
+         else:
+             cot_prompts = ''
+         full_prompt = cot_prompts + self.prompt_template.format(query=input_d['input'])

-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}

      def gen_prompts(self, data_dict: dict) -> dict:
          """
@@ -168,18 +171,15 @@ class BBHAdapter(DataAdapter):
              prompt_d[AnswerKeys.RAW_INPUT] = sample_d_new
              res_dict[sub_name].append(prompt_d)

-         rnd = random.Random()
-         rnd.seed(42)
-         for k, v in res_dict.items():
-             rnd.shuffle(v)
-
          return res_dict

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
-         gold = input_d.get('target')
+         gold = input_d.get('target', '')
+         # remove brackets
          if gold is None:
              logger.error(f'BBHAdapter: gold is None.')
+         gold = gold.replace('(', '').replace(')', '')
          return gold

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
@@ -228,8 +228,11 @@ class BBHAdapter(DataAdapter):
          """
          Extract the answer from the model output for Free-form task.
          """
-         res = ResponseParser.parse_first_option(ans)
-         if res:
+         pattern = r'answer is\s+(.*?)\.'
+
+         match = re.search(pattern, ans)
+         if match:
+             res = match.group(1)
              return res

          ans_line = ans.split('answer is ')
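The free-form extraction now uses a plain regular expression over the phrase "answer is ..." instead of the option parser. A standalone illustration (the sample response is invented):

import re

# Same pattern the updated BBHAdapter applies to free-form answers.
pattern = r'answer is\s+(.*?)\.'
response = "Let's think step by step. The trip takes 2 + 4 = 6 hours. So the answer is 6 hours."
match = re.search(pattern, response)
print(match.group(1) if match else None)  # prints: 6 hours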
evalscope/benchmarks/benchmark.py CHANGED
@@ -17,12 +17,15 @@ class BenchmarkMeta:
      data_adapter: 'DataAdapter'
      model_adapter: BaseModelAdapter
      subset_list: List[str] = field(default_factory=list)
-     metric_list: List[dict] = field(default_factory=list)
+     metric_list: List[str] = field(default_factory=list)
      few_shot_num: int = 0
      few_shot_random: bool = False
      train_split: Optional[str] = None
      eval_split: Optional[str] = None
      prompt_template: Optional[str] = None
+     system_prompt: Optional[str] = None
+     query_template: Optional[str] = None
+     pretty_name: Optional[str] = None

      def _update(self, args: dict):
          if args.get('local_path'):
@@ -40,7 +43,6 @@ class BenchmarkMeta:
          # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
          del cur_dict['data_adapter']
          del cur_dict['model_adapter']
-         del cur_dict['metric_list']
          return cur_dict

      def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
@@ -59,7 +61,7 @@ class Benchmark:
      @classmethod
      def get(cls, name: str) -> 'BenchmarkMeta':
          if name not in BENCHMARK_MAPPINGS:
-             raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+             raise Exception(f'Unknown benchmark: {name}. Available tasks: {list(BENCHMARK_MAPPINGS.keys())}')
          benchmark = BENCHMARK_MAPPINGS[name]
          return benchmark
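For reference, the new BenchmarkMeta fields are consumed through the same @Benchmark.register decorator used by the adapters above. A hypothetical registration sketch; everything named 'my_*' is a placeholder, and the remaining abstract DataAdapter methods are omitted:

# Hypothetical registration showing the 0.12.0-era fields; names and prompts
# are placeholders, not part of the release.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_bench',                               # placeholder
    pretty_name='My Benchmark',                    # new field: display name
    dataset_id='my-org/my-bench',                  # placeholder
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=['AverageAccuracy'],               # metrics are now referenced by name (str), not object
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='{query}',
    system_prompt='You are a helpful assistant.',  # new field: separate from prompt_template
)
class MyBenchAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        full_prompt = self.prompt_template.format(query=input_d['question'])
        return {'data': [full_prompt], 'system_prompt': self.system_prompt}

    # get_gold_answer / parse_pred_result / match omitted for brevity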
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -4,10 +4,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.metrics import exact_match, weighted_mean
+ from evalscope.metrics.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -130,10 +129,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/ceval-exam',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='dev',
      eval_split='val',
+     prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

@@ -202,12 +202,12 @@ class CEVALAdapter(DataAdapter):
          else:
              context = ''

-         full_prompt: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)
+         query: str = context.strip() + self._format_example(input_d=input_d, include_answer=False)

          subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
+         full_prompt = self.prompt_template.format(subset_name=subject_name, query=query)

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -228,9 +228,9 @@ class CEVALAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -5,9 +5,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -106,10 +106,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/cmmlu',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='dev',
      eval_split='test',
+     prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

@@ -165,16 +166,13 @@ class CMMLUAdapter(DataAdapter):
          {'data': [(context, continuation), ...]}

          """
-         prompt = '以下是关于{}的单项选择题。\n\n'.format(self._format_subject(subset_name))
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-
-         context: str = '\n'.join(few_shot_prompts) + '\n'
+         context = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         context = prompt + context

-         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())

-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -195,9 +193,9 @@ class CMMLUAdapter(DataAdapter):
          if eval_type == EvalType.CHECKPOINT:
              return result
          elif eval_type == EvalType.SERVICE:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          elif eval_type == EvalType.CUSTOM:
-             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
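The C-Eval and CMMLU adapters above now build their prompts from the registered prompt_template via str.format instead of a hard-coded instruction string. A small illustration of that formatting step; subject_name and query are placeholders standing in for the SUBJECT_MAPPING entry and the formatted few-shot context plus question:

# Illustration of the templated prompt assembly; values are placeholders.
prompt_template = '以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}'
subject_name = 'computer science'                           # placeholder subject
query = '<few-shot examples>\n<question with choices A-D>'  # placeholder query
full_prompt = prompt_template.format(subset_name=subject_name, query=query)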
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -3,10 +3,11 @@
  import glob
  import json
  import os
+ from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import AverageAccuracy
- from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
+ from evalscope.constants import AnswerKeys
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

@@ -19,12 +20,12 @@ logger = get_logger()
      name='competition_math',
      dataset_id='modelscope/competition_math',
      model_adapter=ChatGenerationModelAdapter,
-     subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     metric_list=['AveragePass@1'],
      few_shot_num=4,
-     train_split='train',
+     train_split=None,
      eval_split='test',
-     prompt_template='Put the final answer in \\boxed{}.',
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class CompetitionMathAdapter(DataAdapter):
      """ To be tested for all models. """
@@ -39,8 +40,14 @@ class CompetitionMathAdapter(DataAdapter):

          super().__init__(**kwargs)

+     def load(self, **kwargs):
+         # default load all levels
+         kwargs['subset_list'] = ['default']
+         data_dict = super().load(**kwargs)
+         return self.reformat_subset(data_dict, subset_key='level')
+
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-         data_dict: dict = {}
+         data_dict = defaultdict(dict)
          for subset_name in subset_list:
              for split_name in [self.train_split, self.eval_split]:
                  if os.path.exists(dataset_name_or_path):
@@ -53,10 +60,7 @@ class CompetitionMathAdapter(DataAdapter):
                  if os.path.exists(file_path):
                      with open(file_path, 'r') as f:
                          split_data.append(json.load(f))
-                 if subset_name in data_dict:
-                     data_dict[subset_name].update({split_name: split_data})
-                 else:
-                     data_dict[subset_name] = {split_name: split_data}
+                 data_dict[subset_name][split_name] = split_data

          return data_dict

@@ -75,13 +79,13 @@ class CompetitionMathAdapter(DataAdapter):
          {'data': [prompt]}
          """
          use_fewshot = self.few_shot_num > 0
-         full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)
-
-         return {'data': [full_prompt], 'system_prompt': self.prompt_template}
+         query = self._generate_prompt(input_d, use_fewshot=use_fewshot)
+         full_prompt = self.prompt_template.format(query=query)
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}

      def get_gold_answer(self, input_d: dict) -> str:
          # Extract the gold answer from the input dict.
-         return remove_boxed(last_boxed_only_string(input_d['solution']))
+         return strip_answer_string(extract_answer(input_d['solution']))

      def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
          """
@@ -96,18 +100,11 @@ class CompetitionMathAdapter(DataAdapter):
          The parsed answer. Depending on the dataset. Usually a string for chat.
          """
          # Note: Use same extraction method for both of checkpoint/service/custom
-         try:
-             result = remove_boxed(last_boxed_only_string(result))
-         except Exception:
-             return None
+         result = strip_answer_string(extract_answer(result))
          return result

      def match(self, gold: str, pred: str) -> float:
-         res = 0
-         if is_equiv(pred, gold):
-             res = 1
-
-         return res
+         return math_equal(pred, gold)

      @classmethod
      def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
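The load_from_disk change above replaces the manual check-and-merge branch with collections.defaultdict. A standalone illustration of why the two forms are equivalent (the sample subset, split, and data are invented):

from collections import defaultdict

subset_name, split_name, split_data = 'Level 1', 'test', [{'problem': '...'}]

# Before: explicit check-and-merge per subset
old_dict = {}
if subset_name in old_dict:
    old_dict[subset_name].update({split_name: split_data})
else:
    old_dict[subset_name] = {split_name: split_data}

# After: defaultdict(dict) creates the inner dict on first access
new_dict = defaultdict(dict)
new_dict[subset_name][split_name] = split_data

assert dict(new_dict) == old_dict  # both yield {'Level 1': {'test': [...]}}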