evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +3 -0
- evalscope/benchmarks/aime/__init__.py +0 -0
- evalscope/benchmarks/aime/aime24_adapter.py +49 -0
- evalscope/benchmarks/aime/aime25_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
- evalscope/benchmarks/benchmark.py +5 -3
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
- evalscope/benchmarks/data_adapter.py +88 -29
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
- evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
- evalscope/benchmarks/musr/__init__.py +0 -0
- evalscope/benchmarks/musr/musr_adapter.py +68 -0
- evalscope/benchmarks/process_bench/__init__.py +0 -0
- evalscope/benchmarks/process_bench/critique_template.txt +13 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
- evalscope/cli/start_app.py +4 -1
- evalscope/cli/start_eval.py +4 -3
- evalscope/cli/start_perf.py +4 -2
- evalscope/collections/evaluator.py +109 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +4 -1
- evalscope/evaluator/evaluator.py +81 -65
- evalscope/metrics/__init__.py +2 -1
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +39 -3
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/base_adapter.py +7 -1
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +7 -2
- evalscope/models/server_adapter.py +106 -61
- evalscope/perf/__init__.py +0 -1
- evalscope/perf/arguments.py +5 -1
- evalscope/perf/http_client.py +2 -2
- evalscope/perf/plugin/api/openai_api.py +11 -1
- evalscope/perf/utils/benchmark_util.py +6 -2
- evalscope/report/app.py +42 -23
- evalscope/run.py +11 -8
- evalscope/third_party/thinkbench/__init__.py +3 -0
- evalscope/third_party/thinkbench/eval.py +264 -0
- evalscope/third_party/thinkbench/infer.py +100 -0
- evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
- evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
- evalscope/third_party/thinkbench/tools/__init__.py +0 -0
- evalscope/third_party/thinkbench/tools/llm.py +47 -0
- evalscope/third_party/thinkbench/tools/utils.py +13 -0
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/utils/model_utils.py +17 -1
- evalscope/utils/utils.py +45 -45
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
- tests/cli/test_run.py +108 -19
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -2,7 +2,6 @@
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import Pass1
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
@@ -17,11 +16,11 @@ logger = get_logger()
     dataset_id='modelscope/humaneval',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
-    metric_list=[
+    metric_list=['Pass@1'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='',
+    prompt_template='Complete the following python code:\n{query}',
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -64,10 +63,10 @@ class HumanevalAdapter(DataAdapter):
            input_d (dict): The raw input. A single data format of the Humaneval:
            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
        """
-
-        full_prompt =
+        query = input_d['prompt']
+        full_prompt = self.prompt_template.format(query=query)
 
-        return {'data': [full_prompt], 'system_prompt': self.
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
 
    @classmethod
    def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/ifeval_adapter.py
CHANGED
@@ -2,9 +2,9 @@ from collections import defaultdict
 from typing import Any, Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import
+from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
-from evalscope.metrics import Metric, mean
+from evalscope.metrics import Metric, mean, metric_registry
 from evalscope.models import ChatGenerationModelAdapter
 
 
@@ -14,10 +14,10 @@ from evalscope.models import ChatGenerationModelAdapter
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
-
-
-
-
+        'prompt_level_strict_acc',
+        'inst_level_strict_acc',
+        'prompt_level_loose_acc',
+        'inst_level_loose_acc',
     ],
     few_shot_num=0,
     train_split=None,
@@ -29,8 +29,14 @@ class IFEvalAdapter(DataAdapter):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        # register metrics
+        metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_strict_acc', object=mean))
+        metric_registry.register(Metric(name='prompt_level_loose_acc', object=mean))
+        metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.
+        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
@@ -41,16 +47,11 @@ class IFEvalAdapter(DataAdapter):
     def match(self, gold: Any, pred: Any) -> Dict:
         return process_results(gold, [pred])
 
-    def compute_metric(self, review_res_list: List[dict]) -> Any:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
         res_dict = defaultdict(list)
         for res in review_res_list:
             for k, v in res.items():
                 res_dict[k].append(v)
 
-
-        for metric in self.metric_list:
-            metric_name = metric.name
-            pred_value = res_dict[metric_name]
-            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
-        return metrics
+        return super().compute_metric(res_dict)
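Note on the change above: IFEval now registers its four accuracy metrics by name at adapter construction time and delegates aggregation to the base DataAdapter. A minimal standalone sketch of that registration pattern follows; the imports are exactly those shown in the diff, while the toy score list and the direct call to mean() are illustrative assumptions rather than package code.

# Sketch of the metric-registration pattern introduced above (assumptions noted inline).
from evalscope.metrics import Metric, mean, metric_registry

# Register a named metric backed by a simple aggregation function, as the IFEval adapter does.
metric_registry.register(Metric(name='prompt_level_strict_acc', object=mean))

# Per-sample review scores for that metric (toy values, not real evaluation output).
scores = [1.0, 0.0, 1.0, 1.0]
print('prompt_level_strict_acc:', mean(scores))  # 0.75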
evalscope/benchmarks/iquiz/iquiz_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import
-from evalscope.metrics import
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
@@ -10,11 +10,11 @@ from evalscope.utils.utils import ResponseParser
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['IQ', 'EQ'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-
+    system_prompt='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
 )
 class IQuizAdapter(DataAdapter):
 
@@ -36,7 +36,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
evalscope/benchmarks/math_500/__init__.py
File without changes

evalscope/benchmarks/math_500/math_500_adapter.py
ADDED
@@ -0,0 +1,58 @@
+from collections import defaultdict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='math_500',
+    dataset_id='AI-ModelScope/MATH-500',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+    metric_list=['AveragePass@1'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class Math500Adapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='level', format='Level {}')
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate the prompt for the model input.
+        """
+        problem = input_d['problem']
+        full_prompt = self.prompt_template.format(query=problem)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Extract the gold answer from the input dict.
+        return strip_answer_string(input_d['answer'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+        """
+        # Note: Use same extraction method for both of checkpoint/service/custom
+        result = strip_answer_string(extract_answer(result))
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        return math_equal(pred, gold)
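For context, a minimal sketch of how the newly added math_500 benchmark might be run once this release is installed. run_task is referenced elsewhere in this diff (evalscope/cli/start_eval.py); TaskConfig is assumed to live in evalscope.config, and the model id and limit below are placeholders, not values taken from the package.

# Illustrative sketch only; model id and limit are placeholders.
from evalscope.config import TaskConfig  # assumed location of TaskConfig
from evalscope.run import run_task       # referenced in this diff (evalscope/cli/start_eval.py)

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Math-7B-Instruct',  # placeholder model id
    datasets=['math_500'],                  # benchmark name registered by the adapter above
    limit=5,                                # evaluate only a handful of samples
)
run_task(task_cfg)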
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -4,17 +4,15 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
-from evalscope.utils import ResponseParser
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/mmlu'
-
 SUBSET_LIST = [
     'high_school_european_history',
     'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
     dataset_id='modelscope/mmlu',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=SUBSET_LIST,
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='train',
     eval_split='test',
-    prompt_template='',
+    prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
 )
 class MMLUAdapter(DataAdapter):
 
@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
            {'data': [full_prompt], 'multi_choices': self.choices}
 
        """
-        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-            self._format_subject(subset_name))
        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
 
        context: str = '\n'.join(few_shot_prompts) + '\n'
        context += self._generate_prompt(input_d=input_d, include_answer=False)
-
+        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        full_prompt
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -3,22 +3,27 @@ from typing import Any, Dict
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys, EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
+SUBSET_LIST = [
+    'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+    'philosophy', 'economics', 'other', 'psychology', 'history'
+]
+
 
 @Benchmark.register(
     name='mmlu_pro',
-    dataset_id='modelscope/
+    dataset_id='modelscope/MMLU-Pro',
     model_adapter=ChatGenerationModelAdapter,
-    subset_list=
-    metric_list=[AverageAccuracy],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='validation',
     eval_split='test',
     prompt_template=
-    '
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
 )
 class MMLUProAdapter(DataAdapter):
 
@@ -26,38 +31,29 @@ class MMLUProAdapter(DataAdapter):
         super().__init__(**kwargs)
 
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-        self.categories = [
-            'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-            'philosophy', 'economics', 'other', 'psychology', 'history'
-        ]
-
-    def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
-        """
-        Generate model prompt from raw input, unify the prompt format for MMLU-Pro benchmark.
-        Return a dict with category as key and list of prompts as value.
-        """
-
-        data_dict = data_dict[self.subset_list[0]]  # Only one subset for MMLU-Pro
-        fewshot_prompts = self.get_fewshot_examples(data_dict)
-
-        # Use the category as key to group the prompts
-        res_dict = defaultdict(list)
-        # generate prompts for each test sample
-        for entry in data_dict[self.eval_split]:
-            prefix = fewshot_prompts[entry['category']]
-            query = prefix + 'Q: ' + entry['question'] + '\n' + \
-                self.__form_options(entry['options']) + '\n'
-
-            prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
-
-            res_dict[entry['category']].append(prompt_d)
-        return res_dict
 
-    def
-        # load
-
-
-
+    def load(self, **kwargs):
+        # default load all data
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category')
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['options']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
                 self.__form_options(d['options']) + '\n' + \
                 d['cot_content'] + '\n\n'
         return prompts
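The rewritten MMLU-Pro adapter, like math_500 above, loads the dataset's single 'default' subset and then regroups it with reformat_subset (here keyed by 'category'). A toy, dependency-free illustration of that regroup-by-key idea follows; it is not the library's implementation, whose internals are not shown in this diff.

# Toy illustration (not library code) of regrouping a single 'default' subset by a key,
# mirroring reformat_subset(data_dict, subset_key='category') from the diff above.
from collections import defaultdict

data_dict = {'default': {'test': [
    {'question': 'q1', 'category': 'math'},
    {'question': 'q2', 'category': 'law'},
    {'question': 'q3', 'category': 'math'},
]}}

regrouped = defaultdict(lambda: {'test': []})
for entry in data_dict['default']['test']:
    regrouped[entry['category']]['test'].append(entry)

print(sorted(regrouped))  # ['law', 'math']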
evalscope/benchmarks/musr/__init__.py
File without changes

evalscope/benchmarks/musr/musr_adapter.py
ADDED
@@ -0,0 +1,68 @@
+import ast
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import ResponseParser
+
+
+@Benchmark.register(
+    name='musr',
+    pretty_name='MuSR',
+    dataset_id='AI-ModelScope/MuSR',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['murder_mysteries', 'object_placements', 'team_allocation'],
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    '{narrative}\n\n{question}\n\n{choices}\nThink step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.',  # noqa: E501
+)
+class MuSRAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F']
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        choices = self.format_choice(ast.literal_eval(input_d['choices']))
+
+        full_prompt = self.prompt_template.format(
+            narrative=input_d['narrative'], question=input_d['question'], choices=choices)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def format_choice(self, options: list):
+        option_str = ''
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return self.choices[input_d['answer_index']]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return exact_match(gold=gold, pred=pred)
evalscope/benchmarks/process_bench/__init__.py
File without changes

evalscope/benchmarks/process_bench/critique_template.txt
ADDED
@@ -0,0 +1,13 @@
+The following is a math problem and a solution (split into paragraphs, enclosed with tags and indexed from 0):
+
+[Math Problem]
+
+{problem}
+
+[Solution]
+
+{tagged_response}
+
+Your task is to review and critique the solution paragraph by paragraph. Once you identify an error in a paragraph, return the index of the paragraph where the earliest error occurs. Otherwise, return the index of -1 (which typically denotes "not found").
+
+Please put your final answer (i.e., the index) in \boxed{{}}.

evalscope/benchmarks/process_bench/process_bench_adapter.py
ADDED
@@ -0,0 +1,96 @@
+import os
+import re
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys, EvalType
+from evalscope.metrics import Metric, mean, metric_registry, simple_f1_score
+from evalscope.models import ChatGenerationModelAdapter
+
+cur_path = os.path.dirname(os.path.abspath(__file__))
+
+
+@Benchmark.register(
+    name='process_bench',
+    pretty_name='ProcessBench',
+    dataset_id='Qwen/ProcessBench',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
+    metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+)
+class ProcessBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.prompt_template = open(os.path.join(cur_path, 'critique_template.txt')).read()
+
+        # register metrics
+        metric_registry.register(Metric(name='error_acc', object=mean))
+        metric_registry.register(Metric(name='correct_acc', object=mean))
+        metric_registry.register(Metric(name='simple_f1_score', object=simple_f1_score))
+
+    def load(self, **kwargs):
+        # default load all levels
+        kwargs['split_as_subset'] = True
+        data_dict = super().load(**kwargs)
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        problem = input_d['problem']
+        steps = input_d['steps']
+        tagged_response = ''
+        for sdx, step in enumerate(steps):
+            tagged_response += f'<paragraph_{sdx}>\n{step}\n</paragraph_{sdx}>\n\n'
+        tagged_response = tagged_response.strip()
+
+        full_prompt = self.prompt_template.format(problem=problem, tagged_response=tagged_response)
+
+        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+        """
+        return int(input_d['label'])
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+        """
+        pred = ProcessBenchAdapter.extract_answer(result)
+        try:
+            pred = int(pred)
+        except Exception:
+            pred = None
+        return pred
+
+    def match(self, gold: int, pred: int) -> float:
+        """
+        Match the gold answer and the predicted answer.
+        """
+        return gold == pred
+
+    def compute_metric(self, review_res_list: list, **kwargs) -> List[dict]:
+        reviews_list = kwargs['reviews_list']
+        error_data = []
+        correct_data = []
+        for res, raw in zip(review_res_list, reviews_list):
+            if raw[AnswerKeys.RAW_INPUT]['label'] == -1:
+                correct_data.append(res)
+            else:
+                error_data.append(res)
+        data = {'error_acc': error_data, 'correct_acc': correct_data, 'simple_f1_score': (correct_data, error_data)}
+        return super().compute_metric(data)
+
+    @staticmethod
+    def extract_answer(solution_text: str):
+        boxed_pattern = r'\\boxed\{([^}]*)\}'
+        matches = re.findall(boxed_pattern, solution_text)
+        if matches:
+            return matches[-1].strip()
+        return None
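ProcessBench scores erroneous and error-free samples separately (error_acc, correct_acc) and combines them through simple_f1_score. Assuming that combination is the harmonic mean of the two accuracies, as in the ProcessBench evaluation protocol, it can be sketched as below; this is an assumption about simple_f1_score's behavior, not code from the package.

# Assumed behavior of the F1-style combination used by ProcessBench (not package code).
def harmonic_f1(correct_acc: float, error_acc: float) -> float:
    # Harmonic mean of accuracy on error-free samples and accuracy on erroneous samples.
    if correct_acc + error_acc == 0:
        return 0.0
    return 2 * correct_acc * error_acc / (correct_acc + error_acc)

print(harmonic_f1(0.8, 0.6))  # ~0.686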
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
    def get_gold_answer(self, input_d: dict) -> str:
        # Get the gold choice
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 
@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=5,
     train_split='dev',
     eval_split='test',
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -259,7 +258,7 @@ class TruthfulQaAdapter(DataAdapter):
 
        return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
        """
        Compute evaluation result by specific metric for each subset.
 
@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                logger.error(f'** Unknown review_res: {review_res_d}')
 
        # To get mc2 score
-        return [{
-
-
-
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)
evalscope/cli/start_app.py
CHANGED
@@ -3,7 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.report.app import add_argument, create_app
 
 
 def subparser_func(args):
@@ -22,9 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.report.app import add_argument
+
         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.report.app import create_app
+
         create_app(self.args)
evalscope/cli/start_eval.py
CHANGED
@@ -1,10 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from argparse import ArgumentParser
 
-from evalscope.arguments import add_argument
 from evalscope.cli.base import CLICommand
-from evalscope.run import run_task
 
 
 def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.arguments import add_argument
+
         parser = parsers.add_parser(EvalCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.run import run_task
+
         run_task(self.args)