evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED

@@ -14,7 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     pretty_name='General-QA',
-    description='
+    description='A general question answering dataset for custom evaluation. '
+    'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).',  # noqa: E501
     tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
@@ -25,13 +26,21 @@ logger = get_logger()
     prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
-    # TODO: set few_shot_num

     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)

     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        """
+        Load dataset from the given path or dataset name.
+
+        Args:
+            dataset_name_or_path (str): Path to dataset directory or file.
+            subset_list (list): List of subset names to load.
+
+        Returns:
+            dict: Loaded dataset organized by subset.
+        """
         dataset_name_or_path = dataset_name_or_path or self.dataset_id
         subset_list = subset_list or self.subset_list

@@ -61,58 +70,64 @@ class GeneralQAAdapter(DataAdapter):

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
+        Generate prompt for the model based on input data.
+
         Args:
-            input_d:
-
-
+            input_d (dict): Input data dictionary.
+            subset_name (str): Name of the subset.
+            few_shot_list (list): List of few-shot examples.

         Returns:
-
-
+            dict: Dictionary containing the generated prompt.
         """
-
-        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-        if len(history) > 0:
-            logger.warning('The history is not included in the prompt for GeneralQA. \
-                To be supported in the future.')
-
+        messages = input_d.get('messages')
         query = input_d.get('question', '') or input_d.get('query', '')
         system_prompt = input_d.get('system')
         prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)

     def get_gold_answer(self, input_d: dict) -> str:
         """
+        Extract the gold (reference) answer from the input data.
+
         Args:
-            input_d
+            input_d (dict): Input data dictionary.

         Returns:
-
-
+            str: Gold answer string.
         """
-        return input_d.get('answer'
+        return input_d.get('answer') or input_d.get('response')

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
+        Parse the prediction result.
+
         Args:
-            result:
+            result (str): Model prediction result.
+            raw_input_d (dict, optional): Original input data.
+            eval_type (str): Evaluation type.

         Returns:
-
-
+            str: Parsed prediction result.
         """
         return result

     def match(self, gold: str, pred: str) -> dict:
         """
+        Compute metric scores between gold and predicted answers.
+
         Args:
-            gold:
-            pred:
+            gold (str): Gold answer.
+            pred (str): Predicted answer.

         Returns:
-
-
+            dict: Dictionary of computed metric scores.
         """
+        # reference free metrics
+        if gold is None:
+            return {'AverageAccuracy': -1}
+
+        # calculate rouge and bleu scores
         res = dict()
         if 'AverageRouge' in self.metric_list:
             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +143,13 @@ class GeneralQAAdapter(DataAdapter):

     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
         """
-
+        Compute weighted mean of the metric scores for all samples.

         Args:
-            review_res_list:
+            review_res_list (list): List of metric score dictionaries.

         Returns:
-
-
+            list: List of dictionaries with averaged metric results.
         """
         items = super().compute_dict_metric(review_res_list, **kwargs)
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
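Note: the reworked GeneralQAAdapter reads an optional `messages` list, a `query` (or `question`) field, an optional `system` prompt, and an `answer` (or `response`) reference, and returns `{'AverageAccuracy': -1}` when no reference is available. A minimal sketch of a custom JSONL record built from those field names follows; the file name and on-disk layout are assumptions, not taken from this diff.

    import json

    # Hypothetical records using the fields read by GeneralQAAdapter:
    # 'query' (or 'question'), optional 'system', and 'response' (or 'answer').
    records = [
        {'system': 'You are a concise assistant.',
         'query': 'What is the capital of France?',
         'response': 'Paris'},
        # No reference answer: match() takes the reference-free branch
        # and reports {'AverageAccuracy': -1} for this sample.
        {'query': 'Summarize the plot of Hamlet in one sentence.'},
    ]

    with open('general_qa_default.jsonl', 'w', encoding='utf-8') as f:  # assumed file name
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')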
evalscope/benchmarks/hellaswag/hellaswag_adapter.py CHANGED

@@ -6,9 +6,9 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser

 # flake8: noqa

evalscope/benchmarks/hle/__init__.py ADDED (file without changes)

evalscope/benchmarks/hle/hle_adapter.py ADDED

@@ -0,0 +1,118 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Biology/Medicine',
+    'Chemistry',
+    'Computer Science/AI',
+    'Engineering',
+    'Humanities/Social Science',
+    'Math',
+    'Physics',
+    'Other',
+]
+
+
+@Benchmark.register(
+    name='hle',
+    pretty_name="Humanity's-Last-Exam",
+    tags=['Knowledge', 'QA'],
+    description=
+    'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.',  # noqa: E501
+    dataset_id='cais/hle',
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+)
+class HLEAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.llm_as_a_judge = True
+
+    def load(self, **kwargs):
+        kwargs['subset_list'] = ['default']
+        data_dict = super().load(**kwargs)
+        return self.reformat_subset(data_dict, subset_key='category', format='{}')
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        # remove image preview
+        input_d.pop('image_preview', None)
+        input_d.pop('rationale_image', None)
+        # generate prompt
+        question = input_d['question']
+        prompt = self.prompt_template.format(query=question)
+        image = input_d.get('image', None)
+        # build messages for multi-modal input
+        messages = []
+        if self.system_prompt:
+            messages.append({'role': 'system', 'content': self.system_prompt})
+        if image:
+            messages.append({
+                'role':
+                'user',
+                'content': [{
+                    'type': 'text',
+                    'text': prompt
+                }, {
+                    'type': 'image_url',
+                    'image_url': {
+                        'url': image
+                    }
+                }]
+            })
+        else:
+            messages.append({'role': 'user', 'content': prompt})
+        return self.gen_prompt_data(prompt='', messages=messages)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        # Extract the answer from the model output \boxed{answer}
+        match = re.search(r'\\boxed{([^}]*)}', result)
+        if match:
+            return match.group(1).strip()
+        else:
+            logger.warning(f'No answer found in the model output: {result}')
+            return ''
+
+    def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str) -> dict:
+        # simple match
+        return {
+            'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
+        }
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # get grading response
+        prompt = judge.build_prompt(pred, gold, question)
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+        return {
+            'AverageAccuracy': score,
+            'response': judge_response,
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        # zip dict answers
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+        return super().compute_metric(res_dict, **kwargs)
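Note: the new `hle` benchmark sets `llm_as_a_judge = True`, so accuracy is graded by an LLM judge rather than plain string matching. A minimal sketch of invoking it, assuming evalscope's usual `TaskConfig`/`run_task` entry points; the model name and the small `limit` are illustrative, and any judge endpoint configuration required by `LLMJudge` is omitted here.

    from evalscope.run import run_task        # assumed entry point
    from evalscope.config import TaskConfig   # assumed entry point

    task_cfg = TaskConfig(
        model='qwen-plus',    # hypothetical model identifier
        datasets=['hle'],     # benchmark name registered by hle_adapter.py
        limit=10,             # small smoke-test run
    )
    run_task(task_cfg=task_cfg)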
evalscope/benchmarks/humaneval/humaneval_adapter.py CHANGED

@@ -22,7 +22,8 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template=
+    prompt_template=
+    'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}',  # noqa: E501
     extra_params={
         'num_workers': 4,
         'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):

     @classmethod
     def _postprocess(cls, text: str) -> str:
-
-
-
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith(' '):
-            if text.startswith(' '):
-                text = ' ' + text.lstrip()
-            else:
-                text = '\n'.join([' ' + line for line in text.split('\n')])
+        blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
+        if len(blocks) >= 1:
+            text = blocks[0]
         return text

     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
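Note: `_postprocess` now simply keeps the first fenced code block, if any, instead of the old line-trimming heuristics. A standalone check of that regex on a typical completion:

    import re

    completion = (
        'Here is the implementation:\n'
        '```python\n'
        'def add(a, b):\n'
        '    return a + b\n'
        '```\n'
        'Hope this helps.'
    )

    # Same pattern as the new _postprocess: capture the body of the first ``` block.
    blocks = re.findall(r'```\w*\n(.*?)```', completion, re.DOTALL)
    code = blocks[0] if blocks else completion  # same fallback: keep the raw text when no block is found
    print(code)
    # def add(a, b):
    #     return a + b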
evalscope/benchmarks/ifeval/ifeval_adapter.py CHANGED

@@ -2,7 +2,6 @@ from collections import defaultdict
 from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry

@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        return result
-
     def match(self, gold: Any, pred: Any) -> Dict:
+        from evalscope.benchmarks.ifeval.utils import process_results
+
         return process_results(gold, [pred])

     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
evalscope/benchmarks/iquiz/iquiz_adapter.py CHANGED

@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py CHANGED

@@ -69,12 +69,6 @@ class LiveCodeBenchAdapter(DataAdapter):
         # Extract the gold answer from the input dict.
         return input_d

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        return result
-
     def match(self, gold: dict, pred: str) -> float:
         from .evaluate_utils import codegen_metrics
         from .extract_utils import extract_code_generation
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py CHANGED

@@ -3,7 +3,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser

 SUBSET_LIST = ['default']

evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED

@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=SUBSET_LIST,
     metric_list=['AverageAccuracy'],
-    few_shot_num=
+    few_shot_num=0,
     train_split='train',
     eval_split='test',
     prompt_template=
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED

@@ -4,7 +4,7 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser

 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py CHANGED

@@ -4,8 +4,8 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser

 logger = get_logger()

evalscope/benchmarks/musr/musr_adapter.py CHANGED

@@ -4,7 +4,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(
evalscope/benchmarks/race/race_adapter.py CHANGED

@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

evalscope/benchmarks/tau_bench/__init__.py ADDED (file without changes)

evalscope/benchmarks/tau_bench/tau_bench_adapter.py ADDED

@@ -0,0 +1,110 @@
+import importlib
+from collections import defaultdict
+from typing import Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.utils import get_logger
+
+logger = get_logger()
+
+
+@Benchmark.register(
+    name='tau_bench',
+    pretty_name='τ-bench',
+    tags=['Reasoning', 'Agent', 'Function Calling'],
+    description='A benchmark emulating dynamic conversations between a user (simulated by language models) '
+    'and a language agent provided with domain-specific API tools and policy guidelines. '
+    'Please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating and set a user model. ',  # noqa: E501
+    dataset_id='https://github.com/sierra-research/tau-bench',
+    model_adapter='tau_bench_server',
+    subset_list=['airline', 'retail'],
+    metric_list=['Pass^1'],
+    eval_split='test',
+    extra_params={
+        'user_model': 'qwen-plus',
+        'api_key': 'EMPTY',
+        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+        'generation_config': {
+            'temperature': 0.7,
+            'max_new_tokens': 1024
+        }
+    })
+class TauBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        spec = importlib.util.find_spec('tau_bench')
+        if spec is None:
+            raise ImportError(
+                '`tau_bench` not found, please install it with `pip install git+https://github.com/sierra-research/tau-bench` before evaluating.'  # noqa: E501
+            )
+
+        metric_registry.register(Metric(name='Pass^1', object=mean))
+
+        # setup user model args
+        extra_params = kwargs.get('extra_params', {})
+        self.user_model = extra_params.get('user_model', 'qwen-plus')
+        self.api_key = extra_params.get('api_key', 'EMPTY')
+        self.api_base = extra_params.get('api_base', 'https://dashscope.aliyuncs.com/compatible-mode/v1')
+        self.generation_config = extra_params.get('generation_config', {'temperature': 0.7, 'max_new_tokens': 1024})
+
+        self._patch_env_completion()
+
+    def _patch_env_completion(self) -> str:
+        from tau_bench.envs.user import LLMUserSimulationEnv
+
+        def new_generate_next_message(self, messages):
+            from evalscope.models import ServerModelAdapter
+
+            user_server = ServerModelAdapter(
+                api_url=adapter_instance.api_base,
+                model_id=adapter_instance.user_model,
+                api_key=adapter_instance.api_key)
+            request_json = user_server.make_request(
+                input_item={'messages': messages}, infer_cfg=adapter_instance.generation_config)
+            res = user_server.send_request(request_json)
+
+            message = res['choices'][0]['message']
+            self.messages.append(message)
+            self.total_cost = 0
+            return message['content']
+
+        # get the current instance of TauBenchAdapter
+        adapter_instance = self
+        LLMUserSimulationEnv.generate_next_message = new_generate_next_message
+
+    def load(self, **kwargs):
+        from tau_bench.envs import get_env
+
+        data_dict = defaultdict(dict)
+        for env_name in self.subset_list:
+            logger.info(f'Loading TauBench environment: {env_name}')
+            env = get_env(
+                env_name=env_name,
+                user_strategy='llm',
+                user_model='dummy',  # Use dummy model to prevent errors
+                user_provider='openai',  # Use dummy provider to prevent errors
+                task_split=self.eval_split,
+            )
+            tasks = []
+            for i in range(len(env.tasks)):
+                tasks.append({
+                    'task_index': i,
+                    'env_name': env_name,
+                })
+            data_dict[env_name][self.eval_split] = tasks
+
+        return data_dict
+
+    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+        return self.gen_prompt_data(extra_data=input_d)
+
+    def get_gold_answer(self, input_d):
+        return ''
+
+    def match(self, gold, pred):
+        import json
+        res = json.loads(pred)
+        return res.get('reward', 0.0)
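Note: TauBenchAdapter reads its simulated-user settings from `extra_params` (the register defaults above). A sketch of an override using only the keys shown in this diff; passing it through evalscope's per-dataset arguments (for example `dataset_args={'tau_bench': {'extra_params': ...}}`) is assumed plumbing, not something shown here.

    # Override the user-simulator settings consumed by TauBenchAdapter.__init__.
    tau_bench_extra_params = {
        'user_model': 'qwen-plus',
        'api_key': 'sk-xxx',  # placeholder credential
        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'generation_config': {
            'temperature': 0.7,
            'max_new_tokens': 1024,
        },
    }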
evalscope/benchmarks/tool_bench/tool_bench_adapter.py CHANGED

@@ -1,3 +1,4 @@
+import json
 from typing import Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
@@ -8,7 +9,7 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
-    tags=['Reasoning', 'Agent'],
+    tags=['Reasoning', 'Agent', 'Function Calling'],
     description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
     'It includes various subsets such as in-domain and out-of-domain, '
     'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
@@ -40,6 +41,11 @@ class ToolBenchAdapter(DataAdapter):
         for message in messages:
             if 'name' in message:
                 del message['name']
+            if 'role' in message:
+                if message['role'] == 'function':
+                    content = json.dumps(message, ensure_ascii=False)
+                    message['role'] = 'user'
+                    message['content'] = content
         return self.gen_prompt_data(prompt='', messages=messages)

     def get_gold_answer(self, input_d: dict) -> str:
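Note: ToolBenchAdapter now folds `role: 'function'` messages into user turns, serializing the whole original message as JSON content, presumably so OpenAI-compatible endpoints that do not accept the function role still receive the tool output. A standalone illustration of that rewrite:

    import json

    message = {'role': 'function', 'content': '{"temperature": 23, "unit": "C"}'}

    # Same rewrite as in gen_prompt: serialize the message and resend it as a user turn.
    if message.get('role') == 'function':
        message['content'] = json.dumps(message, ensure_ascii=False)
        message['role'] = 'user'

    # message is now a user turn whose content is the JSON-serialized original message.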
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED

@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']

-
+        if self.few_shot_num > 0:
+            sys_prompt = get_sys_prompt(input_d)
+        else:
+            sys_prompt = None
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context

-        return self.gen_prompt_data(full_prompt)
+        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)

     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
@@ -124,7 +127,9 @@ class TriviaQaAdapter(DataAdapter):
         return result

     def match(self, gold: list, pred: str) -> float:
-
+        lower_pred = pred.lower()
+        gold = [g.lower() for g in gold]
+        is_correct = any([cand in lower_pred for cand in gold])
         return 1 if is_correct else 0

     @classmethod
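Note: TriviaQA matching is now a case-insensitive containment check against any of the gold aliases. A tiny standalone check of the same logic:

    def trivia_match(gold, pred):
        # Mirrors TriviaQaAdapter.match: case-insensitive substring check against any alias.
        lower_pred = pred.lower()
        gold = [g.lower() for g in gold]
        return 1 if any(cand in lower_pred for cand in gold) else 0

    print(trivia_match(['Paris', 'City of Light'], 'The answer is paris.'))  # 1
    print(trivia_match(['Paris'], 'I think it is Lyon.'))                    # 0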
evalscope/benchmarks/utils.py CHANGED

@@ -2,8 +2,7 @@ from dataclasses import asdict, dataclass
 from functools import wraps
 from typing import Dict, List, Optional, Union

-from
-from evalscope.utils.filters import Filter
+from .filters import Filter


 @dataclass
@@ -14,6 +13,7 @@ class PromptData:
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
     messages: Optional[List[dict]] = None
+    extra_data: Optional[Dict] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
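Note: PromptData gains an `extra_data` slot (used by the new τ-bench adapter via `gen_prompt_data(extra_data=...)`), and `to_dict()` keeps dropping unset fields. A reduced stand-in showing the same dataclass pattern; only the fields visible in the hunk are included, so this is a sketch rather than the full class.

    from dataclasses import asdict, dataclass
    from typing import Dict, List, Optional

    @dataclass
    class PromptDataSketch:  # hypothetical stand-in for evalscope's PromptData
        multi_choices: Optional[List[str]] = None
        id: Optional[str] = None
        messages: Optional[List[dict]] = None
        extra_data: Optional[Dict] = None

        def to_dict(self) -> Dict:
            # Same trick as the real to_dict: drop fields that were never set.
            return {k: v for k, v in asdict(self).items() if v is not None}

    print(PromptDataSketch(extra_data={'task_index': 0, 'env_name': 'airline'}).to_dict())
    # {'extra_data': {'task_index': 0, 'env_name': 'airline'}}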
evalscope/benchmarks/winogrande/winogrande_adapter.py CHANGED

@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser


 @Benchmark.register(