evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -1,57 +1,41 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import json
 import os
 
-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/race'
-
-SUBSET_LIST = ['high', 'middle']
-
-SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-
 
+@Benchmark.register(
+    name='race',
+    dataset_id='modelscope/race',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['high', 'middle'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=3,
+    train_split='train',
+    eval_split='test',
+)
 class RACEAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 3-shot examples by system for RACE.')
-            few_shot_num = 3
-
+    def __init__(self, **kwargs):
+        few_shot_num = kwargs.get('few_shot_num', 3)
         if few_shot_num > 3:
            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-            few_shot_num = 3
+            kwargs['few_shot_num'] = 3
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -98,13 +82,13 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -116,98 +100,18 @@ class RACEAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
+        if eval_type == EvalType.CHECKPOINT:
             return result
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
            raise ValueError(f'Unknown eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"RACE",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"High",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"high",
-                            "score":0.2528
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                normalize_score(score=domain_weighted_avg_acc),
-                'subset': [{
-                    'name': subset_name,
-                    'score': subset_score
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'race',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
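The same refactor repeats across the benchmark adapters in this release: module-level constants (DATASET_ID, SUBSET_LIST, SUBJECT_MAPPING) and the hand-rolled compute_metric/gen_report methods give way to a declarative @Benchmark.register(...) decorator, with reporting handled centrally by the new evalscope/report package. Below is a minimal sketch of a custom adapter under this scheme; the benchmark name, dataset id, and answer-parsing logic are invented for illustration, only the registration keywords and method names visible in this diff are assumed, and other hooks (prompt building, on-disk loading) are omitted.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                            # hypothetical benchmark name
    dataset_id='my-org/my_qa',               # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyQaAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Defaults now come from the register() call; __init__ only adjusts kwargs.
        super().__init__(**kwargs)

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d.get('answer', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        return result

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)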
evalscope/benchmarks/trivia_qa/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -1,49 +1,35 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
 import csv
-import numpy as np
 import os
-from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.
-from evalscope.
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/trivia_qa'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='trivia_qa',
+    dataset_id='modelscope/trivia_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class TriviaQaAdapter(DataAdapter):
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
 
-
-        logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-        few_shot_num = 5
-
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -113,16 +99,16 @@ class TriviaQaAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt =
+        full_prompt = context
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
         ans: list = input_d.get('ideal', [])
         return ans
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer.
 
@@ -134,73 +120,11 @@ class TriviaQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
-            return result
-        else:
-            raise ValueError(f'Unknown eval_type: {eval_type}')
+        return result
 
     def match(self, gold: list, pred: str) -> float:
-
-
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TriviaQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.3389
-                        }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'trivia_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        is_correct = any([cand in pred for cand in gold])
+        return 1 if is_correct else 0
 
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
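The behavioural change buried in the TriviaQA diff is the new match(): instead of routing scores through weighted_mean and a bespoke gen_report, the adapter now gives credit whenever any gold alias occurs as a substring of the prediction, and aggregation is left to the shared AverageAccuracy metric. A self-contained sketch of that containment rule (no evalscope imports needed):

def match(gold: list, pred: str) -> int:
    # Same rule as the new TriviaQaAdapter.match above: any gold alias contained in the prediction counts.
    is_correct = any([cand in pred for cand in gold])
    return 1 if is_correct else 0


print(match(['Barcelona', 'Barcelona, Spain'], 'A: They were held in Barcelona.'))  # -> 1
print(match(['Paris'], 'A: London hosted the games.'))                              # -> 0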
evalscope/benchmarks/truthful_qa/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -6,20 +6,30 @@ import numpy as np
 import os
 from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ContinuationLogitsModelAdapter
+from evalscope.utils import get_logger, normalize_score
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/truthful_qa'
 # task_list = ['generation', 'multiple_choice']
-SUBSET_LIST = ['multiple_choice']
 
 
+@Benchmark.register(
+    name='truthful_qa',
+    dataset_id='modelscope/truthful_qa',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['multiple_choice'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='validation',
+)
 class TruthfulQaAdapter(DataAdapter):
     """
     Adapter for TruthfulQA benchmark.
@@ -40,35 +50,14 @@ class TruthfulQaAdapter(DataAdapter):
                  'Q: Where were the 1992 Olympics held?\n'
                  'A: The 1992 Olympics were held in Barcelona, Spain.')
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -215,7 +204,7 @@ class TruthfulQaAdapter(DataAdapter):
         # TODO: generation sub-task to be added
         return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
         """
         Parse the model output to get the answer.
 
@@ -227,11 +216,11 @@ class TruthfulQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
             return result
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -270,7 +259,7 @@ class TruthfulQaAdapter(DataAdapter):
 
         return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-    def compute_metric(self, review_res_list: List[dict]) ->
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         Compute evaluation result by specific metric for each subset.
 
@@ -295,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-
-
-
-
-
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TruthfulQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.2527,
-                    "subset":[
-                        {
-                            "name":"multiple_choice",
-                            "score":0.3157
-                        },
-                        # {
-                        #     "name":"generation",
-                        #     "score":0.2631
-                        # }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'truthful_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{
+            'metric_name': self.metric_list[0].name,
+            'score': self.metric_list[0].object(mc2_list),
+            'num': len(mc2_list)
+        }]
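Note the new return shape of compute_metric: instead of a single float that fed a bespoke gen_report, it now returns a list of per-metric dicts, and metric_list entries are named metric objects with .name and .object attributes rather than {'name': ..., 'object': ...} dicts. The sketch below uses a namedtuple as a stand-in for AverageAccuracy, whose real definition (imported from evalscope.metrics) is not part of this diff:

from collections import namedtuple

# Hypothetical stand-in for the named metric objects referenced above.
Metric = namedtuple('Metric', ['name', 'object'])
average_accuracy = Metric(name='AverageAccuracy', object=lambda scores: sum(scores) / len(scores))

mc2_list = [0.42, 0.58, 0.73]  # made-up per-sample mc2 scores
result = [{
    'metric_name': average_accuracy.name,       # was self.metric_list[0]['name'] in 0.8.0
    'score': average_accuracy.object(mc2_list),
    'num': len(mc2_list),
}]
print(result)  # [{'metric_name': 'AverageAccuracy', 'score': 0.576..., 'num': 3}]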
evalscope/cli/cli.py
CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 
+from evalscope.cli.start_app import StartAppCMD
 from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
 
@@ -12,6 +13,7 @@ def run_cmd():
 
     PerfBenchCMD.define_args(subparsers)
     EvalCMD.define_args(subparsers)
+    StartAppCMD.define_args(subparsers)
 
     args = parser.parse_args()
 
evalscope/cli/start_app.py
ADDED
@@ -0,0 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from argparse import ArgumentParser
+
+from evalscope.cli.base import CLICommand
+from evalscope.report.app import add_argument, create_app
+
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return StartAppCMD(args)
+
+
+class StartAppCMD(CLICommand):
+    name = 'app'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        create_app(self.args)