evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -3,54 +3,43 @@ import numpy as np
 import os
 import re
 
-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/hellaswag'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='hellaswag',
+    dataset_id='modelscope/hellaswag',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='validation',
+    prompt_template=
+    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+)
 class HellaSwagAdapter(DataAdapter):
 
     choices = ['0', '1', '2', '3']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for HellaSwag.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -100,13 +89,17 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {
+        return {
+            'data': ctx_continuation_pair_list,
+            'multi_choices': self.choices,
+            'system_prompt': self.prompt_template
+        }
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d['label']
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 

@@ -118,7 +111,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]

@@ -126,76 +119,16 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type ==
-            return result
-        elif eval_type ==
-            return result
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option(result)
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option(result)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"HellaSwag",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'hellaswag',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _preprocess(cls, text):
         text = text.strip()
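
The hellaswag hunk above shows the core of the 0.8.2 → 0.10.0 refactor: module-level DATASET_ID/SUBSET_LIST constants and long __init__ signatures give way to a @Benchmark.register(...) decorator plus a **kwargs constructor, with dataset, splits, metrics and the model adapter declared in one place. A minimal sketch of how a custom adapter might plug into that registry follows; the benchmark name, dataset id, field names ('question', 'answer') and prompt wording are illustrative assumptions, while the decorator fields and the gen_prompt/get_gold_answer/parse_pred_result/match hooks come from the hunks in this diff.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                       # illustrative benchmark name, not part of this release
    dataset_id='my-org/my-qa-dataset',  # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='Answer the question concisely.',
)
class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Benchmark-level defaults come from the decorator; callers may override them via kwargs.
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Same payload shape as the refactored adapters: a list of prompts plus a system prompt.
        return {'data': [input_d['question']], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)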
evalscope/benchmarks/humaneval/__init__.py

@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,38 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
-import os
 import re
-from tqdm import tqdm
-from typing import List
 
-from evalscope.benchmarks
-from evalscope.metrics
-from evalscope.
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Pass1
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/humaneval'
-SUBSET_LIST = ['openai_humaneval']
-
 # Example:
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
+@Benchmark.register(
+    name='humaneval',
+    dataset_id='modelscope/humaneval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['openai_humaneval'],
+    metric_list=[Pass1],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
 class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 prompt_template: str = 'Complete the following python code:\n',
-                 **kwargs):
+    def __init__(self, **kwargs):
         try:
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness

@@ -41,29 +37,15 @@ class HumanevalAdapter(DataAdapter):
                 'https://github.com/openai/human-eval/tree/master#installation , '
                 'Note that you need to enable the execution code in the human_eval/execution.py first.')
 
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
         self.k = [1]
         self.num_workers = 4
         self.timeout = 4.0
-        self.outputs = kwargs.get('outputs', None)
 
         self.read_problems_func = stream_jsonl
        self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -83,80 +65,9 @@ class HumanevalAdapter(DataAdapter):
         {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
         full_prompt = input_d['prompt']
-        full_prompt = f'
-
-        return {'data': [full_prompt]}
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = ''
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
-
-        # predict
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
 
-        self.
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(
-            sample_file=ans_out_file,
-            k=self.k,
-            n_workers=self.num_workers,
-            timeout=self.timeout,
-            problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.reports_dir
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info('** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'HumanEval',
-            metric='pass@1',
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     @classmethod
     def _postprocess(cls, text: str) -> str:

@@ -182,19 +93,6 @@ class HumanevalAdapter(DataAdapter):
         text = '\n'.join([' ' + line for line in text.split('\n')])
         return text
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         return self._postprocess(result)
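
One detail worth noting across the humaneval and hellaswag hunks is the payload contract that gen_prompt now follows: a 'data' list with one entry per request plus a 'system_prompt' carrying the benchmark-level prompt_template (hellaswag additionally passes 'multi_choices'). A small sketch of the two shapes; the field names come from the hunks above, the concrete values are placeholders.

# Generation-style benchmark (humaneval): a single code prompt, empty system prompt in 0.10.0.
humaneval_payload = {
    'data': ['def add(a, b):\n    """Return the sum of a and b."""\n'],
    'system_prompt': '',
}

# Continuation-logits benchmark (hellaswag): (context, continuation) pairs plus the choice labels.
hellaswag_payload = {
    'data': [('A man sits on a roof.', ' He starts pulling up roofing.')],
    'multi_choices': ['0', '1', '2', '3'],
    'system_prompt': 'Respond with the index of sentence that makes the most sense, ...',
}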
evalscope/benchmarks/ifeval/__init__.py: File without changes
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -0,0 +1,57 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.constants import EvalType
+from evalscope.metrics import Metric, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import normalize_score
+
+
+@Benchmark.register(
+    name='ifeval',
+    dataset_id='opencompass/ifeval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[
+        Metric(name='prompt_level_strict_acc', object=mean),
+        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+        Metric(name='prompt_level_loose_acc', object=mean),
+        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+    ],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    prompt_template='',
+)
+class IFEvalAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        return result
+
+    def match(self, gold: Any, pred: Any) -> Dict:
+        return process_results(gold, [pred])
+
+    def compute_metric(self, review_res_list: List[dict]) -> Any:
+        # aggregate review results
+        res_dict = defaultdict(list)
+        for res in review_res_list:
+            for k, v in res.items():
+                res_dict[k].append(v)
+
+        metrics = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            pred_value = res_dict[metric_name]
+            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+        return metrics