evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/__init__.py CHANGED
@@ -1,3 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ from evalscope.config import TaskConfig
+ from evalscope.run import run_task
  from .version import __release_datetime__, __version__
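With these two exports the Python API is reachable from the package root. A minimal usage sketch, assuming evalscope 0.10.0 is installed; the TaskConfig field names below (model, datasets, limit) mirror the CLI flags and are an assumption, not something this hunk shows:

# Hypothetical quick-start; adjust field names to the installed version.
from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',  # hypothetical model id
    datasets=['arc'],                  # benchmark names registered via Benchmark.register
    limit=5,                           # cap samples per subset while smoke-testing
)
run_task(task)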
evalscope/arguments.py CHANGED
@@ -1,6 +1,8 @@
  import argparse
  import json

+ from evalscope.constants import EvalBackend, EvalStage, EvalType
+

  class ParseStrArgsAction(argparse.Action):

@@ -31,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
      # yapf: disable
      # Model-related arguments
      parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+     parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
      parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

      # Template-related arguments
@@ -47,10 +50,13 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

      # Evaluation-related arguments
-     parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
-     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
+     parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
+                         choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
+     parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
+                         choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
-     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
+     parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
+                         choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
      parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

      # Cache and working directory arguments
@@ -62,6 +68,8 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
      parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
      parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
+     parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
+     parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
      # yapf: enable

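The evaluation flags are now validated through argparse choices, and two new flags describe a remote API endpoint. A small self-contained sketch of the same pattern; the literal choice values are assumptions standing in for the EvalType/EvalStage constants:

import argparse

# Stand-in values; the real CLI takes these from evalscope.constants.
EVAL_TYPES = ['checkpoint', 'custom', 'service']
STAGES = ['all', 'infer', 'review']

parser = argparse.ArgumentParser()
parser.add_argument('--eval-type', type=str, choices=EVAL_TYPES)
parser.add_argument('--stage', type=str, default='all', choices=STAGES)
parser.add_argument('--api-key', type=str, default='EMPTY')
parser.add_argument('--api-url', type=str, default=None)

# Invalid values now fail at parse time instead of deep inside a run.
args = parser.parse_args(['--eval-type', 'service', '--api-url', 'http://127.0.0.1:8000/v1'])
print(args.eval_type, args.stage, args.api_url)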
evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py CHANGED
@@ -3,7 +3,6 @@ Code adapated from https://github.com/mlfoundations/open_clip/blob/main/src/trai
  Thanks to the authors of OpenCLIP
  """

- import logging
  import torch
  import torch.nn.functional as F
  from contextlib import suppress
evalscope/backend/rag_eval/utils/llm.py CHANGED
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional

  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter


  class LLM:
evalscope/benchmarks/__init__.py CHANGED
@@ -1,4 +1,23 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import importlib
+ import os

- from evalscope.benchmarks.benchmark import Benchmark
+ from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
  from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+ # Using glob to find all files matching the pattern
+ pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
+ files = glob.glob(pattern, recursive=False)
+
+ for file_path in files:
+     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
+         # Convert file path to a module path
+         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
+         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
+         full_path = f'evalscope.benchmarks.{module_path}'
+         importlib.import_module(full_path)
+         # print(f'Importing {full_path}')
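Importing evalscope.benchmarks is now enough to populate the benchmark registry: the glob loop imports every */*_adapter.py module, which runs each adapter's @Benchmark.register decorator (defined in benchmark.py further down). A hedged sketch of inspecting the result, assuming the 0.10.0 package is installed:

import evalscope.benchmarks  # noqa: F401  (imported for its registration side effect)
from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS

# Expected to include names such as 'arc', 'bbh', 'ceval', ... once the imports have run.
print(sorted(BENCHMARK_MAPPINGS))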
evalscope/benchmarks/arc/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
- from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -3,40 +3,35 @@
  import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/ai2_arc'
-
- # task_list = ['ARC-Easy', 'ARC-Challenge']
- SUBSET_LIST = ['ARC-Challenge']
-

+ @Benchmark.register(
+     name='arc',
+     dataset_id='modelscope/ai2_arc',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['ARC-Easy', 'ARC-Challenge'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=0,
+     train_split='train',
+     eval_split='test',
+     prompt_template='',
+ )
  class ARCAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  prompt_template: str = '',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', None)
          if few_shot_num is None:
              # Use 0-shot by default
              logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +40,7 @@ class ARCAdapter(DataAdapter):
          if few_shot_num != 0:
              logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             prompt_template=prompt_template,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          """
@@ -121,18 +109,16 @@
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
          context: str = '\n'.join(few_shot_prompts)

-         context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
          # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
          full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
          return input_d.get('answerKey', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -144,12 +130,12 @@
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':
+         elif eval_type == EvalType.SERVICE:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices)  # TODO: to be checked !
-         elif eval_type == 'custom':
+         elif eval_type == EvalType.CUSTOM:
              return ResponseParser.parse_first_option_with_choices(
                  text=result, options=self.choices)  # TODO: to be checked !
          else:
@@ -158,70 +144,6 @@
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"ARC",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.4128,
-                     "subset":[
-                         {
-                             "name":"ARC-Easy",
-                             "score":0.5632
-                         },
-                         {
-                             "name":"ARC-Challenge",
-                             "score":0.3157
-                         }
-                     ]
-                 }
-             ],
-             "total_num":7800
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'arc',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

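The ARC adapter illustrates the new shape of a benchmark: dataset id, subsets, metrics and splits move from __init__ arguments into the @Benchmark.register decorator, and the prompt dict now carries the template as system_prompt. A minimal sketch of registering a custom multiple-choice benchmark the same way; the benchmark name, dataset id and the gen_prompt signature are assumptions for illustration:

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq',                       # hypothetical benchmark name
    dataset_id='my-org/my-mcq-dataset',  # hypothetical dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyMCQAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Same return shape as ARCAdapter: prompt text, candidate choices, system prompt.
        return {'data': [input_d['question']], 'multi_choices': self.choices,
                'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)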
evalscope/benchmarks/bbh/__init__.py CHANGED
@@ -1,5 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.bbh.bbh_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.bbh.bbh_adapter import BBHAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -5,18 +5,17 @@ import os
  import random
  import re

- from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models.chat_adapter import ChatGenerationModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/bbh'
-
  # BBH multiple choice subset list
  MULTIPLE_CHOICE = 'multiple_choice'
  MULTIPLE_CHOICE_LIST = [
@@ -59,41 +58,32 @@ TASK_TYPE = 'task_type'
  SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


+ @Benchmark.register(
+     name='bbh',
+     dataset_id='modelscope/bbh',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=SUBSET_LIST,
+     metric_list=[AverageAccuracy],
+     few_shot_num=3,
+     train_split=None,
+     eval_split='test',
+     prompt_template='',
+ )
  class BBHAdapter(DataAdapter):
      """
      Adapter for BBH free-form and multiple-choices sub-tasks.
      """

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
+     def __init__(self, **kwargs):

-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for BBH.')
-             few_shot_num = 3
+         few_shot_num = kwargs.get('few_shot_num', 3)

          if few_shot_num != 3 and few_shot_num != 0:
              logger.error(f'BBH uses 3-shot examples with CoT or 0-shot by system, but got {few_shot_num}. '
                           f'Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -132,7 +122,7 @@
          cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
          full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': self.prompt_template}

      def gen_prompts(self, data_dict: dict) -> dict:
          """
@@ -217,66 +207,6 @@
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns: A dict of metric calculation results. The format is like:
-         {
-             "name":"BBH",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.3389,
-                     "subset":[
-                         {
-                             "name":"BBH",
-                             "score":0.3389
-                         },
-                     ]
-                 }
-             ],
-             "total_num":100
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'bbh',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _extract_mc_answer(cls, ans: str) -> str:
          """
evalscope/benchmarks/benchmark.py CHANGED
@@ -1,65 +1,76 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
+ import copy
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Dict, List, Optional

- import os.path
- from modelscope.msdatasets import MsDataset
- from typing import Optional
+ if TYPE_CHECKING:
+     from evalscope.benchmarks import DataAdapter

- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, HubType
+ from evalscope.models import BaseModelAdapter

+ BENCHMARK_MAPPINGS = {}

- class Benchmark(object):
-     """
-     Wrapper for loading datasets from ModelScope or HuggingFace.
-     """
+
+ @dataclass
+ class BenchmarkMeta:
+     name: str
+     dataset_id: str
+     data_adapter: 'DataAdapter'
+     model_adapter: BaseModelAdapter
+     subset_list: List[str] = field(default_factory=list)
+     metric_list: List[dict] = field(default_factory=list)
+     few_shot_num: int = 0
+     few_shot_random: bool = False
+     train_split: Optional[str] = None
+     eval_split: Optional[str] = None
+     prompt_template: Optional[str] = None
+
+     def _update(self, args: dict):
+         if args.get('local_path'):
+             self.dataset_id = args['local_path']
+             del args['local_path']
+         self.__dict__.update(args)
+
+     def to_dict(self) -> dict:
+         return self.__dict__
+
+     def to_string_dict(self) -> dict:
+         cur_dict = copy.deepcopy(self.__dict__)
+         # cur_dict['data_adapter'] = self.data_adapter.__name__
+         # cur_dict['model_adapter'] = self.model_adapter.__name__
+         # cur_dict['metric_list'] = [metric['name'] for metric in self.metric_list]
+         del cur_dict['data_adapter']
+         del cur_dict['model_adapter']
+         del cur_dict['metric_list']
+         return cur_dict
+
+     def get_data_adapter(self, config: dict = {}) -> 'DataAdapter':
+         if config:
+             self._update(config)
+
+         data_adapter = self.data_adapter(**self.to_dict())
+         return data_adapter
+
+
+ class Benchmark:

      def __init__(self):
-         ...
-
-     @staticmethod
-     def load(dataset_name: str,
-              subset: str = None,
-              split: str = None,
-              token: str = None,
-              hub: str = 'ModelScope',
-              work_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-              **kwargs):
-         """
-         Load a dataset from ModelScope or HuggingFace.
-
-         Args:
-             dataset_name (str): The dataset id or path.
-                 If it is dataset id, should be in the format of `organization/name` for ModelScope and HuggingFace hub.
-                 If it is dataset path, should be the path on local disk.
-             subset (str):
-             split:
-             token: sdk token for ModelScope, optional, default None
-             hub: `ModelScope` or `HuggingFace`
-             work_dir: the work directory for caching, optional
-
-         Returns:
-             A dict.
-         """
-
-         dataset = MsDataset.load(
-             dataset_name=dataset_name,
-             subset_name=subset,
-             split=split,
-             token=token,
-             cache_dir=work_dir,
-             hub=hub,
-             **kwargs)
-
-         dataset.dataset_name = dataset_name.split('/')[-1]
-         dataset.subset_name = subset
-         # dataset.split = split
-         return dataset
-
-
- if __name__ == '__main__':
-
-     ds = Benchmark.load(dataset_name='mmlu', subset='management', split=None)
-
-     n = 1
-     for i in ds:
-         print('>', n, ': ', i)
-         n += 1
+         pass
+
+     @classmethod
+     def get(cls, name: str) -> 'BenchmarkMeta':
+         if name not in BENCHMARK_MAPPINGS:
+             raise Exception(f'Unknown benchmark: {name}. Available tasks: {BENCHMARK_MAPPINGS.keys()}')
+         benchmark = BENCHMARK_MAPPINGS[name]
+         return benchmark
+
+     @classmethod
+     def register(cls, name: str, dataset_id: str, model_adapter: BaseModelAdapter, **kwargs):
+
+         def register_wrapper(data_adapter):
+             if name in BENCHMARK_MAPPINGS:
+                 raise Exception(f'Benchmark {name} already registered')
+             BENCHMARK_MAPPINGS[name] = BenchmarkMeta(
+                 name=name, data_adapter=data_adapter, model_adapter=model_adapter, dataset_id=dataset_id, **kwargs)
+             return data_adapter
+
+         return register_wrapper
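On the consumer side, Benchmark.get looks up a registered name and BenchmarkMeta.get_data_adapter instantiates its DataAdapter, optionally overriding registered fields first. A short sketch built on the code above; 'arc' refers to the ARC registration earlier in this diff:

from evalscope.benchmarks import Benchmark

meta = Benchmark.get('arc')          # BenchmarkMeta registered by the ARC adapter
adapter = meta.get_data_adapter(config={
    'few_shot_num': 0,               # overrides are merged into the meta before instantiation
    'subset_list': ['ARC-Challenge'],
})
print(meta.dataset_id)               # modelscope/ai2_arc
print(type(adapter).__name__)        # ARCAdapter

Note that _update writes the overrides into the shared BenchmarkMeta via self.__dict__.update, so they persist for later lookups of the same name.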
evalscope/benchmarks/ceval/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.ceval.ceval_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter
- from evalscope.benchmarks.ceval.ceval_adapter import CEVALAdapter as DataAdapterClass
- from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa