evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -10,10 +10,9 @@ from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                 ReviewKeys)
-from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
-from evalscope.tools.combine_reports import gen_table
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -30,73 +29,63 @@ class Evaluator(object):
             if the dataset is a local path, e.g. /path/to/your_dataset_name,
             then the task name will be the basename of the path, which is `your_dataset_name`.
         data_adapter: DataAdapter, the data adapter for the dataset.
-        subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-        use_cache: str, path to local cache. Default: None
-        outputs_dir: OutputsStructure, the outputs dir. Default: None
-        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-        overall_task_cfg: dict, the overall task config. Default: None
+        outputs: OutputsStructure, the outputs dir. Default: None
+        task_cfg: TaskConfig, the overall task config. Default: None
         **kwargs: kwargs.
     """
 
     def __init__(self,
                  dataset_name_or_path: str,
                  data_adapter: DataAdapter,
-                 subset_list: Optional[list] = None,
-                 model_adapter: Optional[BaseModelAdapter] = None,
-                 use_cache: Optional[str] = None,
-                 outputs: Optional[OutputsStructure] = None,
-                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                 stage: Optional[str] = EvalStage.ALL,
-                 eval_type: Optional[str] = EvalType.CHECKPOINT,
-                 overall_task_cfg: Optional[TaskConfig] = None,
+                 model_adapter: BaseModelAdapter,
+                 outputs: OutputsStructure = None,
+                 task_cfg: TaskConfig = None,
                  **kwargs):
 
+        self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-        self.model_name = overall_task_cfg.model_id
+        self.model_name = task_cfg.model_id
        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
-        self.datasets_dir = os.path.expanduser(datasets_dir)
-        self.kwargs = kwargs
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
-        self.eval_type = eval_type
-        self.stage = stage
-        self.use_cache = use_cache
-        self.overall_task_cfg = overall_task_cfg
-        if isinstance(self.model_adapter, CustomModelAdapter):
-            self.overall_task_cfg.model_args = self.model_adapter.custom_model.config
-
-        self.model_cfg = self.model_adapter.model_cfg
-
+        self.model_cfg = model_adapter.model_cfg
+        self.eval_type = task_cfg.eval_type
+        self.dataset_hub = task_cfg.dataset_hub
+        self.stage = task_cfg.stage
+        self.use_cache = task_cfg.use_cache
+        self.task_cfg = task_cfg
         # Deal with the output paths
         self.outputs_structure = outputs
 
-        # Load dataset
-        self.dataset = self.data_adapter.load(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=subset_list,
-            work_dir=self.datasets_dir,
-            datasets_hub=datasets_hub,
-            **kwargs)
-
-        # Get prompts from dataset
-        # TODO: support sampler
-        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-        del self.dataset
-
-    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+        self.kwargs = kwargs
 
-        ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
-        ans[AnswerKeys.ANSWER_ID] = answer_id
-        ans[AnswerKeys.SUBSET_NAME] = subset_name
+    def load_dataset(self):
+        dataset = self.data_adapter.load(
+            dataset_name_or_path=self.dataset_name_or_path,
+            subset_list=self.data_adapter.subset_list,
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+            datasets_hub=self.dataset_hub,
+            **self.kwargs)
 
-        return ans
+        # Get prompts from dataset
+        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+        return prompts
+
+    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+        answer_d[AnswerKeys.ANSWER_ID] = answer_id
+        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        return answer_d
 
     def get_answers(self,
                     subset_name: str,
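The `_generate_answer_id` helper added above makes the prediction cache key explicit: the model config, the input prompt, and the inference config are each serialized with sorted keys and the concatenation is hashed. Below is a minimal standalone sketch of that scheme, not evalscope's implementation: `gen_hash` is not shown in this diff (SHA-256 stands in for it), `dict_torch_dtype_to_str` is assumed to only make torch dtypes JSON-serializable, and the example configs are made up.

```python
import hashlib
import json
from collections import OrderedDict


def stable_dumps(d: dict) -> str:
    # Mirrors the json.dumps(OrderedDict(sorted(...)), ensure_ascii=False) calls above,
    # minus the dict_torch_dtype_to_str pass (assumed to only stringify torch dtypes).
    return json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)


def make_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    # SHA-256 is a stand-in for evalscope's gen_hash, which is not part of this diff.
    payload = stable_dumps(model_cfg) + stable_dumps(input_d) + stable_dumps(infer_cfg)
    return 'answer-' + hashlib.sha256(payload.encode('utf-8')).hexdigest()


if __name__ == '__main__':
    # Illustrative values only; the real dicts come from the model adapter and task config.
    model_cfg = {'model_id': 'my-model', 'torch_dtype': 'bfloat16'}
    input_d = {'prompt': 'What is 2 + 2?'}
    infer_cfg = {'max_new_tokens': 512, 'temperature': 0.0}
    # The same inputs always reproduce the same id, which is what makes the
    # dumped predictions reusable when use_cache is set.
    print(make_answer_id(model_cfg, input_d, infer_cfg))
```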
@@ -147,57 +136,24 @@ class Evaluator(object):
             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                 inputs=prompts_list, infer_cfg=infer_cfg)
 
-            assert len(prompts_list) == len(resp_answers_list), \
-                f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
-
-            for in_d, resp_d in zip(prompts_list, resp_answers_list):
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                answers_list.append(resp_d)
-                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                # Get answers
-                answer_d: dict = self._pred_answer(
-                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
 
                 if debug:
                     logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
 
-                answers_list.append(answer_d)
-                dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
@@ -241,6 +197,19 @@ class Evaluator(object):
 
         return review_res
 
+    def _generate_review_id(self, answer_d):
+        # Gen review_id (concat: answer_id + reviewer_spec)
+        answer_id = answer_d[AnswerKeys.ANSWER_ID]
+        reviewer_spec = {
+            'metric': [metric.name for metric in self.data_adapter.metric_list],
+            'reviewer': ['Evaluator'],
+            'revision': ['default']
+        }
+        reviewer_spec_str = json.dumps(
+            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+        return review_id, reviewer_spec
+
     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
         """
         Get reviews from answers.
@@ -264,19 +233,7 @@
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-            # Gen review_id (concat: answer_id + reviewer_spec)
-            answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-            reviewer_spec: dict = {
-                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                'reviewer': ['Evaluator'],
-                'revision': ['default']
-            }
-            reviewer_spec_str = json.dumps(
-                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+            review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
@@ -284,13 +241,12 @@
                 logger.info(review_d)
 
             reviews_list.append(review_d)
-
             # Dump reviews
             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
-    def compute_metrics(self, reviews_list: List[dict]) -> Any:
+    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
         """
         To compute metrics from reviews_list for each subset.
         It is required to rewrite this method to support your own evaluator.
@@ -308,28 +264,37 @@
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
-            review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            if len(review_d[AnswerKeys.CHOICES]) == 0:
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                continue
+            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            else:
+                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+
             review_res_list.append(review_res)
 
-        metric_score: Union[float, dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
-            report_map: report dict. Generated by func self.data_adapter.gen_report().
+            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
             use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
         # Get report map
-        report_map: dict = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
-        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))
+        report_map: Report = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all,
+            report_name=self.custom_task_name,
+            model_name=self.model_name,
+            dataset_name=self.dataset_name)
 
         # Dump report
         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
@@ -338,7 +303,7 @@
 
         # Write report
         with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')
 
         # Make table
@@ -380,7 +345,8 @@
         stage_answers_dict = {}
         stage_reviews_dict = {}
 
-        for subset_name, prompts_list in self.prompts.items():
+        prompts = self.load_dataset()
+        for subset_name, prompts_list in prompts.items():
             limit = kwargs.get('limit', len(prompts_list))
             prompts_list = prompts_list[:limit]
 
@@ -394,7 +360,7 @@
                 subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
+            reviews_score_all[subset_name] = metric_res
             stage_reviews_dict[subset_name] = reviews_list
 
         if self.stage == EvalStage.INFER:
@@ -8,10 +8,10 @@ import sys
 import time
 from abc import ABC, abstractmethod
 from functools import partial
-from typing import Any, List
+from typing import Any, List, Tuple
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models.openai_model import OpenAIModel
+from evalscope.models.model import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
            review_text=review_text)
         return review_result
 
-    def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
+    def _get_review_pair(self,
+                         model_a,
+                         model_b,
+                         question,
+                         category,
+                         ans1,
+                         ans2,
+                         dry_run=False,
+                         **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
         if self.reference_list:
@@ -263,7 +271,7 @@
             result = (result, None)
         return review_text, *result
 
-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any):
+    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=answer)
 
         if self.reference_list:
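The two signature changes above are typing fixes rather than behavioural ones: `-> (str, Any)` is evaluated as an ordinary tuple object, not a type annotation, so static checkers cannot interpret it, while `Tuple[str, Any]` (now imported at the top of the file) is the form they understand. A small self-contained illustration, independent of evalscope:

```python
from typing import Any, Tuple


def old_style() -> (str, Any):  # legal syntax, but the annotation is just the tuple (str, Any)
    return 'review text', None


def new_style() -> Tuple[str, Any]:  # what type checkers expect for "returns a (str, Any) pair"
    return 'review text', None


print(old_style.__annotations__['return'])  # (<class 'str'>, typing.Any)
print(new_style.__annotations__['return'])  # typing.Tuple[str, typing.Any]
```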
@@ -1 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.named_metrics import *
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
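With these re-exports, the metric helpers resolve from the `evalscope.metrics` package itself rather than its submodules. A minimal sketch of the new import surface (only the names shown in this hunk are assumed):

```python
# Previously these helpers had to be imported from evalscope.metrics.metrics and
# evalscope.metrics.rouge_metric directly; after this change the package re-exports them.
from evalscope.metrics import (bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, exact_match, macro_mean,
                               mean, micro_mean, weighted_mean)
```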
@@ -55,7 +55,7 @@ try:
         os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
-        logger.info(f'{punkt_path} already exists, skipping download')
+        logger.debug(f'{punkt_path} already exists, skipping download')
 except Exception as e:
     logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
@@ -1,57 +1,200 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import re
-from collections import defaultdict
-from tqdm import tqdm
 
-from evalscope.constants import MetricsConstant
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print('WARNING: Both None')
+        return True
+    if str1 is None or str2 is None:
+        return False
 
+    try:
+        ss1 = strip_string(str1)
+        ss2 = strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
 
-def get_last_number(s):
-    match = re.search(r'[-+]?\d*\.\d+|\d+', s[::-1])
-    if match:
-        last_digit = match.group()[::-1]
+
+def remove_boxed(s):
+    if '\\boxed ' in s:
+        left = '\\boxed '
+        assert s[:len(left)] == left
+        return s[len(left):]
+
+    left = '\\boxed{'
+
+    assert s[:len(left)] == left
+    assert s[-1] == '}'
+
+    return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind('\\boxed')
+    if '\\boxed ' in string:
+        return '\\boxed ' + string.split('\\boxed ')[-1].split('$')[0]
+    if idx < 0:
+        idx = string.rfind('\\fbox')
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == '{':
+            num_left_braces_open += 1
+        if string[i] == '}':
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
     else:
-        last_digit = -100000
-    return float(last_digit)
-
-
-def compute_math_accuracy_one_sample(predict, reference):
-    if isinstance(predict, list):
-        predict = predict[0]
-    if isinstance(reference, list):
-        reference = reference[0]
-    predict_number = get_last_number(predict)
-    reference_number = get_last_number(reference)
-    if abs(predict_number - reference_number) <= MetricsConstant.EPSILON:
-        return 1
+        retval = string[idx:right_brace_idx + 1]
+
+    return retval
+
+
+def fix_fracs(string):
+    substrs = string.split('\\frac')
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += '\\frac'
+            if substr[0] == '{':
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except AssertionError:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != '{':
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}{' + b + '}' + post_substr
+                    else:
+                        new_str += '{' + a + '}{' + b + '}'
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}' + b + post_substr
+                    else:
+                        new_str += '{' + a + '}' + b
+    string = new_str
+    return string
+
+
+def fix_a_slash_b(string):
+    if len(string.split('/')) != 2:
+        return string
+    a = string.split('/')[0]
+    b = string.split('/')[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == '{}/{}'.format(a, b)
+        new_string = '\\frac{' + str(a) + '}{' + str(b) + '}'
+        return new_string
+    except AssertionError:
+        return string
+
+
+def remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if '\\text{ ' in string:
+        splits = string.split('\\text{ ')
+        assert len(splits) == 2
+        return splits[0]
     else:
-        return 0
-
-
-def compute_math_accuracy(predict_l, reference_l):
-    assert len(predict_l) == len(reference_l)
-    if len(predict_l) == 0:
-        return 0
-    total_cnt = len(predict_l)
-    correct_cnt = 0
-    for predict, reference in zip(predict_l, reference_l):
-        correct_cnt += compute_math_accuracy_one_sample(predict, reference)
-    return {'math accuracy': correct_cnt / total_cnt}
-
-
-def run_math_eval(data_l, md_level=2):
-    print(f"{'#' * md_level} Math Eval(math accuracy)")
-    for data in tqdm(data_l):
-        data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
-    task_data_d = defaultdict(list)
-    for data in data_l:
-        for task in data['task_tags']:
-            task_data_d[task].append(data)
-    correct_cnt = sum([data['math_accuracy'] for data in data_l])
-    print(f'[total], count: {len(data_l)}, math accuracy: '
-          f'{correct_cnt / len(data_l) * 100:0.2f}%')
-    for task in task_data_d.keys():
-        correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
-        print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-              f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')
+        return string
+
+
+def fix_sqrt(string):
+    if '\\sqrt' not in string:
+        return string
+    splits = string.split('\\sqrt')
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != '{':
+            a = split[0]
+            new_substr = '\\sqrt{' + a + '}' + split[1:]
+        else:
+            new_substr = '\\sqrt' + split
+        new_string += new_substr
+    return new_string
+
+
+def strip_string(string):
+    # linebreaks
+    string = string.replace('\n', '')
+
+    # remove inverse spaces
+    string = string.replace('\\!', '')
+
+    # replace \\ with \
+    string = string.replace('\\\\', '\\')
+
+    # replace tfrac and dfrac with frac
+    string = string.replace('tfrac', 'frac')
+    string = string.replace('dfrac', 'frac')
+
+    # remove \left and \right
+    string = string.replace('\\left', '')
+    string = string.replace('\\right', '')
+
+    # Remove circ (degrees)
+    string = string.replace('^{\\circ}', '')
+    string = string.replace('^\\circ', '')
+
+    # remove dollar signs
+    string = string.replace('\\$', '')
+
+    # remove units (on the right)
+    string = remove_right_units(string)
+
+    # remove percentage
+    string = string.replace('\\%', '')
+    string = string.replace('\%', '')  # noqa: W605
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(' .', ' 0.')
+    string = string.replace('{.', '{0.')
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == '.':
+        string = '0' + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split('=')) == 2:
+        if len(string.split('=')[0]) <= 2:
+            string = string.split('=')[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(' ', '')
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} # noqa: E501
+    string = fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == '0.5':
+        string = '\\frac{1}{2}'
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = fix_a_slash_b(string)
+
+    return string
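The rewritten module drops the old last-number heuristic in favour of the MATH answer-normalization helpers adapted from the EleutherAI harness: `last_boxed_only_string`/`remove_boxed` extract the final `\boxed{...}` answer, `strip_string` canonicalizes the LaTeX, and `is_equiv` compares the canonical forms. A minimal usage sketch, assuming the functions above are in scope (this hunk appears to be evalscope/metrics/math_accuracy.py); the example strings are made up:

```python
completion = r'The area of the triangle is $\boxed{\frac{1}{2}}$.'
reference = r'\boxed{0.5}'

# Pull out the last \boxed{...} span and strip the wrapper.
pred = remove_boxed(last_boxed_only_string(completion))  # '\frac{1}{2}'
gold = remove_boxed(last_boxed_only_string(reference))   # '0.5'

# strip_string rewrites '0.5' as '\frac{1}{2}', so both sides normalize to the
# same canonical form and the comparison succeeds.
print(is_equiv(pred, gold))  # True
```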