evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
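Besides the evaluator rework shown below, this release adds new benchmark adapters (aime24, math_500, gpqa, general_mcq, data_collection) and replaces math_accuracy.py with a larger math_parser.py. The following is a rough, hedged sketch of driving those additions, assuming the TaskConfig fields that appear in the evaluator diff (eval_batch_size, limit, generation_config) and a run_task entry point in evalscope/run.py; the model and datasets field names are assumptions, not confirmed by this diff.

from evalscope.config import TaskConfig  # import path taken from the diff below
from evalscope.run import run_task       # assumed entry point; run.py is listed among the changed files

# Hedged sketch: field names not visible in the diff (model, datasets) are assumptions.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',        # assumed field
    datasets=['math_500', 'gpqa'],           # adapters newly added in 0.11.0
    eval_batch_size=8,                       # sizes the thread pool / local batches in get_answers()
    limit=50,                                # per-subset cap, now applied in load_dataset()
    generation_config={'temperature': 0.0},  # forwarded to get_answers() as infer_cfg
)
run_task(task_cfg)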
evalscope/evaluator/evaluator.py
CHANGED
@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@ class Evaluator(object):
                 **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -63,15 +63,20 @@ class Evaluator(object):
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
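The rewritten load_dataset above now applies task_cfg.limit per subset and stamps each surviving prompt with an 'index' before any inference happens. A small self-contained sketch of that pattern (the demo data is made up):

from collections import defaultdict
from typing import Optional

def limit_and_index(prompts: dict, limit: Optional[int] = None) -> dict:
    # Cap each subset and record a stable, per-subset position on every prompt.
    limited = defaultdict(list)
    for subset_name, prompts_list in prompts.items():
        n = limit or len(prompts_list)
        for index, prompt in enumerate(prompts_list[:n]):
            prompt['index'] = index
            limited[subset_name].append(prompt)
    return limited

demo = {'main': [{'data': 'q1'}, {'data': 'q2'}, {'data': 'q3'}]}
print(limit_and_index(demo, limit=2))
# {'main': [{'data': 'q1', 'index': 0}, {'data': 'q2', 'index': 1}]}

That per-prompt index is what the new filter_answer cache logic further down keys on.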
@@ -87,12 +92,38 @@ class Evaluator(object):
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def
-
-
-
-
-
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
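filter_answer above resumes an interrupted run: it reads the existing predictions JSONL, collects the 'index' stored on each cached answer's origin prompt (falling back to file position when no indices exist), and drops those prompts from the work list. A standalone sketch of the same idea, using plain dicts instead of evalscope's AnswerKeys constants:

import json
import os

def resume_from_jsonl(prompts, pred_file_path):
    # Returns (cached_answers, remaining_prompts); with no cache file, everything is remaining.
    if not os.path.exists(pred_file_path):
        return [], prompts
    with open(pred_file_path, encoding='utf-8') as f:
        cached = [json.loads(line) for line in f if line.strip()]
    answered = {a.get('origin_prompt', {}).get('index') for a in cached}
    if answered == {None}:  # no indices recorded: assume the first N prompts are done
        answered = set(range(len(cached)))
    answered.discard(None)
    remaining = [p for i, p in enumerate(prompts) if i not in answered]
    return cached, remaining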
@@ -110,7 +141,6 @@ class Evaluator(object):
                 max_length: int, the max length of the sequence to be generated.
                 max_new_tokens: int, the max number of new tokens to be generated.
                 repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.
@@ -119,41 +149,35 @@ class Evaluator(object):
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-
-
-
-
-
-
-
-
-
-
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
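The new get_answers above picks one of two execution paths: when eval_type is SERVICE it submits single-prompt calls to a ThreadPoolExecutor of size eval_batch_size and drains them with as_completed, otherwise it slices the prompt list into batches of eval_batch_size and calls the model adapter batch by batch. A stripped-down sketch of those two paths with a stand-in predict function:

from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def predict(batch):  # stand-in for model_adapter.predict
    return [{'answer': f'echo: {p}'} for p in batch]

def run_service(prompts, batch_size):
    # API path: one request per prompt, bounded concurrency, results arrive in completion order.
    results = []
    with tqdm(total=len(prompts)) as pbar, ThreadPoolExecutor(max_workers=batch_size) as pool:
        futures = [pool.submit(predict, [p]) for p in prompts]
        for future in as_completed(futures):
            results.extend(future.result())
            pbar.update(1)
    return results

def run_local(prompts, batch_size):
    # Local path: sequential batched inference, results stay in prompt order.
    results = []
    with tqdm(total=len(prompts)) as pbar:
        for i in range(0, len(prompts), batch_size):
            batch = prompts[i:i + batch_size]
            results.extend(predict(batch))
            pbar.update(len(batch))
    return results

Because as_completed yields results out of submission order, the predictions JSONL is unordered in service mode, which is why cache resumption relies on the per-prompt 'index' rather than file position.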
@@ -200,17 +224,13 @@ class Evaluator(object):
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict],
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.
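The simplified reviewer_spec above still feeds a deterministic ID: the spec is serialized with sorted keys and hashed together with the answer ID, so re-running a review with the same metrics reproduces the same review_id. A standalone sketch of that idea, using hashlib as a stand-in for evalscope's gen_hash helper (whose actual algorithm is not shown in this diff):

import hashlib
import json
from collections import OrderedDict

def stable_review_id(answer_id: str, reviewer_spec: dict) -> str:
    # Sorting keys makes the serialization, and therefore the hash, order-independent.
    spec_str = json.dumps(OrderedDict(sorted(reviewer_spec.items())), ensure_ascii=False)
    return 'review-' + hashlib.md5((answer_id + spec_str).encode('utf-8')).hexdigest()

spec = {'metric': ['AverageAccuracy'], 'reviewer': ['Evaluator'], 'revision': ['default']}
print(stable_review_id('answer-abc123', spec))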
@@ -218,7 +238,6 @@ class Evaluator(object):
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.
@@ -237,8 +256,7 @@ class Evaluator(object):
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-
-            logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews
@@ -315,7 +333,7 @@ class Evaluator(object):
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self,
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -329,7 +347,6 @@ class Evaluator(object):
 
         Args:
             infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.
@@ -347,17 +364,14 @@ class Evaluator(object):
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res