evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
- evalscope/benchmarks/race/race_adapter.py +2 -1
- evalscope/config.py +35 -1
- evalscope/constants.py +24 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- evalscope/perf/arguments.py +2 -1
- evalscope/perf/benchmark.py +2 -2
- evalscope/perf/main.py +2 -5
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +3 -1
- evalscope/run.py +45 -82
- evalscope/run_arena.py +2 -1
- evalscope/summarizer.py +14 -26
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/tools/combine_reports.py +2 -4
- evalscope/tools/rewrite_eval_results.py +1 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +8 -0
- evalscope/utils/utils.py +0 -175
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
- tests/cli/test_run.py +11 -12
- tests/perf/test_perf.py +2 -1
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/backend/base.py
CHANGED
evalscope/backend/rag_eval/utils/clip.py
CHANGED
@@ -4,7 +4,7 @@ import torch.nn.functional as F
 from langchain_core.embeddings import Embeddings
 from PIL import Image
 from transformers import AutoModel, AutoProcessor
-from typing import List
+from typing import List, Union
 
 from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
 from evalscope.constants import HubType
@@ -86,7 +86,7 @@ class CLIPModel(Embeddings):
         self.transform = self.processor.image_processor
         self.tokenizer = self.processor.tokenizer
 
-    def encode_text(self, batch_texts: List[str]
+    def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
         if isinstance(batch_texts[0], list):
             batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
         # Ensure that the input texts are within the token limit
evalscope/backend/rag_eval/utils/embedding.py
CHANGED
@@ -80,7 +80,7 @@ class BaseModel(Embeddings):
         """Embed query text. Compact mteb."""
         raise NotImplementedError
 
-    def encode_corpus(self, corpus: List[str]
+    def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
         """Embed search docs . Compact mteb."""
         raise NotImplementedError
 
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Optional
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-from evalscope.utils import jsonl_to_list
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
CHANGED
@@ -6,7 +6,8 @@ import re
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
CHANGED
@@ -5,7 +5,8 @@ import re
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
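Across these adapters the change is the same: `jsonl_to_list` now lives in the new `evalscope.utils.io_utils` module instead of being re-exported from `evalscope.utils`. A minimal usage sketch of the relocated helper (the file path is a placeholder):

```python
from evalscope.utils.io_utils import jsonl_to_list

# Each line of a .jsonl file is parsed into one dict.
records = jsonl_to_list('outputs/predictions/example_predictions.jsonl')  # placeholder path
print(len(records), list(records[0].keys()))
```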
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -1,20 +1,206 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from tqdm import tqdm
+from typing import List
 
-
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import weighted_mean
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 DATASET_ID = 'modelscope/humaneval'
 SUBSET_LIST = ['openai_humaneval']
 
-# Note: ONLY FOR CLASS IMPORT, No implementation here.
-
 # Example:
-# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
-class HumanevalAdapter:
+class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """
 
-    def __init__(self
-
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 prompt_template: str = 'Complete the following python code:\n',
+                 **kwargs):
+        try:
+            from human_eval.data import stream_jsonl, write_jsonl
+            from human_eval.evaluation import check_correctness
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        self.k = [1]
+        self.num_workers = 4
+        self.timeout = 4.0
+        self.outputs = kwargs.get('outputs', None)
+
+        self.read_problems_func = stream_jsonl
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = check_correctness
+
+        super().__init__(
+            subset_list=subset_list,
+            metric_list=metric_list,
+            few_shot_num=few_shot_num,
+            train_split=train_split,
+            eval_split=eval_split,
+            prompt_template=prompt_template,
+            **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
+            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate prompt for the model.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the Humaneval:
+            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
+        """
+        full_prompt = input_d['prompt']
+        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+
+        return {'data': [full_prompt]}
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = ''
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(
+            sample_file=ans_out_file,
+            k=self.k,
+            n_workers=self.num_workers,
+            timeout=self.timeout,
+            problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.reports_dir
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info('** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score)
+        } for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'HumanEval',
+            metric='pass@1',
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        return self._postprocess(result)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def match(self, gold: str, pred: str) -> float:
+        res = self.eval_func(gold, pred, self.timeout)
+        return float(res['passed'])
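The `_postprocess` classmethod above is what turns a chat-style completion into a HumanEval `completion` string: it pulls out the first fenced code block, drops a duplicated `def` header, and keeps only the indented function body. A small illustration of that behaviour (the reply text is made up, and the reconstruction above assumes the indentation literals are four spaces, which the diff viewer collapsed):

```python
from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter

reply = (
    "Here is the solution:\n"
    "```python\n"
    "def add(a, b):\n"
    "    return a + b\n"
    "```"
)

# _postprocess is a classmethod, so no adapter instance (and no human_eval install) is needed.
completion = HumanevalAdapter._postprocess(reply)
print(repr(completion))  # '    return a + b\n' -- only the indented body is kept
```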
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -5,7 +5,8 @@ import os
 
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import
+from evalscope.utils import normalize_score
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
evalscope/config.py
CHANGED
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -31,6 +32,7 @@ DEFAULT_GENERATION_CONFIG = {
 class TaskConfig:
     # Model-related arguments
     model: Union[str, CustomModel, None] = None
+    model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
 
     # Template-related arguments
@@ -64,6 +66,13 @@ class TaskConfig:
     dry_run: bool = False
     seed: int = 42
 
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
+
     def to_dict(self):
         # Note: to avoid serialization error for some model instance
         return self.__dict__
@@ -119,6 +128,7 @@ class TaskConfig:
                 continue
 
             task.model = custom_model
+            task.model_id = type(custom_model).__name__
            res_list.append(task)
 
         return res_list
@@ -168,6 +178,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
 registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
 
 
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
+
+
 class TempModel(CustomModel):
 
     def __init__(self, config: dict):
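The new `parse_task_config` helper normalizes whatever the caller passes (a `TaskConfig`, a plain dict, an argparse `Namespace`, or a YAML/JSON path) into a `TaskConfig`, and `__post_init__` now fills in `model_id` when it is not given. A rough sketch of how this can be used (the config values and paths are placeholders):

```python
from evalscope.config import TaskConfig, parse_task_config

# From a dict (only the fields you care about; values are placeholders).
cfg = parse_task_config({'model': 'models/qwen2-7b-instruct'})
assert isinstance(cfg, TaskConfig)
print(cfg.model_id)  # 'qwen2-7b-instruct' -- derived from the model path in __post_init__

# From a YAML file on disk (hypothetical path).
cfg = parse_task_config('configs/my_task.yaml')
```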
evalscope/constants.py
CHANGED
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
 DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
 DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
 
 
 class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
 
 
-class OutputsStructure:
-    LOGS_DIR = 'logs'
-    PREDICTIONS_DIR = 'predictions'
-    REVIEWS_DIR = 'reviews'
-    REPORTS_DIR = 'reports'
-    CONFIGS_DIR = 'configs'
-
-    def __init__(self, outputs_dir: str, is_make: bool = True):
-        self.outputs_dir = outputs_dir
-        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-        if is_make:
-            self.create_directories()
-
-    def create_directories(self):
-        os.makedirs(self.outputs_dir, exist_ok=True)
-        os.makedirs(self.logs_dir, exist_ok=True)
-        os.makedirs(self.predictions_dir, exist_ok=True)
-        os.makedirs(self.reviews_dir, exist_ok=True)
-        os.makedirs(self.reports_dir, exist_ok=True)
-        os.makedirs(self.configs_dir, exist_ok=True)
-
-
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
@@ -166,17 +139,30 @@ class EvalType:
 
 
 class EvalBackend:
-    # Use native evaluation pipeline of EvalScope
-    NATIVE = 'Native'
 
-
-
+    class _Backend:
+        # compatible with old version, set 'value'
+
+        def __init__(self, value):
+            self._value = value
+
+        @property
+        def value(self):
+            return self._value
+
+        def __str__(self):
+            return self._value
 
-
-
+        def __repr__(self):
+            return f"'{self._value}'"
 
-
-
+        def __eq__(self, other):
+            if isinstance(other, str):
+                return self._value == other
+            return NotImplemented
 
-
-
+    NATIVE = _Backend('Native')
+    OPEN_COMPASS = _Backend('OpenCompass')
+    VLM_EVAL_KIT = _Backend('VLMEvalKit')
+    RAG_EVAL = _Backend('RAGEval')
+    THIRD_PARTY = _Backend('ThirdParty')
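The `_Backend` wrapper exists purely for backward compatibility: old code that accessed `EvalBackend.NATIVE.value` keeps working, while new code can compare the members directly against plain strings. A quick sketch of both styles:

```python
from evalscope.constants import EvalBackend

assert EvalBackend.NATIVE == 'Native'                    # __eq__ against a plain string
assert EvalBackend.OPEN_COMPASS.value == 'OpenCompass'   # old .value access still works
print(str(EvalBackend.VLM_EVAL_KIT))                     # VLMEvalKit
print(repr(EvalBackend.RAG_EVAL))                        # 'RAGEval'
```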
evalscope/evaluator/__init__.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -11,10 +11,11 @@ from typing import Any, Dict, List, Optional, Union
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-
+                                 ReviewKeys)
 from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
 from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import dict_torch_dtype_to_str,
+from evalscope.utils import dict_torch_dtype_to_str, gen_hash
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -56,8 +57,8 @@ class Evaluator(object):
                  **kwargs):
 
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
-        self.model_name =
+        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
+        self.model_name = overall_task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.datasets_dir = os.path.expanduser(datasets_dir)
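Two small but user-visible changes here: the evaluator's `model_name` now comes from `overall_task_cfg.model_id`, and `dataset_name` strips a file extension, so a local JSONL path produces a clean task name. The expression used above, checked in isolation:

```python
import os

path = '/data/eval/general_qa.jsonl'  # placeholder local dataset file
name = os.path.basename(path.rstrip(os.sep)).split('.')[0]
print(name)  # general_qa -- the '.jsonl' suffix no longer ends up in the task name
```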
evalscope/evaluator/rating_eval.py
CHANGED
@@ -5,8 +5,8 @@ import pyarrow as pa
 from typing import List, Union
 
 from evalscope.constants import MetricMembers
-from evalscope.utils import jsonl_to_list
 from evalscope.utils.arena_utils import compute_elo
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED
@@ -12,8 +12,9 @@ from typing import Any, List
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
 from evalscope.models.openai_model import OpenAIModel
-from evalscope.utils import completion_parsers,
+from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
+from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
evalscope/perf/arguments.py
CHANGED
@@ -16,7 +16,7 @@ class Arguments:
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
-    port:
+    port: int = 8877  # Port number for the local API server
 
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -138,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):
 
     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
evalscope/perf/benchmark.py
CHANGED
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()
 
     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
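Together with the new `port` field in `Arguments` and the `--port` CLI flag, the local-server URL is no longer hard-coded to 8877. A hedged sketch of what this enables (assuming the remaining `Arguments` fields keep their defaults; model and dataset names are placeholders):

```python
from evalscope.perf.arguments import Arguments

# Hypothetical local run on a non-default port; only the fields relevant here are set.
args = Arguments(model='models/qwen2-0.5b-instruct', dataset='openqa', port=8006)

# start_server() now formats the URL from args.port, e.g.
#   http://127.0.0.1:8006/v1/chat/completions   (regular datasets)
#   http://127.0.0.1:8006/v1/completions        (speed_benchmark datasets)
```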
evalscope/perf/main.py
CHANGED
@@ -8,7 +8,7 @@ from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
 from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
 
 logger = get_logger()
@@ -23,10 +23,7 @@ def run_perf_benchmark(args):
 
     # Setup logger and output
     args.outputs_dir = get_output_path(args)
-
-
-    if args.debug:
-        get_logger(log_level=logging.DEBUG, force=True)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
 
     logger.info('Starting benchmark...')
     logger.info(args)
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,7 +1,7 @@
 import json
 import os
 from transformers import AutoTokenizer
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -29,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict]
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
evalscope/perf/plugin/registry.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union
 
 
 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)
 
 
-def register_dataset(name: str
+def register_dataset(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator
 
 
-def register_api(name: str
+def register_api(name: Union[str, List[str]]):
 
     def class_decorator(cls: Type):
         if isinstance(name, str):
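With the widened `Union[str, List[str]]` signature, a plugin can be registered under several aliases in one decorator call. A hedged sketch (the plugin classes and names below are hypothetical, not part of the package):

```python
from evalscope.perf.plugin.registry import register_api, register_dataset

@register_dataset('my_jsonl')            # single name, as before
class MyJsonlDatasetPlugin:
    ...

@register_api(['my_api', 'my_api_v2'])   # a list now registers every alias
class MyApiPlugin:
    ...
```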
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:
 
     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
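The net effect on the perf summary: the elapsed-time key now carries an explicit `(s)` unit, the throughput entry is renamed to `Throughput(average tokens/s)` and moves up next to the request counters, and the package metrics move to the end. Roughly, the rendered mapping now looks like this (all values are placeholders):

```python
message = {
    'Time taken for tests (s)': 10.58,
    'Number of concurrency': 1,
    'Total requests': 15,
    'Succeed requests': 15,
    'Failed requests': 0,
    'Throughput(average tokens/s)': 324.059,
    'Average QPS': 1.417,
    'Average latency (s)': 0.697,
    'Average time to first token (s)': 0.697,
    'Average time per output token (s)': 0.00309,
    'Average input tokens per request': 50.133,
    'Average output tokens per request': 228.667,
    'Average package latency (s)': 0.003,
    'Average package per request': 228.667,
}
```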