evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/{utils/utils.py → metrics/completion_parsers.py}
RENAMED
@@ -1,77 +1,85 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-#
+# flake8: noqa
 
-import
-import hashlib
-import importlib
-import importlib.util
-import numpy as np
-import os
-import random
+import ast
 import re
-import torch
-from inspect import signature
-from typing import Any, Dict, List, Tuple, Union
 
+# from . import utils as ann_utils
+from evalscope.constants import ArenaWinner
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-
+one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
+one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
 
-# Example: export TEST_LEVEL_LIST=0,1
-TEST_LEVEL_LIST_STR = 'TEST_LEVEL_LIST'
 
+# modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
+# does not work with batched completions
+def lmsys_parser(completion, output_format):
+    if output_format == '[[rating]]':
+        match = re.search(one_score_pattern, completion)
+        if not match:
+            match = re.search(one_score_pattern_backup, completion)
 
-
-
-
-
-
-
-
-
-
-
-
+        if match:
+            rating = ast.literal_eval(match.groups()[0])
+        else:
+            logger.error(f'Content: {completion}\n'
+                         'You must manually fix the score.')
+            rating = -1
+
+        return rating
+    if output_format == '[[rating_a,rating_b]]':
+        try:
+            score_pair = completion.split('\n')[0]
+            score_pair = score_pair.replace(',', ' ')
+            sp = score_pair.split(' ')
+            if len(sp) == 2:
+                score_1 = float(sp[0])
+                score_2 = float(sp[1])
+                if score_1 > score_2:
+                    winner = ArenaWinner.MODEL_A
+                elif score_1 < score_2:
+                    winner = ArenaWinner.MODEL_B
+                else:
+                    if score_1 == score_1 == -1:
+                        winner = ArenaWinner.UNKNOWN
+                    winner = ArenaWinner.TIE
+                return winner, [score_1, score_2]
+            else:
+                raise Exception('Invalid score pair.')
+        except Exception as e:
+            logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
+            return ArenaWinner.UNKNOWN, [-1, -1]
+    elif output_format == '[[A]]':
+        if '[[A]]' in completion:
+            winner = ArenaWinner.MODEL_A
+        elif '[[B]]' in completion:
+            winner = ArenaWinner.MODEL_B
+        elif '[[C]]' in completion:
+            winner = ArenaWinner.TIE
+        else:
+            logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
+            winner = ArenaWinner.UNKNOWN
+        return winner
+
+
+def ranking_parser(completion, **kwargs):
     try:
-
-
-
-
-
-        if spliter:
-            for attr in cls_name.split('.'):
-                obj_cls = getattr(obj_cls, attr)
-
-    return functools.partial(obj_cls, *args, **kwargs)
-
-
-def random_seeded_choice(seed: Union[int, str, float], choices, **kwargs):
-    """Random choice with a (potentially string) seed."""
-    return random.Random(seed).choices(choices, k=1, **kwargs)[0]
-
-
-def gen_hash(name: str, bits: int = 32):
-    return hashlib.md5(name.encode(encoding='UTF-8')).hexdigest()[:bits]
-
+        if isinstance(completion, str):
+            ordered_completions = ast.literal_eval(completion)
+        else:
+            ordered_completions = completion
 
-
-
-    Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None,
-    converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
-    string, which can then be stored in the json format.
+        rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
+        assert rank in [1, 2]
 
-
-
-
-
-
-    for value in d.values():
-        if isinstance(value, dict):
-            dict_torch_dtype_to_str(value)
-
-    return d
+        return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
+    except Exception as e:
+        logger.error(f'{e}\nContent: {completion}\n'
+                     'You must manually fix the score pair.')
+        return ArenaWinner.UNKNOWN
 
 
 class ResponseParser:
@@ -194,7 +202,6 @@ class ResponseParser:
             return last_capital
         return 'No valid option found'
 
-
     @staticmethod
     def parse_bracketed_answer(text: str, options: list[str]) -> str:
         options = ResponseParser.process_options(options)
@@ -211,122 +218,3 @@ class ResponseParser:
         # Join options into a regex pattern separated by '|', to match any of the options
         options_pattern = '|'.join(escaped_options)
         return options_pattern
-
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
-    """
-    Normalize score.
-
-    Args:
-        score: input score, could be float or dict. e.g. 0.12345678 or {'acc': 0.12345678, 'f1': 0.12345678}
-        keep_num: number of digits to keep.
-
-    Returns:
-        Union[float, dict]: normalized score. e.g. 0.1234 or {'acc': 0.1234, 'f1': 0.1234}
-    """
-    if isinstance(score, float):
-        score = round(score, keep_num)
-    elif isinstance(score, dict):
-        score = {k: round(v, keep_num) for k, v in score.items()}
-    else:
-        logger.warning(f'Unknown score type: {type(score)}')
-
-    return score
-
-
-def is_module_installed(module_name):
-    try:
-        importlib.import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
-
-def get_module_path(module_name):
-    spec = importlib.util.find_spec(module_name)
-    if spec and spec.origin:
-        return os.path.abspath(spec.origin)
-    else:
-        raise ValueError(f'Cannot find module: {module_name}')
-
-
-def get_valid_list(input_list, candidate_list):
-    """
-    Get the valid and invalid list from input_list based on candidate_list.
-    Args:
-        input_list: The input list.
-        candidate_list: The candidate list.
-
-    Returns:
-        valid_list: The valid list.
-        invalid_list: The invalid list.
-    """
-    return [i for i in input_list if i in candidate_list], \
-        [i for i in input_list if i not in candidate_list]
-
-
-def get_latest_folder_path(work_dir):
-    from datetime import datetime
-
-    # Get all subdirectories in the work_dir
-    folders = [f for f in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, f))]
-
-    # Get the timestamp(YYYYMMDD_HHMMSS)
-    timestamp_pattern = re.compile(r'^\d{8}_\d{6}$')
-
-    # Filter out the folders
-    timestamped_folders = [f for f in folders if timestamp_pattern.match(f)]
-
-    if not timestamped_folders:
-        print(f'>> No timestamped folders found in {work_dir}!')
-        return None
-
-    # timestamp parser
-    def parse_timestamp(folder_name):
-        return datetime.strptime(folder_name, '%Y%m%d_%H%M%S')
-
-    # Find the latest folder
-    latest_folder = max(timestamped_folders, key=parse_timestamp)
-
-    return os.path.join(work_dir, latest_folder)
-
-
-def csv_to_list(file_path: str) -> List[dict]:
-    import csv
-
-    with open(file_path, mode='r', newline='', encoding='utf-8') as csv_file:
-        csv_reader = csv.DictReader(csv_file)
-        result = [row for row in csv_reader]
-
-    return result
-
-
-def seed_everything(seed: int):
-    """Set all random seeds to a fixed value for reproducibility.
-
-    Args:
-        seed (int): The seed value.
-    """
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-        torch.backends.cudnn.deterministic = True
-        torch.backends.cudnn.benchmark = False
-
-def get_supported_params(func):
-    """Get the supported parameters of a function."""
-    sig = signature(func)
-    return list(sig.parameters.keys())
-
-def parse_int_or_float(num):
-    number = float(num)
-    if number.is_integer():
-        return int(number)
-    return number
-
-if __name__ == '__main__':
-    options = ['A', 'B', 'C', 'D']
-    answers = ['Context .... ANSWER: A', 'answer: A']
-    for answer in answers:
-        print(ResponseParser.parse_first_option(answer, options))
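For orientation, a quick sketch of how the judge-output parsers that moved into this module behave. The import path follows the rename above, and the expected return values are inferred from the parser code shown in this diff:

```python
# Illustrative usage only; return values inferred from the code in this diff.
from evalscope.constants import ArenaWinner
from evalscope.metrics.completion_parsers import lmsys_parser, ranking_parser

# '[[rating]]' format: the [[x.y]] regex extracts a single numeric score.
rating = lmsys_parser('The answer is mostly correct. Rating: [[8.5]]', '[[rating]]')
# rating == 8.5

# '[[rating_a,rating_b]]' format: two scores on the first line decide the winner.
winner, scores = lmsys_parser('7,9\nAssistant B gives more detail.', '[[rating_a,rating_b]]')
# winner == ArenaWinner.MODEL_B, scores == [7.0, 9.0]

# '[[A]]' format: a verdict letter maps directly to a winner.
verdict = lmsys_parser('Comparing both answers, my verdict is [[A]].', '[[A]]')
# verdict == ArenaWinner.MODEL_A

# ranking_parser expects per-model ranks; model_a ranked 1 means model A wins.
ranked = ranking_parser([{'model': 'model_a', 'rank': 1}, {'model': 'model_b', 'rank': 2}])
# ranked == ArenaWinner.MODEL_A
```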
evalscope/metrics/llm_judge.py
CHANGED
@@ -8,11 +8,14 @@ logger = get_logger()
 
 DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
 
-Question
+[Question]
+{question}
 
-Reference Answer
+[Reference Answer]
+{gold}
 
-
+[Predicted Answer]
+{pred}
 
 Evaluate the model's answer based on correctness compared to the reference answer.
 Grade the predicted answer of this new question as one of:
@@ -22,6 +25,18 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501
 
+
+DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+Begin your evaluation by providing a short explanation. Be as objective as possible.
+After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
+
+[Question]
+{question}
+
+[Response]
+{pred}
+"""  # noqa: E501
+
 DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
 DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
 
@@ -31,14 +46,18 @@ class LLMJudge:
     A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
     """
 
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+            self,
+            api_key: Optional[str] = None,
+            api_url: Optional[str] = None,
+            model_id: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            prompt_template: Optional[str] = None,
+            generation_config: Optional[Dict[str, Any]] = None,
+            score_pattern: Optional[str] = None,
+            score_mapping: Optional[Dict[str, float]] = None,
+            score_type: str = 'pattern',  # 'pattern', 'numeric'
+            **kwargs):
         """
         Initialize LLMJudge metric.
 
@@ -49,14 +68,34 @@ class LLMJudge:
             system_prompt (str, optional): System prompt for the judge
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
+            score_pattern (str, optional): Regex pattern to extract score from LLM response
+            score_mapping (dict, optional): Mapping from extracted score to float value
+            score_type (str, optional): Type of score extraction strategy ('pattern', 'numeric') defaults to 'pattern'.
+                - 'pattern': Use score_pattern and score_mapping to extract categorical scores
+                - 'numeric': Treat the extracted value as a direct numerical score
         """
         self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config or {}
 
+        # Default score mapping for A/B pattern
+        self.score_type = score_type
+        if self.score_type == 'numeric':
+            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
+                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
+        elif self.score_type == 'pattern':
+            self.score_pattern = score_pattern or r'(A|B)'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+        else:
+            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
+        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
+
+        self._init_server_adapter()
+
+    def _init_server_adapter(self):
         from evalscope.models import ServerModelAdapter
 
         # Initialize ServerModelAdapter
@@ -95,17 +134,63 @@ class LLMJudge:
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
             question = 'Not provided'
-
+
+        # check variables in prompt_template
+        prompt = self.prompt_template
+        if '{question}' in self.prompt_template:
+            prompt = prompt.replace('{question}', question)
+        if '{pred}' in self.prompt_template:
+            prompt = prompt.replace('{pred}', pred)
+        if '{gold}' in self.prompt_template:
+            prompt = prompt.replace('{gold}', gold)
+        return prompt
 
     def get_score(self, response: str) -> float:
+        """
+        Extract score from LLM response using the configured pattern and mapping.
+
+        Args:
+            response (str): The response from the LLM
+
+        Returns:
+            float: The numeric score extracted from the response
+        """
         if response is None:
-            return 0
-
+            return 0.0
+
+        # choose extraction method based on score_type
+        if self.score_type == 'numeric':
+            return self._extract_numeric_score(response)
+        elif self.score_type == 'pattern':
+            return self._extract_pattern_score(response)
+
+    def _extract_numeric_score(self, response: str) -> Optional[float]:
+        """extract numeric score from the response using the score_pattern"""
+        match = re.search(self.score_pattern, response)
+
+        if match:
+            # try to convert each captured group to float
+            for group in match.groups():
+                if group is not None:
+                    try:
+                        return float(group)
+                    except (ValueError, TypeError):
+                        continue
+
+            # if not found in groups, try the whole match
+            try:
+                return float(match.group(0))
+            except (ValueError, TypeError):
+                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
+
+        return None
+
+    def _extract_pattern_score(self, response: str) -> float:
+        """use the score_pattern to extract categorical scores"""
+        match = re.search(self.score_pattern, response)
         if match:
             answer = match.group(0)
-
-                return 1
-            elif answer == 'B':
-                return 0
+            return self.score_mapping.get(answer, 0.0)
         else:
-
+            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
+            return 0.0
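The new score_type switch controls how the judge's raw reply becomes a number: 'pattern' keeps the A/B grading mapped through score_mapping, while 'numeric' expects a "Rating: [[0.5]]" style reply and reads the value directly. A standalone sketch of the two default extraction paths, mirroring the regexes added above (this is not the class itself, so no judge endpoint is needed):

```python
import re
from typing import Optional

# Default patterns copied from the LLMJudge.__init__ changes in this diff.
NUMERIC_PATTERN = r'\[\[(\d+(?:\.\d+)?)\]\]'   # score_type='numeric'
AB_PATTERN = r'(A|B)'                          # score_type='pattern'
AB_MAPPING = {'A': 1.0, 'B': 0.0}              # default score_mapping

def extract_numeric_score(response: str) -> Optional[float]:
    # Mirrors _extract_numeric_score: take the bracketed number as the score.
    match = re.search(NUMERIC_PATTERN, response)
    return float(match.group(1)) if match else None

def extract_pattern_score(response: str) -> float:
    # Mirrors _extract_pattern_score: map the first A/B letter through the mapping.
    match = re.search(AB_PATTERN, response)
    return AB_MAPPING.get(match.group(0), 0.0) if match else 0.0

print(extract_numeric_score('The response is partially helpful. Rating: [[0.5]]'))  # 0.5
print(extract_pattern_score('A'))  # 1.0
```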
evalscope/metrics/metrics.py
CHANGED
@@ -223,7 +223,7 @@ def chrf(items):
     Source: https://github.com/m-popovic/chrF
     Paper: https://www.aclweb.org/anthology/W15-3049.pdf
 
-    Higher is better
+    Higher is better
     """
     refs = list(zip(*items))[0]
     preds = list(zip(*items))[1]
evalscope/models/adapters/base_adapter.py
CHANGED
@@ -54,8 +54,6 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
 
         if 'server' not in model_adapter_cls_str:
             model_adapter_cls_str = 'server'
-            logger.info(
-                f'Using {model_adapter_cls.__name__} for api model evaluation for benchmark {benchmark.name}.')
 
     # init server model adapter
     model_adapter_cls = get_model_adapter(model_adapter_cls_str)
evalscope/models/adapters/server_adapter.py
CHANGED
@@ -5,8 +5,8 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
@@ -29,7 +29,7 @@ class ServerModelAdapter(BaseModelAdapter):
         self.api_key = api_key
 
         self.client = openai.OpenAI(
-            api_key=api_key,
+            api_key=self.api_key,
             base_url=self.api_url,
         )
         self.supported_params = get_supported_params(self.client.chat.completions.create)
evalscope/models/custom/dummy_model.py
CHANGED
@@ -50,14 +50,14 @@ class DummyCustomModel(CustomModel):
         # Must return a list of dicts with the same format as the OpenAI API.
         responses = []
         for input_item in original_inputs:
-            message = self.make_request_messages(input_item)
-            response = f'Dummy response for prompt: {message}'
+            # message = self.make_request_messages(input_item)
+            # response = f'Dummy response for prompt: {message}'
 
             res_d = {
                 'choices': [{
                     'index': 0,
                     'message': {
-                        'content':
+                        'content': '*PlaceHolder*',
                         'role': 'assistant'
                     }
                 }],
evalscope/perf/arguments.py
CHANGED
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_WORK_DIR
+from evalscope.utils import BaseArgument
 
 
 @dataclass
-class Arguments:
+class Arguments(BaseArgument):
     # Model and API
     model: str  # Model name or path
     model_id: Optional[str] = None  # Model identifier
@@ -69,15 +70,6 @@ class Arguments:
     top_k: Optional[int] = None  # Top-k sampling setting for the response
     extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
-    @staticmethod
-    def from_args(args):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-        return Arguments(**args_dict)
-
     def __post_init__(self):
         # Set the default headers
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -108,12 +100,6 @@ class Arguments:
             self.parallel
         ), f'The length of number and parallel should be the same, but got number: {self.number} and parallel: {self.parallel}'  # noqa: E501
 
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
-    def to_dict(self) -> Dict[str, Any]:
-        return self.__dict__
-
 
 class ParseKVAction(argparse.Action):
 
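The removed from_args, __str__, and to_dict helpers are now inherited from the shared BaseArgument class imported from evalscope.utils (implemented in the new evalscope/utils/argument_utils.py, which is not expanded in this diff). A hypothetical sketch of such a base class, reconstructed only from the helpers deleted here:

```python
import json
from dataclasses import dataclass
from typing import Any, Dict, Optional

class BaseArgumentSketch:
    """Hypothetical stand-in for evalscope.utils.BaseArgument; the real
    implementation lives in evalscope/utils/argument_utils.py."""

    @classmethod
    def from_args(cls, args):
        # Convert an argparse Namespace to a dict and drop None values.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)  # compat with CLI sub-command dispatch
        return cls(**args_dict)

    def to_dict(self) -> Dict[str, Any]:
        return self.__dict__

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@dataclass
class DemoArguments(BaseArgumentSketch):
    # Minimal dataclass just to exercise the mixin.
    model: str
    api_url: Optional[str] = None

print(DemoArguments(model='qwen2.5'))  # pretty-printed JSON of the fields
```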
evalscope/perf/main.py
CHANGED
@@ -9,7 +9,7 @@ from argparse import Namespace
 from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
-from evalscope.utils.
+from evalscope.utils.model_utils import seed_everything
 from .arguments import Arguments, parse_args
 from .benchmark import benchmark
 from .utils.db_util import get_output_path
evalscope/perf/utils/analysis_result.py
CHANGED
@@ -3,27 +3,28 @@ import json
 import pickle
 import sqlite3
 
-
-
-
-    FROM result WHERE success='1'"
+db_path = 'your db path'
+conn = sqlite3.connect(db_path)
+cursor = conn.cursor()
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Get the column names
+cursor.execute('PRAGMA table_info(result)')
+columns = [info[1] for info in cursor.fetchall()]
+print('列名:', columns)
+
+cursor.execute('SELECT * FROM result WHERE success=1 AND first_chunk_latency > 1')
+rows = cursor.fetchall()
+print(f'len(rows): {len(rows)}')
+
+for row in rows:
+    row_dict = dict(zip(columns, row))
+    # Decode the request
+    row_dict['request'] = pickle.loads(base64.b64decode(row_dict['request']))
+    # Decode the response_messages
+    row_dict['response_messages'] = pickle.loads(base64.b64decode(row_dict['response_messages']))
+    # print(row_dict)
+    print(
+        f"request_id: {json.loads(row_dict['response_messages'][0])['id']}, first_chunk_latency: {row_dict['first_chunk_latency']}"  # noqa: E501
+    )
+    # If you only want to inspect one row, break here
+    # break
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-        self.time_per_output_token = self.n_chunks_time / self.n_chunks
+        self.time_per_output_token = self.n_chunks_time / self.n_chunks if self.n_chunks != 0 else 0.0
 
     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
evalscope/report/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import gen_table, get_data_frame, get_report_list
     from .generator import ReportGenerator
     from .utils import Category, Report, ReportKey, Subset
 