evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/__init__.py
File without changes

evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -0,0 +1,79 @@
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.utils import ResponseParser
+
+SUBSET_LIST = ['default']
+
+
+@Benchmark.register(
+    name='maritime_bench',
+    pretty_name='MaritimeBench',
+    dataset_id='HiDolphin/MaritimeBench',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    eval_split='test',
+    prompt_template=
+    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}', # noqa: E501
+)
+class MaritimeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        prefix = ''
+        query = prefix + input_d['question'] + '\n'
+        available_choices = []
+        for option in self.choices:
+            if option in input_d and input_d[option]:
+                query += option + ':' + input_d[option] + '\n'
+                available_choices.append(option)
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the raw model prediction (pred).
+
+        Args:
+            pred: model prediction. Depending on the model.
+
+        Returns:
+            The parsed prediction. e.g. model answer... Depending on the model.
+        """
+
+        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer with the predicted answer.
+
+        Args:
+            gold: The gold answer.
+            pred: The predicted answer.
+
+        Returns:
+            The result of the match.
+        """
+        return exact_match(gold=gold, pred=pred)
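For reference, a minimal usage sketch of the newly registered benchmark through evalscope's Python entry points. The run_task/TaskConfig names and the model id below are assumptions for illustration and are not taken from this diff.

# Minimal sketch, assuming evalscope's public TaskConfig/run_task API.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',   # placeholder model id
    datasets=['maritime_bench'],        # name registered by @Benchmark.register above
    limit=10,                           # small smoke-test run
)
run_task(task_cfg)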
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
     train_split='train',
     eval_split='test',
     prompt_template=
-
+    """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""", # noqa: E501
 )
 class MMLUAdapter(DataAdapter):
 
@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
         return self.gen_prompt_data(full_prompt)
 
@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
@@ -260,11 +259,10 @@ class MMLUAdapter(DataAdapter):
 
         example: str = input_d['input']
         for j in range(len(self.choices)):
-            example += '\n{
+            example += f'\n{self.choices[j]}) {input_choices[j]}'
 
-        example += '\nAnswer:'
         if include_answer:
-            example +=
+            example += f"\nAnswer: {input_d['target']}\n\n"
 
         return example
 
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/mmlu_redux/__init__.py
File without changes

evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result, options=self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
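To make the prompt construction above concrete, here is an illustrative re-derivation of gen_prompt/__form_options on a made-up record; the formatting logic mirrors the code in this file, the record itself is not from the dataset.

# Toy record, following the dataset fields used above ('question', 'choices', 'answer').
record = {'question': 'What is 2 + 2?', 'choices': ['3', '4', '5', '22'], 'answer': 1}
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

# __form_options: "(A): 3\n(B): 4\n..."; gen_prompt prepends "Q: <question>\n".
options = 'Options are:\n' + ''.join(f'({c}): {o}\n' for o, c in zip(record['choices'], choices))
query = 'Q: ' + record['question'] + '\n' + options + '\n'

template = ('The following are multiple choice questions (with answers) about {subset_name}. '
            'Think step by step and then finish your answer with "the answer is (X)" '
            'where X is the correct letter choice.\n{query}')
full_prompt = template.format(subset_name='elementary_mathematics', query=query)
gold = choices[int(record['answer'])]   # get_gold_answer -> 'B'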
evalscope/benchmarks/musr/musr_adapter.py
CHANGED
@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED
@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict =
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)
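The removed loop (and, presumably, the compute_dict_metric helper that replaces it) zips the per-sample review dicts into a dict of lists. An illustrative restatement, assuming the truncated "res_dict =" line was a defaultdict(list):

from collections import defaultdict

review_res_list = [
    {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
    {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
]
res_dict = defaultdict(list)   # assumption about the truncated old line
for res in review_res_list:
    for key, value in res.items():
        res_dict[key].append(value)
# -> {'is_correct': [1, 0], 'is_incorrect': [0, 1], 'is_not_attempted': [0, 0]}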
evalscope/collections/evaluator.py
CHANGED
@@ -65,7 +65,7 @@ class EvaluatorCollection:
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_name = os.path.basename(self.data_adapter.dataset_id)
+        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # limit the dataset
         if self.task_cfg.limit:
@@ -174,6 +174,7 @@ class EvaluatorCollection:
         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        return report
 
     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
@@ -274,4 +275,5 @@ class EvaluatorCollection:
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
         scores = self.get_scores(reviews)
-        self.get_report(scores)
+        report = self.get_report(scores)
+        return report
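The load() change above trims the file extension when deriving the collection's dataset name; a quick standard-library illustration (the path below is hypothetical):

import os

dataset_id = 'outputs/my_collection.jsonl'             # hypothetical collection path
os.path.basename(dataset_id)                           # old behavior -> 'my_collection.jsonl'
os.path.splitext(os.path.basename(dataset_id))[0]      # new behavior -> 'my_collection'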
evalscope/config.py
CHANGED
@@ -75,7 +75,7 @@ class TaskConfig:
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
-    judge_worker_num: int =
+    judge_worker_num: int = 1
     judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
 
     def __post_init__(self):
@@ -212,7 +212,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension =
+        extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
         if extension in ['yaml', 'yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
evalscope/metrics/llm_judge.py
CHANGED
@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
evalscope/models/chat_adapter.py
CHANGED
@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
 
@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return generation_config
 
-    def _model_generate(self,
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
 
+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)
 
-        return responses
+        return responses, input_lengths
 
     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
 
-
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
 
+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-
-
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)
 
             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=
+                usage=usage).model_dump(exclude_unset=True)
 
             results.append(res_d)
 
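The predict() path now reports token usage by re-encoding prompts and completions with the model tokenizer. A standalone sketch of the same accounting; the checkpoint name is a placeholder, not something referenced by this diff:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint

prompt = 'What is the capital of France?'
completion = 'The capital of France is Paris.'

prompt_tokens = len(tokenizer.encode(prompt))
completion_tokens = len(tokenizer.encode(completion))
usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}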
evalscope/perf/arguments.py
CHANGED
@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number:
+    number: int = 1000 # Number of requests to be made
     parallel: int = 1 # Number of parallel requests
     rate: int = -1 # Rate limit for requests (default: -1, no limit)
 
@@ -35,6 +35,7 @@ class Arguments:
     log_every_n_query: int = 10 # Log every N queries
     debug: bool = False # Debug mode
     wandb_api_key: Optional[str] = None # WandB API key for logging
+    swanlab_api_key: Optional[str] = None # SwanLab API key for logging
     name: Optional[str] = None # Name for the run
 
     # Output settings
@@ -46,6 +47,7 @@ class Arguments:
     prefix_length: int = 0 # Length of the prefix, only for random dataset
     prompt: Optional[str] = None # The prompt text
     query_template: Optional[str] = None # Template for the query
+    apply_chat_template: Optional[bool] = None # Whether to apply chat template
 
     # Dataset settings
     dataset: str = 'openqa' # Dataset type (default: 'line_by_line')
@@ -57,13 +59,14 @@ class Arguments:
     max_tokens: Optional[int] = 2048 # Maximum number of tokens in the response
     min_tokens: Optional[int] = None # Minimum number of tokens in the response
     n_choices: Optional[int] = None # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = 0 # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list) # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list) # Stop token IDs for the response
-    stream: Optional[bool] =
-    temperature:
+    stream: Optional[bool] = True # Whether to stream the response
+    temperature: float = 0.0 # Temperature setting for the response
     top_p: Optional[float] = None # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None # Extra arguments
 
     @staticmethod
     def from_args(args):
@@ -75,12 +78,26 @@ class Arguments:
         return Arguments(**args_dict)
 
     def __post_init__(self):
+        # Set the default headers
         self.headers = self.headers or {} # Default to empty dictionary
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+
+        # Set the model ID based on the model name
         self.model_id = os.path.basename(self.model)
 
+        # Set the URL based on the dataset type
+        if self.api.startswith('local'):
+            if self.dataset.startswith('speed_benchmark'):
+                self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+            else:
+                self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+        # Set the apply_chat_template flag based on the URL
+        if self.apply_chat_template is None:
+            self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -126,7 +143,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark') # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
@@ -134,7 +151,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-    parser.add_argument('--
+    parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+    parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
 
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
@@ -142,6 +160,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+    parser.add_argument(
+        '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt') # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -158,13 +178,14 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=0)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable
 
 
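Two of the new knobs, restated outside the dataclass for clarity: --extra-args is parsed by argparse via json.loads, and apply_chat_template (when left unset) is derived from the request URL in __post_init__. The JSON payload and URL below are placeholders, not values taken from this diff.

import json

# --extra-args '{"ignore_eos": true}' arrives as a JSON string and becomes a dict:
extra_args = json.loads('{"ignore_eos": true}')
# -> {'ignore_eos': True}, stored on Arguments.extra_args

# apply_chat_template default, mirroring __post_init__ above:
url = 'http://127.0.0.1:8877/v1/chat/completions'   # placeholder local endpoint
apply_chat_template = url.strip('/').endswith('chat/completions')   # -> True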