evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +6 -3
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +19 -46
- evalscope/perf/benchmark.py +64 -90
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +5 -2
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +69 -22
- tests/perf/test_perf.py +23 -0
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
ADDED

@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}',  # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
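
For orientation, a benchmark registered this way is selected by its name like any other dataset. A minimal smoke-test sketch follows; the TaskConfig fields, model name, and endpoint below are assumptions for illustration and are not taken from this diff:

from evalscope.config import TaskConfig
from evalscope.run import run_task

# Hypothetical smoke test: evaluate a served model on the new MMLU-Redux benchmark.
task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',                           # placeholder model name
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder OpenAI-compatible endpoint
    eval_type='service',
    datasets=['mmlu_redux'],   # name registered by @Benchmark.register above
    limit=10,                  # keep the run small
)
run_task(task_cfg=task_cfg)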
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED

@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict =
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)
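
The removed inline aggregation in the second hunk is the usual per-key zip (its first line is truncated in this view, presumably a defaultdict(list)); the base DataAdapter's compute_dict_metric now centralizes it. A standalone sketch of that aggregation pattern, not evalscope code:

from collections import defaultdict

# Turn a list of per-sample result dicts into a dict of per-key lists,
# which is what the removed loop did and compute_dict_metric now provides.
review_res_list = [
    {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
    {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
]
res_dict = defaultdict(list)
for res in review_res_list:
    for key, value in res.items():
        res_dict[key].append(value)
print(dict(res_dict))
# {'is_correct': [1, 0], 'is_incorrect': [0, 1], 'is_not_attempted': [0, 0]}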
evalscope/collections/evaluator.py
CHANGED

@@ -181,7 +181,7 @@ class EvaluatorCollection:
         answers_list = jsonl_to_list(pred_file_path)
         indices = set()
         for answer in answers_list:
-            index = answer
+            index = answer.get(AnswerKeys.INDEX)
             answer_dict[index] = answer
             indices.add(index)
         data = []
evalscope/config.py
CHANGED
@@ -75,13 +75,13 @@ class TaskConfig:
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
-    judge_worker_num: int =
+    judge_worker_num: int = 1
    judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
 
     def __post_init__(self):
         if (not self.model_id) and self.model:
             if isinstance(self.model, CustomModel):
-                self.model_id =
+                self.model_id = self.model.config.get('model_id', 'custom_model')
             else:
                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
                 # fix path error, see http://github.com/modelscope/evalscope/issues/377
@@ -92,7 +92,10 @@ class TaskConfig:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
     def to_dict(self):
-
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
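
The to_dict change is what keeps __str__ (which routes through json.dumps) working when model is a CustomModel instance rather than a path string. An illustrative standalone sketch of the same idea; the class names here are stand-ins, not evalscope's:

import json
from dataclasses import dataclass, field

class MyCustomModel:  # stand-in for evalscope's CustomModel
    pass

@dataclass
class MiniConfig:
    model: object = None
    datasets: list = field(default_factory=list)

    def to_dict(self):
        result = self.__dict__.copy()
        if isinstance(self.model, MyCustomModel):
            # a raw object would break json.dumps, so store its class name instead
            result['model'] = self.model.__class__.__name__
        return result

cfg = MiniConfig(model=MyCustomModel(), datasets=['mmlu_redux'])
print(json.dumps(cfg.to_dict()))  # model serialized as "MyCustomModel"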
evalscope/constants.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED
@@ -81,7 +81,7 @@ class Evaluator(object):
         for subset_name, prompts_list in prompts.items():
             limit = self.task_cfg.limit or len(prompts_list)
             for index, prompt in enumerate(prompts_list[:limit]):
-                prompt[
+                prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)
 
         return limited_prompts
@@ -97,7 +97,8 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
@@ -117,7 +118,7 @@ class Evaluator(object):
             return answers_list, prompts_list
 
         def get_answered_indices(answers_list: List[Dict]) -> List[int]:
-            indices = [answer
+            indices = [answer.get(AnswerKeys.INDEX) for answer in answers_list]
 
             if all(index is None for index in indices):
                 return list(range(len(answers_list)))
@@ -238,7 +239,7 @@ class Evaluator(object):
             pred = pred_content
 
             choice[ReviewKeys.REVIEW] = {
-                ReviewKeys.GOLD: gold_content,
+                ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
                 ReviewKeys.PRED: pred,
                 ReviewKeys.RESULT: review_result
             }
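
Taken together, these hunks switch answer caching from positional matching to an explicit per-prompt index. A standalone sketch of the resume behaviour; the literal 'index' key and the trailing filter are assumptions, since only the first lines of get_answered_indices appear in the diff:

from typing import Dict, List

def get_answered_indices(answers_list: List[Dict]) -> List[int]:
    # Each cached answer now carries the index of the prompt it belongs to.
    indices = [answer.get('index') for answer in answers_list]
    if all(index is None for index in indices):
        # Legacy cache without indices: fall back to positional order.
        return list(range(len(answers_list)))
    return [i for i in indices if i is not None]

print(get_answered_indices([{'index': 0}, {'index': 2}]))  # [0, 2]: prompt 1 still needs inference
print(get_answered_indices([{}, {}]))                      # [0, 1]: old cache, assume in-order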
evalscope/metrics/llm_judge.py
CHANGED
@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
evalscope/models/chat_adapter.py
CHANGED
@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
 
@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return generation_config
 
-    def _model_generate(self,
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
 
+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)
 
-        return responses
+        return responses, input_lengths
 
     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
 
-
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
 
+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-
-
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)
 
             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=
+                usage=usage).model_dump(exclude_unset=True)
 
             results.append(res_d)
 
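
The new Usage bookkeeping simply re-encodes prompts and completions with the adapter's tokenizer. A standalone sketch of the same counting with a Hugging Face tokenizer; the checkpoint name is a placeholder and evalscope's Usage class is replaced by a plain dict:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint

prompt = 'Q: What is 2 + 2?\nOptions are:\n(A): 3\n(B): 4\n'
completion = 'The answer is (B).'

# Count tokens the same way the adapter does: re-encode both sides.
prompt_tokens = len(tokenizer.encode(prompt))
completion_tokens = len(tokenizer.encode(completion))

usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}
print(usage)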
evalscope/models/custom_adapter.py
CHANGED

@@ -66,4 +66,4 @@ class CustomModelAdapter(BaseModelAdapter):
             else:
                 raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
 
-        return self.custom_model.predict(prompts=in_prompts, **kwargs)
+        return self.custom_model.predict(prompts=in_prompts, origin_inputs=inputs, **kwargs)
evalscope/perf/arguments.py
CHANGED
@@ -24,9 +24,10 @@ class Arguments:
     connect_timeout: int = 600  # Connection timeout in seconds
     read_timeout: int = 600  # Read timeout in seconds
     api_key: Optional[str] = None
+    no_test_connection: bool = False  # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number:
+    number: int = 1000  # Number of requests to be made
     parallel: int = 1  # Number of parallel requests
     rate: int = -1  # Rate limit for requests (default: -1, no limit)
 
@@ -40,8 +41,9 @@ class Arguments:
     outputs_dir: str = DEFAULT_WORK_DIR
 
     # Prompt settings
-    max_prompt_length: int =
+    max_prompt_length: int = 131072  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
+    prefix_length: int = 0  # Length of the prefix, only for random dataset
     prompt: Optional[str] = None  # The prompt text
     query_template: Optional[str] = None  # Template for the query
 
@@ -58,51 +60,20 @@ class Arguments:
     seed: Optional[int] = 42  # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list)  # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list)  # Stop token IDs for the response
-    stream: Optional[bool] =
-    temperature:
+    stream: Optional[bool] = False  # Whether to stream the response
+    temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None  # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None  # Extra arguments
 
     @staticmethod
     def from_args(args):
-
-
-
-
-
-
-            connect_timeout=args.connect_timeout,
-            read_timeout=args.read_timeout,
-            number=args.number,
-            parallel=args.parallel,
-            rate=args.rate,
-            log_every_n_query=args.log_every_n_query,
-            headers=args.headers,
-            wandb_api_key=args.wandb_api_key,
-            name=args.name,
-            outputs_dir=args.outputs_dir,
-            debug=args.debug,
-            tokenizer_path=args.tokenizer_path,
-            api=args.api,
-            max_prompt_length=args.max_prompt_length,
-            min_prompt_length=args.min_prompt_length,
-            prompt=args.prompt,
-            query_template=args.query_template,
-            dataset=args.dataset,
-            dataset_path=args.dataset_path,
-            frequency_penalty=args.frequency_penalty,
-            logprobs=args.logprobs,
-            max_tokens=args.max_tokens,
-            min_tokens=args.min_tokens,
-            n_choices=args.n_choices,
-            seed=args.seed,
-            stop=args.stop,
-            stop_token_ids=args.stop_token_ids,
-            stream=args.stream,
-            temperature=args.temperature,
-            top_p=args.top_p,
-            top_k=args.top_k,
-        )
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
+        return Arguments(**args_dict)
 
     def __post_init__(self):
         self.headers = self.headers or {}  # Default to empty dictionary
@@ -153,9 +124,10 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
     parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
+    parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark')  # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
@@ -168,6 +140,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
     parser.add_argument('--min-prompt-length', type=int, default=0, help='Minimum input prompt length')
+    parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
 
@@ -189,11 +162,11 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--seed', type=int, help='The random seed', default=42)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=False)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
    parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
-
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable
 
 
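
The rewritten from_args drops the hand-maintained keyword list in favour of rebuilding the dataclass straight from the parsed Namespace. A standalone sketch of that pattern, plain stdlib rather than evalscope code:

import argparse
from dataclasses import dataclass

@dataclass
class DemoArgs:
    number: int = 1000
    parallel: int = 1
    stream: bool = False

parser = argparse.ArgumentParser()
parser.add_argument('-n', '--number', type=int, default=None)
parser.add_argument('--parallel', type=int, default=None)
parser.add_argument('--stream', action='store_true', default=None)

ns = parser.parse_args(['--parallel', '4'])

# Drop None values so the dataclass defaults win, and strip CLI bookkeeping keys.
args_dict = {k: v for k, v in vars(ns).items() if v is not None}
args_dict.pop('func', None)
print(DemoArgs(**args_dict))  # DemoArgs(number=1000, parallel=4, stream=False)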