evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/aime/aime24_adapter.py +3 -3
- evalscope/benchmarks/aime/aime25_adapter.py +3 -3
- evalscope/benchmarks/arc/arc_adapter.py +15 -18
- evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
- evalscope/benchmarks/benchmark.py +12 -11
- evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
- evalscope/benchmarks/data_adapter.py +59 -21
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
- evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
- evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
- evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
- evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
- evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
- evalscope/benchmarks/musr/musr_adapter.py +8 -5
- evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
- evalscope/benchmarks/race/race_adapter.py +12 -16
- evalscope/benchmarks/simple_qa/__init__.py +0 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
- evalscope/benchmarks/super_gpqa/__init__.py +0 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
- evalscope/benchmarks/super_gpqa/utils.py +85 -0
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
- evalscope/benchmarks/utils.py +43 -0
- evalscope/collections/evaluator.py +14 -5
- evalscope/config.py +15 -2
- evalscope/constants.py +14 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/metrics/named_metrics.py +1 -0
- evalscope/models/__init__.py +2 -1
- evalscope/models/base_adapter.py +25 -5
- evalscope/models/chat_adapter.py +3 -0
- evalscope/models/choice_adapter.py +4 -0
- evalscope/models/custom_adapter.py +2 -0
- evalscope/models/register.py +28 -0
- evalscope/models/server_adapter.py +35 -8
- evalscope/perf/arguments.py +13 -7
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +15 -5
- evalscope/perf/main.py +1 -0
- evalscope/perf/utils/analysis_result.py +1 -1
- evalscope/report/app.py +3 -0
- evalscope/report/combinator.py +2 -2
- evalscope/run.py +6 -5
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/third_party/thinkbench/eval.py +220 -55
- evalscope/third_party/thinkbench/infer.py +37 -7
- evalscope/third_party/thinkbench/tools/llm.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
- evalscope/utils/chat_service.py +1 -0
- evalscope/utils/filters.py +59 -0
- evalscope/utils/logger.py +3 -3
- evalscope/version.py +2 -2
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +28 -2
- tests/cli/test_run.py +201 -32
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
evalscope/metrics/llm_judge.py
ADDED
@@ -0,0 +1,104 @@
+import os
+import re
+from typing import Any, Dict, List, Optional
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
+
+Question: {question}
+
+Reference Answer: {gold}
+
+Model Answer: {pred}
+
+Evaluate the model's answer based on correctness compared to the reference answer.
+Grade the predicted answer of this new question as one of:
+A: CORRECT
+B: INCORRECT
+
+Just return the letters "A" or "B", with no text around it.
+"""  # noqa: E501
+
+
+class LLMJudge:
+    """
+    A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
+    """
+
+    def __init__(self,
+                 api_key: Optional[str] = None,
+                 api_url: Optional[str] = None,
+                 model_id: Optional[str] = None,
+                 system_prompt: Optional[str] = None,
+                 prompt_template: Optional[str] = None,
+                 generation_config: Optional[Dict[str, Any]] = None,
+                 **kwargs):
+        """
+        Initialize LLMJudge metric.
+
+        Args:
+            api_key (str, optional): API key for OpenAI or compatible service
+            api_base (str, optional): API base URL
+            model_id (str, optional): Model ID for LLM
+            system_prompt (str, optional): System prompt for the judge
+            prompt_template (str, optional): Prompt template for the judge
+            generation_config (dict, optional): Generation configuration for the judge
+        """
+        self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
+        self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-3.5-turbo')
+        self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
+        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+        self.generation_config = generation_config
+
+        from evalscope.models.server_adapter import ServerModelAdapter
+
+        # Initialize ServerModelAdapter
+        self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
+
+    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> float:
+        """
+        Args:
+            prompt (str): The prompt to evaluate
+            system_prompt (str, optional): The system prompt to use for the evaluation
+        Returns:
+            float: The score of the evaluation
+        """
+        input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
+
+        # Inference configuration
+        infer_cfg = {'temperature': 0.0, 'max_tokens': 1024}
+        if self.generation_config:
+            infer_cfg.update(self.generation_config)
+
+        try:
+            # Send request using ServerModelAdapter
+            response = self.server_adapter.process_single_input(input_data, infer_cfg)
+
+            # Extract content from response
+            llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
+            return llm_response
+        except Exception as e:
+            logger.error(f'Error during LLM evaluation: {e}')
+            return None
+
+    def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
+        if question is None:
+            question = 'Not provided'
+        return self.prompt_template.format(question=question, pred=pred, gold=gold)
+
+    def get_score(self, response: str) -> float:
+        if response is None:
+            return 0
+        match = re.search(r'(A|B)', response)
+        if match:
+            answer = match.group(0)
+            if answer == 'A':
+                return 1
+            elif answer == 'B':
+                return 0
+        else:
+            return 0
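For reference, a minimal usage sketch of the LLMJudge metric added above. The judge endpoint URL and model name are placeholder values, and a running OpenAI-compatible judge service is assumed:

from evalscope.metrics.llm_judge import LLMJudge

# Point the judge at an OpenAI-compatible endpoint (placeholder URL and model for illustration).
judge = LLMJudge(api_url='http://127.0.0.1:8801/v1', model_id='qwen2.5-72b-instruct')

# Build the grading prompt from a prediction and a gold answer, ask the judge, map A/B to 1/0.
prompt = judge.build_prompt(pred='Paris', gold='Paris', question='What is the capital of France?')
response = judge(prompt)           # raw judge reply, e.g. 'A'
score = judge.get_score(response)  # 1 for 'A' (CORRECT), 0 for 'B' or no match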
evalscope/metrics/named_metrics.py
CHANGED
@@ -35,6 +35,7 @@ metric_registry = MetricRegistry()
 metric_registry.register(Metric(name='AverageAccuracy', object=mean))
 metric_registry.register(Metric(name='WeightedAverageAccuracy', object=weighted_mean))
 metric_registry.register(Metric(name='AverageBLEU', object=mean))
+metric_registry.register(Metric(name='AverageRouge', object=mean))
 metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean))
 metric_registry.register(Metric(name='AveragePass@1', object=mean))
 for k in range(1, 17):
evalscope/models/__init__.py
CHANGED
@@ -7,10 +7,11 @@ from evalscope.models.custom import CustomModel
 from evalscope.models.custom_adapter import CustomModelAdapter
 from evalscope.models.local_model import LocalModel, get_local_model
 from evalscope.models.model import BaseModel, ChatBaseModel, OpenAIModel
+from evalscope.models.register import get_model_adapter
 from evalscope.models.server_adapter import ServerModelAdapter
 
 __all__ = [
     'CustomModel', 'BaseModel', 'ChatBaseModel', 'OpenAIModel', 'BaseModelAdapter', 'ChatGenerationModelAdapter',
     'MultiChoiceModelAdapter', 'ContinuationLogitsModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter',
-    'LocalModel', 'get_local_model', 'initialize_model_adapter'
+    'LocalModel', 'get_local_model', 'initialize_model_adapter', 'get_model_adapter'
 ]
evalscope/models/base_adapter.py
CHANGED
@@ -1,15 +1,21 @@
 import torch
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, List, Optional, Union
 
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.models.custom import CustomModel
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import get_model_adapter, register_model_adapter
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 if TYPE_CHECKING:
+    from evalscope.benchmarks import BenchmarkMeta
     from evalscope.config import TaskConfig
 
 
+@register_model_adapter('base')
 class BaseModelAdapter(ABC):
 
     def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
@@ -33,7 +39,7 @@ class BaseModelAdapter(ABC):
         raise NotImplementedError
 
 
-def initialize_model_adapter(task_cfg: 'TaskConfig',
+def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'BenchmarkMeta', base_model: 'LocalModel'):
     """Initialize the model adapter based on the task configuration."""
     if task_cfg.dry_run:
         from evalscope.models.model import DummyChatModel
@@ -43,8 +49,14 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
         raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
         from evalscope.models import CustomModelAdapter
         return CustomModelAdapter(custom_model=task_cfg.model)
-    elif task_cfg.eval_type == EvalType.SERVICE:
+    elif task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
         from evalscope.models import ServerModelAdapter
+
+        if benchmark.model_adapter in [OutputType.CONTINUOUS, OutputType.MULTIPLE_CHOICE]:
+            logger.warning('Output type is set to logits. This is not supported for service evaluation. '
+                           'Setting output type to generation by default.')
+            benchmark.model_adapter = OutputType.GENERATION
+
         return ServerModelAdapter(
             api_url=task_cfg.api_url,
             model_id=task_cfg.model,
@@ -54,5 +66,13 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', model_adapter_cls: 'BaseMod
             stream=task_cfg.stream,
         )
     else:
-
+        # for local model, we need to determine the model adapter class based on the output type
+        model_adapter_cls = benchmark.model_adapter
+        if model_adapter_cls not in benchmark.output_types:
+            logger.warning(f'Output type {model_adapter_cls} is not supported for benchmark {benchmark.name}. '
+                           f'Using {benchmark.output_types[0]} instead.')
+            model_adapter_cls = benchmark.output_types[0]
+
+        model_adapter = get_model_adapter(model_adapter_cls)
+        return model_adapter(
             model=base_model, generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template)
evalscope/models/chat_adapter.py
CHANGED
@@ -3,8 +3,10 @@ import time
 import torch
 from typing import List, Union
 
+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
@@ -12,6 +14,7 @@ from evalscope.utils.model_utils import fix_do_sample_warning
 logger = get_logger()
 
 
+@register_model_adapter(OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.
evalscope/models/choice_adapter.py
CHANGED
@@ -3,11 +3,14 @@ import time
 import torch
 from typing import List
 
+from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 
 
+@register_model_adapter(OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """
 
@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}
 
 
+@register_model_adapter(OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/custom_adapter.py
CHANGED
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.custom import CustomModel
+from evalscope.models.register import register_model_adapter
 
 
+@register_model_adapter('custom')
 class CustomModelAdapter(BaseModelAdapter):
 
     def __init__(self, custom_model: CustomModel, **kwargs):
evalscope/models/register.py
ADDED
@@ -0,0 +1,28 @@
+MODEL_ADAPTERS = {}
+
+
+def register_model_adapter(name):
+    """
+    Decorator to register a model adapter with a given name.
+    :param name: The name of the model adapter.
+    """
+
+    def decorator(adapter):
+        if name in MODEL_ADAPTERS:
+            raise ValueError(f"Model adapter '{name}' is already registered.")
+        MODEL_ADAPTERS[name] = adapter
+        return adapter
+
+    return decorator
+
+
+def get_model_adapter(name):
+    """
+    Retrieve a registered model adapter by name.
+    :param name: The name of the model adapter.
+    :return: The model adapter class or function.
+    """
+    if name not in MODEL_ADAPTERS:
+        raise ValueError(
+            f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
+    return MODEL_ADAPTERS[name]
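A short sketch of the new adapter registry, mirroring how the built-in adapters in the surrounding diffs register themselves; the adapter name and class below are hypothetical and used only for illustration:

from evalscope.models.register import get_model_adapter, register_model_adapter

@register_model_adapter('my_adapter')  # hypothetical name; re-registering an existing name raises ValueError
class MyModelAdapter:
    def predict(self, inputs, infer_cfg=None):
        return inputs

adapter_cls = get_model_adapter('my_adapter')
assert adapter_cls is MyModelAdapter
# get_model_adapter('unknown') raises ValueError listing the registered adapter names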
evalscope/models/server_adapter.py
CHANGED
@@ -1,15 +1,18 @@
 import openai
 from collections import defaultdict
+from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.register import register_model_adapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 
+@register_model_adapter('server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.
@@ -30,6 +33,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
+        self.supported_params = self._get_supported_params()
 
         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -37,12 +41,16 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
 
-    def
+    def _get_supported_params(self):
+        sig = signature(self.client.chat.completions.create)
+        return list(sig.parameters.keys())
+
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.
 
         Args:
-            inputs (List[
+            inputs (List[dict]): The input data.
             infer_cfg (dict): Inference configuration.
 
         Returns:
@@ -104,34 +112,52 @@ class ServerModelAdapter(BaseModelAdapter):
             request_json['stream_options'] = {'include_usage': True}
 
         logger.debug(f'Request to remote API: {request_json}')
+
         return request_json
 
+    def _parse_extra_params(self, request_json):
+        api_params = {}
+        extra_body = {}
+        for key, value in request_json.items():
+            if key in self.supported_params:
+                api_params[key] = value
+            else:
+                extra_body[key] = value
+
+        if extra_body:
+            api_params['extra_body'] = extra_body
+        return api_params
+
     def send_request(self, request_json: dict) -> dict:
         try:
-
+            parsed_request = self._parse_extra_params(request_json)
+            response = self.client.chat.completions.create(**parsed_request)
 
-            if self.stream:
+            if response and self.stream:
                 response = self._collect_stream_response(response)
 
             return response.model_dump(exclude_unset=True)
         except Exception as e:
-            logger.error(f'Error when calling
+            logger.error(f'Error when calling remote API: {str(e)}')
             raise
 
     def _collect_stream_response(self, response_stream: List[ChatCompletionChunk]) -> ChatCompletion:
         collected_chunks = []
         collected_messages = defaultdict(list)
+        collected_reasoning = defaultdict(list)
 
         for chunk in response_stream:
            collected_chunks.append(chunk)
            for choice in chunk.choices:
+                if hasattr(choice.delta, 'reasoning_content') and choice.delta.reasoning_content is not None:
+                    collected_reasoning[choice.index].append(choice.delta.reasoning_content)
                if choice.delta.content is not None:
                    collected_messages[choice.index].append(choice.delta.content)
 
        choices = []
        for index, messages in collected_messages.items():
            full_reply_content = ''.join(messages)
-
+            reasoning = ''.join(collected_reasoning[index])
            # use the finish_reason from the last chunk that generated this choice
            finish_reason = None
            for chunk in reversed(collected_chunks):
@@ -140,9 +166,10 @@ class ServerModelAdapter(BaseModelAdapter):
                    break
 
            choice = Choice(
-                finish_reason=finish_reason,
+                finish_reason=finish_reason or 'stop',
                 index=index,
-                message=ChatCompletionMessage(
+                message=ChatCompletionMessage(
+                    role='assistant', content=full_reply_content, reasoning_content=reasoning))
            choices.append(choice)
 
        # build the final completion object
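A standalone sketch of the parameter-splitting idea behind the new _get_supported_params/_parse_extra_params pair: keyword arguments accepted by the OpenAI client's chat.completions.create() signature are passed directly, and everything else is moved under extra_body. The base_url below is a placeholder:

from inspect import signature
from openai import OpenAI

client = OpenAI(api_key='EMPTY', base_url='http://127.0.0.1:8801/v1')  # placeholder endpoint
supported = set(signature(client.chat.completions.create).parameters)

def split_params(request_json: dict) -> dict:
    # Keep keys the client signature knows about; tuck the rest into 'extra_body'.
    api_params, extra_body = {}, {}
    for key, value in request_json.items():
        (api_params if key in supported else extra_body)[key] = value
    if extra_body:
        api_params['extra_body'] = extra_body
    return api_params

print(split_params({'model': 'qwen2.5', 'messages': [], 'top_k': 20}))
# {'model': 'qwen2.5', 'messages': [], 'extra_body': {'top_k': 20}}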
evalscope/perf/arguments.py
CHANGED
@@ -21,9 +21,9 @@ class Arguments:
     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
     headers: Dict[str, Any] = field(default_factory=dict)  # Custom headers
-    connect_timeout: int =
-    read_timeout: int =
-    api_key: str =
+    connect_timeout: int = 600  # Connection timeout in seconds
+    read_timeout: int = 600  # Read timeout in seconds
+    api_key: Optional[str] = None
 
     # Performance and parallelism
     number: Optional[int] = None  # Number of requests to be made
@@ -125,7 +125,13 @@ class ParseKVAction(argparse.Action):
             setattr(namespace, self.dest, {})
         else:
             try:
-                kv_dict =
+                kv_dict = {}
+                for kv in values:
+                    parts = kv.split('=', 1)  # only split the first '='
+                    if len(parts) != 2:
+                        raise ValueError(f'Invalid key-value pair: {kv}')
+                    key, value = parts
+                    kv_dict[key.strip()] = value.strip()
                 setattr(namespace, self.dest, kv_dict)
             except ValueError as e:
                 parser.error(f'Error parsing key-value pairs: {e}')
@@ -144,9 +150,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
-    parser.add_argument('--api-key', type=str, required=False, default=
-    parser.add_argument('--connect-timeout', type=int, default=
-    parser.add_argument('--read-timeout', type=int, default=
+    parser.add_argument('--api-key', type=str, required=False, default=None, help='The API key for authentication')
+    parser.add_argument('--connect-timeout', type=int, default=600, help='The network connection timeout')
+    parser.add_argument('--read-timeout', type=int, default=600, help='The network read timeout')
 
     # Performance and parallelism
     parser.add_argument('-n', '--number', type=int, default=None, help='How many requests to be made')
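The reworked ParseKVAction now splits each pair on the first '=' only, so values that themselves contain '=' survive intact. A standalone approximation of that parsing logic:

def parse_kv(values):
    # Split each 'key=value' item on the first '=' only; strip surrounding whitespace.
    kv_dict = {}
    for kv in values:
        parts = kv.split('=', 1)
        if len(parts) != 2:
            raise ValueError(f'Invalid key-value pair: {kv}')
        key, value = parts
        kv_dict[key.strip()] = value.strip()
    return kv_dict

print(parse_kv(['Authorization=Bearer abc==', 'X-Trace=1']))
# {'Authorization': 'Bearer abc==', 'X-Trace': '1'}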
evalscope/perf/benchmark.py
CHANGED
@@ -209,9 +209,14 @@ async def benchmark(args: Arguments) -> None:
     loop = asyncio.get_running_loop()
     add_signal_handlers(loop)
 
+    # init queue
     request_queue = asyncio.Queue()
     benchmark_data_queue = asyncio.Queue()
 
+    # reset event
+    query_send_completed_event.clear()
+    data_process_completed_event.clear()
+
     async def create_send_request_tasks():
         tasks: List[asyncio.Task] = []
         for idx in range(args.parallel):
evalscope/perf/http_client.py
CHANGED
@@ -23,10 +23,7 @@ class AioHttpClient:
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
         self.client = aiohttp.ClientSession(
-            timeout=aiohttp.ClientTimeout(
-                total=self.read_timeout + self.connect_timeout,
-                connect=self.connect_timeout,
-                sock_read=self.read_timeout),
+            timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
             trace_configs=[self._create_trace_config()] if args.debug else [])
 
@@ -102,6 +99,11 @@ class AioHttpClient:
             async with self.client.request('POST', url=self.url, data=data, headers=headers) as response:
                 async for rsp in self._handle_response(response):
                     yield rsp
+        except asyncio.TimeoutError:
+            logger.error(
+                f'TimeoutError: connect_timeout: {self.connect_timeout}, read_timeout: {self.read_timeout}. Please set longger timeout.'  # noqa: E501
+            )
+            yield (True, None, 'Timeout')
         except (aiohttp.ClientConnectorError, Exception) as e:
             logger.error(e)
             yield (True, None, e)
@@ -143,7 +145,15 @@ async def test_connection(args: Arguments) -> bool:
     client = AioHttpClient(args)
     async with client:
         if 'chat/completions' in args.url:
-            request = {
+            request = {
+                'messages': [{
+                    'role': 'user',
+                    'content': 'hello'
+                }],
+                'model': args.model,
+                'max_tokens': 10,
+                'stream': args.stream
+            }
         else:
             request = {'prompt': 'hello', 'model': args.model, 'max_tokens': 10}
         async for is_error, state_code, response_data in client.post(request):
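The timeout change above drops the overall 'total' budget and keeps only a connect timeout and a per-read socket timeout, which matters for long streaming responses. A minimal sketch of the resulting session construction, assuming the new 600-second defaults:

import asyncio
import aiohttp

async def make_session() -> aiohttp.ClientSession:
    # No ClientTimeout(total=...); only connect and per-read budgets apply.
    timeout = aiohttp.ClientTimeout(connect=600, sock_read=600)
    return aiohttp.ClientSession(timeout=timeout, connector=aiohttp.TCPConnector(limit=1))

async def main():
    session = await make_session()
    await session.close()

asyncio.run(main())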
evalscope/perf/main.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import pickle
 import sqlite3
 
-result_db_path = '
+result_db_path = './outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
     FROM result WHERE success='1'"
evalscope/report/app.py
CHANGED
@@ -125,6 +125,9 @@ def get_compare_report_df(acc_df: pd.DataFrame):
 
 
 def plot_single_report_scores(df: pd.DataFrame):
+    if df is None:
+        return None
+    logger.debug(f'df: {df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = 0.2 if len(df[ReportKey.dataset_name]) <= 5 else None
evalscope/report/combinator.py
CHANGED
@@ -57,8 +57,8 @@ class ReportsRecorder:
 
 
 if __name__ == '__main__':
-    report_dir_1 = '
-    # report_dir_2 = '
+    report_dir_1 = './outputs/20250117_151926'
+    # report_dir_2 = './outputs/20250107_204445/reports'
 
     report_table = gen_table([report_dir_1])
     print(report_table)
evalscope/run.py
CHANGED
@@ -2,7 +2,7 @@
 """
 Run evaluation for LLMs.
 """
-import os
+import os
 from argparse import Namespace
 from datetime import datetime
 from typing import TYPE_CHECKING, List, Optional, Union
@@ -127,16 +127,17 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
     from evalscope.models import initialize_model_adapter
 
     benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
-    # Initialize data adapter
-    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
 
     if dataset_name == DataCollection.NAME:
         # EvaluatorCollection is a collection of evaluators
         from evalscope.collections import EvaluatorCollection
-
+        data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+        return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
 
     # Initialize model adapter
-    model_adapter = initialize_model_adapter(task_cfg, benchmark
+    model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
+    # Initialize data adapter
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
 
     # update task_cfg.dataset_args
     task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
evalscope/third_party/longbench_write/infer.py
CHANGED
@@ -8,7 +8,7 @@ import random
 import torch
 from typing import List
 
-from evalscope.
+from evalscope.third_party.longbench_write.tools.openai_api import OpenaiApi
 from evalscope.third_party.longbench_write.utils import count_words
 from evalscope.utils import get_logger
 