evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/models/local_model.py ADDED
@@ -0,0 +1,74 @@
+import torch
+from typing import TYPE_CHECKING, Optional
+
+from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class LocalModel:
+
+    def __init__(self,
+                 model_id: str,
+                 model_revision: str = DEFAULT_MODEL_REVISION,
+                 device_map: str = 'auto',
+                 torch_dtype: str = 'auto',
+                 cache_dir: str = None,
+                 **kwargs):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = eval(torch_dtype)
+
+        self.model_id = model_id
+        self.model_revision = model_revision
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )
+
+        self.model_cfg = {
+            'model_id': model_id,
+            'device_map': device_map,
+            'torch_dtype': str(torch_dtype),
+        }
+
+
+def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
+    """Get the base local model for the task. If the task is not checkpoint-based, return None.
+    Avoids loading model multiple times for different datasets.
+    """
+    if task_cfg.eval_type != EvalType.CHECKPOINT:
+        return None
+    else:
+        device_map = task_cfg.model_args.get('device_map', 'auto')
+        cache_dir = task_cfg.model_args.get('cache_dir', None)
+        model_precision = task_cfg.model_args.get('precision', 'torch.float16')
+        model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
+
+        base_model = LocalModel(
+            model_id=task_cfg.model,
+            model_revision=model_revision,
+            device_map=device_map,
+            torch_dtype=model_precision,
+            cache_dir=cache_dir)
+        return base_model
evalscope/models/model.py
CHANGED
@@ -1,7 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import random
+import time
 from abc import ABC, abstractmethod
 from typing import Any

+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+

 class BaseModel(ABC):

@@ -86,3 +93,137 @@ class ChatBaseModel(BaseModel):
         }
         """
         raise NotImplementedError
+
+
+class OpenAIModel(ChatBaseModel):
+    """
+    APIs of OpenAI models.
+    Available models: gpt-3.5-turbo, gpt-4
+    """
+
+    MAX_RETRIES = 3
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
+
+        openai_api_key = os.environ.get('OPENAI_API_KEY', None)
+        self.api_key = self.model_cfg.get('api_key', openai_api_key)
+
+        if not self.api_key:
+            logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
+            # raise ValueError(
+            #     'OpenAI API key is not provided, '
+            #     'please set it in environment variable OPENAI_API_KEY')
+
+    def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
+
+        sys_prompt: str = inputs.get('sys_prompt', '')
+        user_prompt: str = inputs.get('user_prompt', '')
+
+        # model_id: str = kwargs.get('model_id', '')
+        temperature: float = kwargs.pop('temperature', 0.2)
+        max_tokens: int = kwargs.pop('max_tokens', 1024)
+        mode: str = kwargs.pop('mode', 'chat.completion')
+
+        logger.info(f'Using OpenAI model_id: {model_id}')
+
+        res = self._predict(
+            model_id=model_id,
+            sys_prompt=sys_prompt,
+            user_prompt=user_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            mode=mode)
+
+        return res
+
+    def _predict(
+        self,
+        model_id,
+        sys_prompt,
+        user_prompt,
+        temperature,
+        max_tokens,
+        mode: str = 'chat.completion',
+    ) -> dict:
+        import openai
+
+        res = {}
+        openai.api_key = self.api_key
+
+        for i in range(self.MAX_RETRIES):
+            try:
+                if mode == 'chat.completion':
+                    resp = openai.ChatCompletion.create(
+                        model=model_id,
+                        messages=[{
+                            'role': 'system',
+                            'content': sys_prompt
+                        }, {
+                            'role': 'user',
+                            'content': user_prompt
+                        }],
+                        temperature=temperature,
+                        max_tokens=max_tokens)
+
+                    if resp:
+                        ans_text = resp['choices'][0]['message']['content']
+                        model_id = resp['model']
+                    else:
+                        logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                       f'for input {sys_prompt} {user_prompt}')
+                        ans_text = ''
+                        model_id = ''
+
+                    res['ans_text'] = ans_text
+                    res['model_id'] = model_id
+                else:
+                    raise ValueError(f'Invalid mode: {mode}')
+
+                return res
+
+            except Exception as e:
+                logger.warning(f'OpenAI API call failed: {e}')
+                time.sleep(3)
+        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+        return res
+
+
+class DummyChatModel(ChatBaseModel):
+
+    MODEL_ID = 'dummy_chat_model_0801'
+    REVISION = 'v1.0.0'
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        model_cfg['model_id'] = self.MODEL_ID
+        model_cfg['revision'] = self.REVISION
+        super(DummyChatModel, self).__init__(model_cfg=model_cfg)
+
+    def predict(self, inputs: dict, **kwargs) -> dict:
+
+        debug: bool = False
+        if debug:
+            messages = inputs['messages']
+            history = inputs['history']
+
+            logger.info(f'** messages: {messages}')
+            logger.info(f'** history: {history}')
+
+        choice = random.choice(['A', 'B', 'C', 'D'])
+
+        # Build response
+        res = {
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
+                }
+            }],
+            'created': time.time(),
+            'model': self.MODEL_ID + '-' + self.REVISION,
+            'object': 'chat.completion',
+            'usage': {}
+        }
+
+        return res
evalscope/models/server_adapter.py ADDED
@@ -0,0 +1,111 @@
+import requests
+import time
+from typing import Optional, Union
+
+from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.utils.chat_service import ChatMessage
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class ServerModelAdapter(BaseModelAdapter):
+    """
+    Server model adapter to request remote API model and generate results.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        self.api_url = api_url
+        self.model_id = model_id
+        self.api_key = api_key
+        self.seed = kwargs.get('seed', None)
+        self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
+        super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
+
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+        """
+        Model prediction func.
+
+        Args:
+            inputs (Union[str, dict, list]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (dict): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+
+        # Process inputs
+        if isinstance(inputs, str):
+            query = inputs
+            system_prompt = None
+        elif isinstance(inputs, dict):
+            data: list = inputs['data']
+            if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+                query = '\n'.join(''.join(item) for item in data)
+                system_prompt = inputs.get('system_prompt', None)
+            else:
+                query = data[0]
+                system_prompt = inputs.get('system_prompt', None)
+        elif isinstance(inputs, list):
+            query = '\n'.join(inputs)
+            system_prompt = None
+        else:
+            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+        content = self.make_request_content(query, system_prompt)
+        request_json = self.make_request(content, infer_cfg)
+        response = self.send_request(request_json)
+        return response
+
+    def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> dict:
+        """
+        Make request content for API.
+        """
+        if system_prompt is not None:
+            messages = [
+                ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
+                ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
+            ]
+        else:
+            messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
+        return {'messages': messages}
+
+    def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
+        """Make request to remote API."""
+        # Format request JSON according to OpenAI API format
+        do_sample = infer_cfg.get('do_sample', False)
+        temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
+
+        request_json = {
+            **content, 'model': self.model_id,
+            'max_tokens': infer_cfg.get('max_tokens', 2048),
+            'temperature': temperature,
+            'top_p': infer_cfg.get('top_p', 1.0),
+            'n': infer_cfg.get('num_return_sequences', 1),
+            'stop': infer_cfg.get('stop', None)
+        }
+        if self.seed is not None:
+            request_json['seed'] = self.seed
+        logger.debug(f'Request to remote API: {request_json}')
+        return request_json
+
+    def send_request(self, request_json: dict, max_retries: int = 3) -> dict:
+        for attempt in range(max_retries):
+            response = requests.post(
+                self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
+            if response.status_code == 200:
+                response_data = response.json()
+                return response_data
+            logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
+            if attempt < max_retries - 1:
+                time.sleep(5)  # Sleep for 5 seconds before retrying
+            else:
+                raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
+                                   f'{response.status_code} {response.text}')
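
A minimal, hedged usage sketch of the adapter above; the endpoint URL, model id and the OpenAI-style response shape are assumptions for illustration, not values from this diff.

from evalscope.models.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    model_id='my-served-model',                           # placeholder model id
    api_key='EMPTY',
)
resp = adapter.predict('What is the capital of France?', infer_cfg={'max_tokens': 64})
# Assuming an OpenAI-compatible server, the answer sits in the first choice:
print(resp['choices'][0]['message']['content'])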
evalscope/perf/__init__.py
CHANGED
@@ -0,0 +1 @@
+from evalscope.perf.main import run_perf_benchmark
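
With this re-export, the perf entry point can be driven programmatically. A sketch, assuming run_perf_benchmark accepts a plain dict of the Arguments fields shown in the hunks below; all values here are placeholders.

from evalscope.perf import run_perf_benchmark

task = {
    'url': 'http://127.0.0.1:8877/v1/chat/completions',  # target endpoint
    'api': 'openai',                                     # API plugin to use
    'model': 'my-served-model',                          # placeholder model name
    'api_key': 'EMPTY',
}
run_perf_benchmark(task)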
evalscope/perf/arguments.py
CHANGED
@@ -16,7 +16,7 @@ class Arguments:
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
-    port:
+    port: int = 8877  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -68,6 +68,7 @@ class Arguments:
             model=args.model,
             attn_implementation=args.attn_implementation,
             url=args.url,
+            port=args.port,
             api_key=args.api_key,
             connect_timeout=args.connect_timeout,
             read_timeout=args.read_timeout,
@@ -138,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
evalscope/perf/benchmark.py
CHANGED
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
         try:
             # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=
+            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
             benchmark_data_queue.task_done()
         except asyncio.TimeoutError:
             # If timeout, continue to the next iteration
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()

     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
evalscope/perf/main.py
CHANGED
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 import os
 import platform
 from argparse import Namespace
@@ -8,7 +7,7 @@ from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
 from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

 logger = get_logger()
@@ -19,14 +18,13 @@ def run_perf_benchmark(args):
         args = Arguments(**args)
     elif isinstance(args, Namespace):
         args = Arguments.from_args(args)
-
+
+    if args.seed is not None:
+        seed_everything(args.seed)

     # Setup logger and output
     args.outputs_dir = get_output_path(args)
-
-
-    if args.debug:
-        get_logger(log_level=logging.DEBUG, force=True)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))

     logger.info('Starting benchmark...')
     logger.info(args)
evalscope/perf/plugin/api/custom_api.py
CHANGED
@@ -1,5 +1,4 @@
 import json
-from transformers import AutoTokenizer
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None
evalscope/perf/plugin/api/openai_api.py
CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
-from
-from typing import Any, Dict, Iterator, List
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -25,11 +24,12 @@ class OpenaiPlugin(ApiPluginBase):
         """
         super().__init__(model_path=mode_path)
         if mode_path is not None:
+            from transformers import AutoTokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict]
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:
@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):

     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-
-
+        Only one response for non-stream, multiple responses for stream.
+        """

+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens

-
-            responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
-            kwargs: (Any): The command line --parameter content.
-        Returns:
-            Tuple: Return number of prompt token and number of completion tokens.
-        """
-        full_response_content = ''
+        # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
-        input_tokens = None
-        output_tokens = None
         for response in responses:
             js = json.loads(response)
-            if
-            … (old lines 117-145 are truncated in this diff view)
+            if 'object' in js:
+                self.__process_response_object(js, delta_contents)
+            else:
+                self.__process_no_object(js, delta_contents)
+
+        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+        return input_tokens, output_tokens
+
+    def __process_response_object(self, js, delta_contents):
+        if js['object'] == 'chat.completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['message']['content']]
+        elif js['object'] == 'text_completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['text']]
+        elif js['object'] == 'chat.completion.chunk':
+            for choice in js.get('choices', []):
+                if 'delta' in choice and 'index' in choice:
+                    delta = choice['delta']
+                    idx = choice['index']
+                    if 'content' in delta:
+                        delta_content = delta['content']
+                        delta_contents.setdefault(idx, []).append(delta_content)
+
+    def __process_no_object(self, js, delta_contents):
+        # assume the response is a single choice
+        for choice in js['choices']:
+            if 'delta' in choice:
+                delta = choice['delta']
+                idx = choice['index']
+                if 'content' in delta:
+                    delta_content = delta['content']
+                    delta_contents.setdefault(idx, []).append(delta_content)
+            else:
+                delta_contents[choice['index']] = [choice['message']['content']]
+
+    def __calculate_tokens_from_content(self, request, delta_contents):
+        input_tokens = output_tokens = 0
+        if self.tokenizer is not None:
             for idx, choice_contents in delta_contents.items():
-                full_response_content = ''.join(
+                full_response_content = ''.join(choice_contents)
                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                 output_tokens += len(self.tokenizer.encode(full_response_content))
-
-            input_tokens = 0
-            output_tokens = 0
+        else:
             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
         return input_tokens, output_tokens
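
For reference, a standalone sketch of the accounting order the reworked parse_responses follows: use the server-reported usage from the last response when present (non-stream replies, or the final chunk of a stream), otherwise accumulate the streamed delta contents and re-tokenize them. The function and variable names here are illustrative, not part of the plugin.

import json

def count_tokens(responses, tokenizer=None, prompt=''):
    # 1) Prefer the usage block of the last response.
    last = json.loads(responses[-1])
    if last.get('usage'):
        return last['usage']['prompt_tokens'], last['usage']['completion_tokens']
    # 2) Fallback: accumulate streamed/complete contents per choice index.
    contents = {}
    for raw in responses:
        for choice in json.loads(raw).get('choices', []):
            piece = choice.get('delta', {}).get('content') or choice.get('message', {}).get('content')
            if piece:
                contents.setdefault(choice.get('index', 0), []).append(piece)
    if tokenizer is None:
        return 0, 0  # mirrors the plugin's warning path when no tokenizer is configured
    input_tokens = sum(len(tokenizer.encode(prompt)) for _ in contents)
    output_tokens = sum(len(tokenizer.encode(''.join(parts))) for parts in contents.values())
    return input_tokens, output_tokens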
evalscope/perf/plugin/datasets/flickr8k.py
CHANGED
@@ -1,6 +1,5 @@
 import base64
 from io import BytesIO
-from modelscope.msdatasets import MsDataset
 from PIL import Image
 from typing import Any, Dict, Iterator, List

@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
         super().__init__(query_parameters)

     def build_messages(self) -> Iterator[List[Dict]]:
+        from modelscope.msdatasets import MsDataset
         dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')

         for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py
CHANGED
@@ -1,4 +1,3 @@
-from modelscope import MsDataset
 from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):

     def build_messages(self) -> Iterator[List[Dict]]:
         if not self.query_parameters.dataset_path:
+            from modelscope import MsDataset
             ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
         else:
             ds = self.dataset_json_list(self.query_parameters.dataset_path)
evalscope/perf/plugin/registry.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message