evalscope 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
- evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
- evalscope/benchmarks/ceval/samples.jsonl +1 -0
- evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
- evalscope/benchmarks/mmlu/samples.jsonl +5 -0
- evalscope/benchmarks/race/samples.jsonl +5 -0
- evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
- evalscope/cli/start_perf.py +8 -11
- evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
- evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
- evalscope/metrics/rouge_metric.py +30 -15
- evalscope/perf/arguments.py +179 -0
- evalscope/perf/benchmark.py +245 -0
- evalscope/perf/http_client.py +127 -711
- evalscope/perf/main.py +35 -0
- evalscope/perf/plugin/__init__.py +2 -0
- evalscope/perf/plugin/api/__init__.py +3 -0
- evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
- evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
- evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
- evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
- evalscope/perf/plugin/datasets/__init__.py +6 -0
- evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
- evalscope/perf/plugin/datasets/custom.py +21 -0
- evalscope/perf/plugin/datasets/flickr8k.py +51 -0
- evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
- evalscope/perf/plugin/datasets/longalpaca.py +28 -0
- evalscope/perf/plugin/datasets/openqa.py +38 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
- evalscope/perf/plugin/registry.py +54 -0
- evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
- evalscope/perf/utils/benchmark_util.py +135 -0
- evalscope/perf/utils/chat_service.py +252 -0
- evalscope/perf/utils/db_util.py +200 -0
- evalscope/perf/utils/handler.py +46 -0
- evalscope/perf/utils/local_server.py +139 -0
- evalscope/registry/config/cfg_arena.yaml +77 -0
- evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
- evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
- evalscope/registry/config/cfg_single.yaml +78 -0
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
- evalscope/registry/data/qa_browser/battle.jsonl +634 -0
- evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
- evalscope/registry/data/question.jsonl +80 -0
- evalscope/third_party/longbench_write/README.md +118 -0
- evalscope/third_party/longbench_write/default_task.json +27 -0
- evalscope/third_party/longbench_write/default_task.yaml +24 -0
- evalscope/third_party/toolbench_static/README.md +118 -0
- evalscope/third_party/toolbench_static/config_default.json +15 -0
- evalscope/third_party/toolbench_static/config_default.yaml +12 -0
- evalscope/third_party/toolbench_static/requirements.txt +2 -0
- evalscope/utils/logger.py +18 -20
- evalscope/utils/utils.py +41 -42
- evalscope/version.py +2 -2
- evalscope-0.7.0.dist-info/LICENSE +203 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/METADATA +91 -33
- {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/RECORD +99 -29
- {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
- {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
- tests/cli/__init__.py +1 -0
- tests/cli/test_run.py +76 -0
- tests/perf/__init__.py +1 -0
- tests/perf/test_perf.py +96 -0
- tests/rag/test_clip_benchmark.py +85 -0
- tests/rag/test_mteb.py +136 -0
- tests/rag/test_ragas.py +120 -0
- tests/swift/__init__.py +1 -0
- tests/swift/test_run_swift_eval.py +146 -0
- tests/swift/test_run_swift_vlm_eval.py +128 -0
- tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
- tests/test_run_all.py +12 -0
- tests/vlm/__init__.py +1 -0
- tests/vlm/test_vlmeval.py +59 -0
- evalscope/perf/_logging.py +0 -32
- evalscope/perf/datasets/longalpaca_12k.py +0 -20
- evalscope/perf/datasets/openqa.py +0 -22
- evalscope/perf/plugin_registry.py +0 -35
- evalscope/perf/query_parameters.py +0 -42
- evalscope/perf/server_sent_event.py +0 -43
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
- /evalscope/perf/{datasets → utils}/__init__.py +0 -0
- {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
- {evalscope/preprocess → tests}/__init__.py +0 -0
- {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
evalscope/perf/main.py
ADDED
@@ -0,0 +1,35 @@
+import asyncio
+import platform
+from argparse import Namespace
+
+from evalscope.perf.arguments import Arguments, parse_args
+from evalscope.perf.benchmark import benchmark
+from evalscope.perf.utils.handler import add_signal_handlers
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import seed_everything
+
+logger = get_logger()
+
+
+def run_perf_benchmark(args):
+    if isinstance(args, dict):
+        args = Arguments(**args)
+    elif isinstance(args, Namespace):
+        args = Arguments.from_args(args)
+    seed_everything(args.seed)
+
+    logger.info('Starting benchmark...')
+    logger.info(args)
+
+    if platform.system() == 'Windows':
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+    loop = asyncio.get_event_loop()
+    if platform.system() != 'Windows':
+        add_signal_handlers(loop)
+    loop.run_until_complete(benchmark(args))
+
+
+if __name__ == '__main__':
+    args = Arguments.from_args(parse_args())
+    run_perf_benchmark(args)
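Note: the new entry point accepts either an argparse Namespace or a plain dict, which it converts into Arguments. A minimal usage sketch follows; the 'api', 'url', 'dataset' and 'parallel' keys are assumptions inferred from the plugin names registered elsewhere in this diff, so check evalscope/perf/arguments.py for the actual field names.

    # Hedged sketch: drive the perf benchmark programmatically with a dict.
    from evalscope.perf.main import run_perf_benchmark

    task_cfg = {
        'model': 'qwen2-7b-instruct',                          # hypothetical model name
        'url': 'http://127.0.0.1:8000/v1/chat/completions',    # assumed Arguments field
        'api': 'openai',                                       # one of the registered api plugins
        'dataset': 'openqa',                                   # one of the registered dataset plugins
        'parallel': 1,                                         # assumed Arguments field
    }
    run_perf_benchmark(task_cfg)  # the dict is converted via Arguments(**task_cfg)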
evalscope/perf/{api_plugin_base.py → plugin/api/base.py}
RENAMED
@@ -1,14 +1,16 @@
 from abc import abstractmethod
 from typing import Any, Dict, List, Tuple
 
-from evalscope.perf.
+from evalscope.perf.arguments import Arguments
+
 
 class ApiPluginBase:
+
     def __init__(self, model_path: str) -> None:
         self.model_path = model_path
-
+
     @abstractmethod
-    def build_request(self, messages: List[Dict], param:
+    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
         """Build a api request body.
 
         Args:
@@ -22,39 +24,36 @@ class ApiPluginBase:
             Dict: The api request body.
         """
         raise NotImplementedError
-
+
     @abstractmethod
-    def parse_responses(self,
-                        responses: List,
-                        request: Any=None,
-                        **kwargs:Any) -> Tuple[int, int]:
+    def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
         """Parser responses and return number of request and response tokens.
 
         Args:
             responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, each is bytes, for general only one.
+                there are multiple responses, each is bytes, for general only one.
             request (Any): The request body.
 
         Returns:
             Tuple: (Number of prompt_tokens and number of completion_tokens).
         """
-        raise NotImplementedError
+        raise NotImplementedError
 
     @staticmethod
     def replace_values(input_json: Any, model: str, prompt: str):
-        if isinstance(input_json, dict):
+        if isinstance(input_json, dict):
             for key, value in input_json.items():
                 if isinstance(value, str):
-                    input_json[key] = value.replace(
-                else:
-                    ApiPluginBase.replace_values(value, model, prompt)
-        elif isinstance(input_json, list):
+                    input_json[key] = value.replace('%m', model).replace('%p', prompt)
+                else:
+                    ApiPluginBase.replace_values(value, model, prompt)
+        elif isinstance(input_json, list):
             for idx, item in enumerate(input_json):
                 if isinstance(item, str):
-                    input_json[idx] = item.replace(
+                    input_json[idx] = item.replace('%m', model).replace('%p', prompt)
                 else:
                     ApiPluginBase.replace_values(item, model, prompt)
         elif isinstance(input_json, str):
-            input_json = input_json.replace(
+            input_json = input_json.replace('%m', model).replace('%p', prompt)
         else:
-            pass
+            pass
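Note: ApiPluginBase.replace_values walks an arbitrary JSON-like structure and substitutes the '%m' and '%p' placeholders with the model name and the prompt, which is what the completed calls above now spell out. A small sketch of the behavior, with an illustrative template and assuming the package layout introduced in this diff:

    from evalscope.perf.plugin.api.base import ApiPluginBase

    # Illustrative template; '%m' and '%p' are the placeholders handled above.
    query = {'model': '%m', 'input': {'prompt': '%p'}, 'tags': ['%m']}
    ApiPluginBase.replace_values(query, model='my-model', prompt='Hello')
    print(query)
    # -> {'model': 'my-model', 'input': {'prompt': 'Hello'}, 'tags': ['my-model']}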
evalscope/perf/{custom_api.py → plugin/api/custom_api.py}
RENAMED
@@ -1,19 +1,26 @@
 from typing import Any, Dict, Iterator, List
+
 import json
-from evalscope.perf.api_plugin_base import ApiPluginBase
 from transformers import AutoTokenizer
-from evalscope.perf.plugin_registry import register_api
-from evalscope.perf.query_parameters import QueryParameters
 
-
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_api('custom')
 class CustomPlugin(ApiPluginBase):
     """Support tensorrt-llm triton server
     """
+
     def __init__(self, mode_path: str):
         """Init the plugin
 
         Args:
-            mode_path (str): The model path, we use the tokenizer
+            mode_path (str): The model path, we use the tokenizer
             weight in the model to calculate the number of the
             input and output tokens.
         """
@@ -23,12 +30,12 @@ class CustomPlugin(ApiPluginBase):
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict], param:
+    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
             message (Dict): The basic message to generator query.
-            param (
+            param (Arguments): The query parameters.
 
         Raises:
             Exception: NotImplemented
@@ -41,8 +48,8 @@ class CustomPlugin(ApiPluginBase):
             ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
             return query
         except Exception as e:
-
-
+            logger.exception(e)
+            logger.error('Prompt: %s invalidate!' % messages)
             return None
 
     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
@@ -53,7 +60,7 @@ class CustomPlugin(ApiPluginBase):
 
         Args:
             responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
+                there are multiple responses, for general only one.
             kwargs: (Any): The command line --parameter content.
         Returns:
             Tuple: Return number of prompt token and number of completion tokens.
@@ -63,15 +70,15 @@ class CustomPlugin(ApiPluginBase):
         input_tokens = None
         output_tokens = None
         for response in responses:
-
+            data = json.loads(response)
             # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
             # "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
-            if 'text_output' in
+            if 'text_output' in data:
                 if 0 in delta_contents:
-                    delta_contents[0].append(
+                    delta_contents[0].append(data['text_output'])
                 else:
-                    delta_contents[0] = [
-        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+                    delta_contents[0] = [data['text_output']]
+        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
             input_tokens = 0
             output_tokens = 0
             for _, choice_contents in delta_contents.items():
@@ -80,8 +87,7 @@ class CustomPlugin(ApiPluginBase):
                 output_tokens += len(self.tokenizer.encode(full_response_content))
         elif input_tokens is None and output_tokens is None:  # no usage info get.
             input_tokens = 0
-            output_tokens = 0
-
+            output_tokens = 0
+            logger.warning('No usage info get.')
+
         return input_tokens, output_tokens
-
-
evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py}
RENAMED
@@ -1,26 +1,30 @@
-
-from sys import maxsize
-import sys
+import os
 from typing import Any, Dict, Iterator, List
+
 import json
-from evalscope.perf.api_plugin_base import ApiPluginBase
 
-from evalscope.perf.
-from evalscope.perf.
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
 
-@register_api(
+@register_api('dashscope')
 class DashScopeApiPlugin(ApiPluginBase):
+
     def __init__(self, mode_path: str):
         """Init the plugin
 
         Args:
-            mode_path (str): The model path, we use the tokenizer
+            mode_path (str): The model path, we use the tokenizer
             weight in the model to calculate the number of the
             input and output tokens.
         """
         super().__init__(model_path=mode_path)
-
-    def build_request(self,messages: List[Dict], param:
+
+    def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
@@ -35,16 +39,26 @@ class DashScopeApiPlugin(ApiPluginBase):
         """
         try:
             if param.query_template is not None:
-
+                if param.query_template.startswith('@'):
+                    file_path = param.query_template[1:]
+                    if os.path.exists(file_path):
+                        with open(file_path, 'r') as file:
+                            query = json.load(file)
+                    else:
+                        raise FileNotFoundError(f'{file_path}')
+                else:
+                    query = json.loads(param.query_template)
+
                 query['input']['messages'] = messages  # replace template content with message.
                 return self.__compose_query_from_parameter(query, param)
             else:
                 query = {'messages': messages}
                 return self.__compose_query_from_parameter(query, param)
         except Exception as e:
-
+            logger.exception(e)
             return None
-
+
+    def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if 'parameters' not in payload:
             payload['parameters'] = {}
@@ -73,7 +87,7 @@ class DashScopeApiPlugin(ApiPluginBase):
 
         Args:
             responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
+                there are multiple responses, for general only one.
             kwargs: (Any): The command line --parameter content.
 
         Returns:
evalscope/perf/{openai_api.py → plugin/api/openai_api.py}
RENAMED
@@ -1,19 +1,26 @@
+import os
 from typing import Any, Dict, Iterator, List
+
 import json
-from evalscope.perf.api_plugin_base import ApiPluginBase
 from transformers import AutoTokenizer
-from evalscope.perf.plugin_registry import register_api
-from evalscope.perf.query_parameters import QueryParameters
 
-
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.api.base import ApiPluginBase
+from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+@register_api(['openai', 'local_vllm', 'local'])
 class OpenaiPlugin(ApiPluginBase):
-    """Base of openai interface.
-    """
+    """Base of openai interface."""
+
     def __init__(self, mode_path: str):
         """Init the plugin
 
         Args:
-            mode_path (str): The model path, we use the tokenizer
+            mode_path (str): The model path, we use the tokenizer
             weight in the model to calculate the number of the
             input and output tokens.
         """
@@ -23,11 +30,11 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None
 
-    def build_request(self, messages: List[Dict], param:
+    def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset
 
         Args:
-            message (Dict): The basic message to generator query.
+            message (List[Dict] | str): The basic message to generator query.
             param (QueryParameters): The query parameters.
 
         Raises:
@@ -38,22 +45,35 @@ class OpenaiPlugin(ApiPluginBase):
         """
         try:
             if param.query_template is not None:
-
+                if param.query_template.startswith('@'):
+                    file_path = param.query_template[1:]
+                    if os.path.exists(file_path):
+                        with open(file_path, 'r') as file:
+                            query = json.load(file)
+                    else:
+                        raise FileNotFoundError(f'{file_path}')
+                else:
+                    query = json.loads(param.query_template)
+
                 if 'stream' in query.keys():
                     param.stream = query['stream']
-
-
+                # replace template messages with input messages.
+                query['messages'] = messages
+            elif isinstance(messages, str):
+                query = {'prompt': messages}
             else:
                 query = {'messages': messages}
-
+            return self.__compose_query_from_parameter(query, param)
         except Exception as e:
-
+            logger.exception(e)
             return None
-
-    def __compose_query_from_parameter(self, payload: Dict, param:
+
+    def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
         payload['model'] = param.model
         if param.max_tokens is not None:
             payload['max_tokens'] = param.max_tokens
+        if param.min_tokens is not None:
+            payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
         if param.logprobs is not None:
@@ -66,7 +86,7 @@ class OpenaiPlugin(ApiPluginBase):
             payload['stop'] = param.stop
         if param.stream is not None and param.stream:
             payload['stream'] = param.stream
-            payload['stream_options'] = {
+            payload['stream_options'] = {'include_usage': True}
         if param.stop_token_ids is not None:
             payload['stop_token_ids'] = param.stop_token_ids
         if param.temperature is not None:
@@ -83,7 +103,7 @@ class OpenaiPlugin(ApiPluginBase):
 
         Args:
             responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
+                there are multiple responses, for general only one.
             kwargs: (Any): The command line --parameter content.
         Returns:
             Tuple: Return number of prompt token and number of completion tokens.
@@ -96,10 +116,15 @@ class OpenaiPlugin(ApiPluginBase):
             js = json.loads(response)
             if js['object'] == 'chat.completion':
                 for choice in js['choices']:
-                    delta_contents[choice['index']] = [choice['message']['content']]
+                    delta_contents[choice['index']] = [choice['message']['content']]
+                input_tokens = js['usage']['prompt_tokens']
+                output_tokens = js['usage']['completion_tokens']
+            elif js['object'] == 'text_completion':
+                for choice in js['choices']:
+                    delta_contents[choice['index']] = [choice['text']]
                 input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-
+                output_tokens = js['usage']['completion_tokens']
+            elif js['object'] == 'chat.completion.chunk':
                 if 'choices' in js:
                     for choice in js['choices']:
                         if 'delta' in choice and 'index' in choice:
@@ -115,8 +140,8 @@ class OpenaiPlugin(ApiPluginBase):
             # "choices":[],"usage":{"prompt_tokens":32,"total_tokens":384,"completion_tokens":352}}
             if 'usage' in js and js['usage']:
                 input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+                output_tokens = js['usage']['completion_tokens']
+        if (input_tokens is None and output_tokens is None and self.tokenizer is not None):
            input_tokens = 0
            output_tokens = 0
            for idx, choice_contents in delta_contents.items():
@@ -125,8 +150,7 @@ class OpenaiPlugin(ApiPluginBase):
                 output_tokens += len(self.tokenizer.encode(full_response_content))
         elif input_tokens is None and output_tokens is None:  # no usage info get.
             input_tokens = 0
-            output_tokens = 0
-
+            output_tokens = 0
+            logger.warning('No usage info get.')
+
         return input_tokens, output_tokens
-
-
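Note: both the OpenAI and DashScope plugins now accept a query template that is either inline JSON or, when prefixed with '@', a path to a JSON file. The sketch below mirrors the template-resolution logic added to build_request above; the template contents and the 'chat_template.json' file name are illustrative, not part of the diff.

    import json
    import os

    # A value starting with '@' is treated as a path to a JSON file;
    # otherwise the string itself is parsed as JSON.
    query_template = '{"model": "dummy", "stream": true, "messages": []}'

    if query_template.startswith('@'):
        file_path = query_template[1:]          # e.g. '@chat_template.json' -> 'chat_template.json'
        if os.path.exists(file_path):
            with open(file_path, 'r') as f:
                query = json.load(f)
        else:
            raise FileNotFoundError(f'{file_path}')
    else:
        query = json.loads(query_template)

    # The plugin then replaces query['messages'] with the dataset messages
    # and fills model, max_tokens, stream, etc. from the Arguments object.
    print(query)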
evalscope/perf/plugin/datasets/__init__.py
ADDED
@@ -0,0 +1,6 @@
+from evalscope.perf.plugin.datasets.custom import CustomDatasetPlugin
+from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
+from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
+from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
+from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py}
RENAMED
@@ -1,12 +1,15 @@
-from abc import abstractmethod
 import sys
+from abc import abstractmethod
 from typing import Any, Dict, Iterator, List, Tuple
+
 import json
 
-from evalscope.perf.
+from evalscope.perf.arguments import Arguments
+
 
 class DatasetPluginBase:
-
+
+    def __init__(self, query_parameters: Arguments):
         """Build data set plugin
 
         Args:
@@ -21,9 +24,9 @@ class DatasetPluginBase:
 
     def __iter__(self):
         return self.build_messages()
-
+
     @abstractmethod
-    def build_messages(self)->Iterator[List[Dict]]:
+    def build_messages(self) -> Iterator[List[Dict]]:
         """Build the request.
 
         Raises:
@@ -33,8 +36,8 @@ class DatasetPluginBase:
             Iterator[List[Dict]]: Yield request messages.
         """
         raise NotImplementedError
-
-    def dataset_line_by_line(self, dataset: str)->Iterator[str]:
+
+    def dataset_line_by_line(self, dataset: str) -> Iterator[str]:
         """Get content line by line of dataset.
 
         Args:
@@ -46,8 +49,8 @@ class DatasetPluginBase:
         with open(dataset, 'r', encoding='utf-8') as f:
             for line in f:
                 yield line
-
-    def dataset_json_list(self, dataset: str)->Iterator[Dict]:
+
+    def dataset_json_list(self, dataset: str) -> Iterator[Dict]:
         """Read data from file which is list of requests.
         Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
 
@@ -61,4 +64,4 @@ class DatasetPluginBase:
             content = f.read()
             data = json.loads(content)
             for item in data:
-                yield item
+                yield item
evalscope/perf/plugin/datasets/custom.py
ADDED
@@ -0,0 +1,21 @@
+from typing import Dict, Iterator, List
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('custom')
+class CustomDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+            prompt = item.strip()
+            if len(prompt) > self.query_parameters.min_prompt_length and len(
+                    prompt) < self.query_parameters.max_prompt_length:
+                yield [{'role': 'user', 'content': prompt}]
evalscope/perf/plugin/datasets/flickr8k.py
ADDED
@@ -0,0 +1,51 @@
+import base64
+from io import BytesIO
+from typing import Any, Dict, Iterator, List
+
+from modelscope.msdatasets import MsDataset
+from PIL import Image
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+def PIL_to_base64(image: Image.Image) -> str:
+    buffered = BytesIO()
+    image.save(buffered, format='JPEG')
+    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+    return img_str
+
+
+@register_dataset('flickr8k')
+class FlickrDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://www.modelscope.cn/datasets/clip-benchmark/wds_flickr8k/files
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')
+
+        for item in dataset:
+            pil_image = item['jpg']
+            base64_iamge = PIL_to_base64(pil_image)
+
+            yield [{
+                'role':
+                'user',
+                'content': [
+                    {
+                        'type': 'text',
+                        'text': 'Describe the image'
+                    },
+                    {
+                        'type': 'image_url',
+                        'image_url': {
+                            'url': f'data:image/jpeg;base64,{base64_iamge}',
+                        }
+                    },
+                ],
+            }]
evalscope/perf/{datasets → plugin/datasets}/line_by_line.py
RENAMED
@@ -1,18 +1,22 @@
 import sys
 from typing import Dict, Iterator, List
-
-from evalscope.perf.
-from evalscope.perf.
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
 
 @register_dataset('line_by_line')
 class LineByLineDatasetPlugin(DatasetPluginBase):
     """Read dataset and return prompt.
     """
-
+
+    def __init__(self, query_parameters: Arguments):
         super().__init__(query_parameters)
 
     def build_messages(self) -> Iterator[List[Dict]]:
         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
             prompt = item.strip()
-            if len(prompt) > self.query_parameters.min_prompt_length and len(
+            if len(prompt) > self.query_parameters.min_prompt_length and len(
+                    prompt) < self.query_parameters.max_prompt_length:
                 yield [{'role': 'user', 'content': prompt}]
evalscope/perf/plugin/datasets/longalpaca.py
ADDED
@@ -0,0 +1,28 @@
+from typing import Any, Dict, Iterator, List
+
+from modelscope import MsDataset
+
+from evalscope.perf.arguments import Arguments
+from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+from evalscope.perf.plugin.registry import register_dataset
+
+
+@register_dataset('longalpaca')
+class LongAlpacaDatasetPlugin(DatasetPluginBase):
+    """Read data from file which is list of requests.
+    Sample: https://www.modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/files
+    """
+
+    def __init__(self, query_parameters: Arguments):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        if not self.query_parameters.dataset_path:
+            ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
+        else:
+            ds = self.dataset_json_list(self.query_parameters.dataset_path)
+        for item in ds:
+            prompt = item['instruction'].strip()
+            if len(prompt) > self.query_parameters.min_prompt_length and len(
+                    prompt) < self.query_parameters.max_prompt_length:
+                yield [{'role': 'user', 'content': prompt}]