evalscope 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0

evalscope/perf/__init__.py
File without changes

evalscope/perf/_logging.py
@@ -0,0 +1,32 @@
+import logging
+import os
+
+
+logger = logging.getLogger('perf')
+
+
+def enable_logging():
+    level = os.environ.get('LOGGING_LEVEL', 'info')
+    if level is not None:  # set logging level.
+        if level not in ['info', 'debug']:
+            # set logging level env, but invalid value, use default.
+            level = 'info'
+        if level == 'info':
+            logger.setLevel(logging.INFO)
+        else:
+            logger.setLevel(logging.DEBUG)
+    # set default logging handler
+    console_handler = logging.StreamHandler()
+    formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(filename)s - %(funcName)s - %(lineno)d - %(levelname)s - %(message)s' # noqa E501
+    )
+    #formatter = logging.Formatter(
+    #    '%(asctime)s - %(name)s - %(levelname)s - %(message)s' # noqa E501
+    #)
+    console_handler.setFormatter(formatter)
+    logger.addHandler(console_handler)
+
+
+# in release disable dashscope log
+# you can enable dashscope log for debugger.
+enable_logging()
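
The 'perf' logger above is configured at import time from the LOGGING_LEVEL environment variable (only 'info' and 'debug' are honoured). A minimal sketch of how that might be exercised; the variable name comes from the code above, the rest is illustrative:

```python
import os

# Must be set before the module is imported, since enable_logging() runs at import time.
os.environ['LOGGING_LEVEL'] = 'debug'

import logging
from evalscope.perf import _logging  # noqa: F401  # importing triggers enable_logging()

logging.getLogger('perf').debug('perf logger now emits DEBUG records')
```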

evalscope/perf/api_plugin_base.py
@@ -0,0 +1,60 @@
+from abc import abstractmethod
+from typing import Any, Dict, List, Tuple
+
+from evalscope.perf.query_parameters import QueryParameters
+
+class ApiPluginBase:
+    def __init__(self, model_path: str) -> None:
+        self.model_path = model_path
+
+    @abstractmethod
+    def build_request(self, messages: List[Dict], param: QueryParameters)->Dict:
+        """Build a api request body.
+
+        Args:
+            messages (List[Dict]): The messages generated by dataset.
+            param (QueryParameters): The query parameters.
+
+        Raises:
+            NotImplementedError: Not implemented.
+
+        Returns:
+            Dict: The api request body.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def parse_responses(self,
+                        responses: List,
+                        request: Any=None,
+                        **kwargs:Any) -> Tuple[int, int]:
+        """Parser responses and return number of request and response tokens.
+
+        Args:
+            responses (List[bytes]): List of http response body, for stream output,
+                there are multiple responses, each is bytes, for general only one.
+            request (Any): The request body.
+
+        Returns:
+            Tuple: (Number of prompt_tokens and number of completion_tokens).
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    def replace_values(input_json: Any, model: str, prompt: str):
+        if isinstance(input_json, dict):
+            for key, value in input_json.items():
+                if isinstance(value, str):
+                    input_json[key] = value.replace("%m", model).replace("%p", prompt)
+                else:
+                    ApiPluginBase.replace_values(value, model, prompt)
+        elif isinstance(input_json, list):
+            for idx, item in enumerate(input_json):
+                if isinstance(item, str):
+                    input_json[idx] = item.replace("%m", model).replace("%p", prompt)
+                else:
+                    ApiPluginBase.replace_values(item, model, prompt)
+        elif isinstance(input_json, str):
+            input_json = input_json.replace("%m", model).replace("%p", prompt)
+        else:
+            pass
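
ApiPluginBase.replace_values walks a JSON-like structure in place and substitutes the %m (model) and %p (prompt) placeholders; API plugins use it to turn the query template string (QueryParameters.query_template) into a concrete request body. A small sketch of that substitution; the template fields other than the placeholders are illustrative:

```python
import json

from evalscope.perf.api_plugin_base import ApiPluginBase

# Hypothetical query template; only %m and %p are meaningful to replace_values.
template = '{"model": "%m", "text_input": "%p", "parameters": {"stream": true}}'

query = json.loads(template)
ApiPluginBase.replace_values(query, model='my-model', prompt='Hello, world!')
print(query)
# {'model': 'my-model', 'text_input': 'Hello, world!', 'parameters': {'stream': True}}
```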

evalscope/perf/custom_api.py
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Iterator, List
+import json
+from evalscope.perf.api_plugin_base import ApiPluginBase
+from transformers import AutoTokenizer
+from evalscope.perf.plugin_registry import register_api
+from evalscope.perf.query_parameters import QueryParameters
+
+@register_api("custom")
+class CustomPlugin(ApiPluginBase):
+    """Support tensorrt-llm triton server
+    """
+    def __init__(self, mode_path: str):
+        """Init the plugin
+
+        Args:
+            mode_path (str): The model path, we use the tokenizer
+                weight in the model to calculate the number of the
+                input and output tokens.
+        """
+        super().__init__(model_path=mode_path)
+        if mode_path is not None:
+            self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
+        else:
+            self.tokenizer = None
+
+    def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+        """Build the openai format request based on prompt, dataset
+
+        Args:
+            message (Dict): The basic message to generator query.
+            param (QueryParameters): The query parameters.
+
+        Raises:
+            Exception: NotImplemented
+
+        Returns:
+            Dict: The request body. None if prompt format is error.
+        """
+        try:
+            query = json.loads(param.query_template)
+            ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
+            return query
+        except Exception as e:
+            print(e)
+            print('Prompt: %s invalidate!'%messages)
+            return None
+
+    def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
+        """Parser responses and return number of request and response tokens.
+        sample of the output delta:
+        {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
+
+
+        Args:
+            responses (List[bytes]): List of http response body, for stream output,
+                there are multiple responses, for general only one.
+            kwargs: (Any): The command line --parameter content.
+        Returns:
+            Tuple: Return number of prompt token and number of completion tokens.
+        """
+        full_response_content = ''
+        delta_contents = {}
+        input_tokens = None
+        output_tokens = None
+        for response in responses:
+            js = json.loads(response)
+            # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
+            # "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
+            if 'text_output' in js:
+                if 0 in delta_contents:
+                    delta_contents[0].append(js['text_output'])
+                else:
+                    delta_contents[0] = [js['text_output']]
+        if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+            input_tokens = 0
+            output_tokens = 0
+            for _, choice_contents in delta_contents.items():
+                full_response_content = ''.join([m for m in choice_contents])
+                input_tokens += len(self.tokenizer.encode(request['text_input']))
+                output_tokens += len(self.tokenizer.encode(full_response_content))
+        elif input_tokens is None and output_tokens is None:  # no usage info get.
+            input_tokens = 0
+            output_tokens = 0
+
+        return input_tokens, output_tokens
+
+
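
A rough sketch of the token accounting parse_responses performs on a triton-style stream. Everything here is illustrative: 'gpt2' merely stands in for the model directory whose tokenizer would normally be passed in, and the responses mimic the text_output chunks shown in the comment above.

```python
import json

from evalscope.perf.custom_api import CustomPlugin

plugin = CustomPlugin('gpt2')  # stand-in tokenizer; normally the served model's path

request = {'text_input': 'Say hello', 'parameters': {'stream': True}}
responses = [
    json.dumps({'text_output': 'Hello'}),
    json.dumps({'text_output': ' there!'}),
]

# Counts prompt tokens from request['text_input'] and completion tokens from the
# concatenated text_output chunks.
prompt_tokens, completion_tokens = plugin.parse_responses(responses, request=request)
print(prompt_tokens, completion_tokens)
```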

evalscope/perf/dashscope_api.py
@@ -0,0 +1,84 @@
+
+from sys import maxsize
+import sys
+from typing import Any, Dict, Iterator, List
+import json
+from evalscope.perf.api_plugin_base import ApiPluginBase
+
+from evalscope.perf.plugin_registry import register_api
+from evalscope.perf.query_parameters import QueryParameters
+
+@register_api("dashscope")
+class DashScopeApiPlugin(ApiPluginBase):
+    def __init__(self, mode_path: str):
+        """Init the plugin
+
+        Args:
+            mode_path (str): The model path, we use the tokenizer
+                weight in the model to calculate the number of the
+                input and output tokens.
+        """
+        super().__init__(model_path=mode_path)
+
+    def build_request(self,messages: List[Dict], param: QueryParameters) -> Dict:
+        """Build the openai format request based on prompt, dataset
+
+        Args:
+            messages (List[Dict]): The basic message to generator query.
+            param (QueryParameters): The query parameters.
+
+        Raises:
+            Exception: NotImplemented
+
+        Returns:
+            Dict: The request body. None if prompt format is error.
+        """
+        try:
+            if param.query_template is not None:
+                query = json.loads(param.query_template)
+                query['input']['messages'] = messages  # replace template content with message.
+                return self.__compose_query_from_parameter(query, param)
+            else:
+                query = {'messages': messages}
+                return self.__compose_query_from_parameter(query, param)
+        except Exception as e:
+            print(e)
+            return None
+    def __compose_query_from_parameter(self, payload: Dict, param: QueryParameters):
+        payload['model'] = param.model
+        if 'parameters' not in payload:
+            payload['parameters'] = {}
+        if param.max_tokens is not None:
+            payload['parameters']['max_tokens'] = param.max_tokens
+        if param.frequency_penalty is not None:
+            payload['parameters']['frequency_penalty'] = param.frequency_penalty
+        if param.logprobs is not None:
+            payload['parameters']['logprobs'] = param.logprobs
+        if param.n_choices is not None:
+            payload['parameters']['n'] = param.n_choices
+        if param.seed is not None:
+            payload['parameters']['seed'] = param.seed
+        if param.stop is not None:
+            payload['parameters']['stop'] = param.stop
+        if param.stream is not None and not param.stream:
+            payload['parameters']['stream'] = param.stream
+        if param.temperature is not None:
+            payload['parameters']['temperature'] = param.temperature
+        if param.top_p is not None:
+            payload['parameters']['top_p'] = param.top_p
+        return payload
+
+    def parse_responses(self, responses, **kwargs) -> Dict:
+        """Parser responses and return number of request and response tokens.
+
+        Args:
+            responses (List[bytes]): List of http response body, for stream output,
+                there are multiple responses, for general only one.
+            kwargs: (Any): The command line --parameter content.
+
+        Returns:
+            Tuple: Return number of prompt token and number of completion tokens.
+        """
+        last_response = responses[-1]
+        js = json.loads(last_response)
+        return js['usage']['input_tokens'], js['usage']['output_tokens']
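
When a query template with an 'input' section is supplied, the body this plugin ends up sending follows the DashScope convention: model at the top level, messages under input, sampling options under parameters. The field names below come from the code above; the values are made up.

```python
# Illustrative request body only; not produced by running the plugin here.
body = {
    'model': 'qwen-turbo',
    'input': {
        'messages': [{'role': 'user', 'content': 'Hello'}],
    },
    'parameters': {
        'max_tokens': 512,
        'temperature': 0.7,
        'top_p': 0.8,
    },
}
```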

evalscope/perf/dataset_plugin_base.py
@@ -0,0 +1,64 @@
+from abc import abstractmethod
+import sys
+from typing import Any, Dict, Iterator, List, Tuple
+import json
+
+from evalscope.perf.query_parameters import QueryParameters
+
+class DatasetPluginBase:
+    def __init__(self, query_parameters: QueryParameters):
+        """Build data set plugin
+
+        Args:
+            dataset_path (str, optional): The input dataset path. Defaults to None.
+        """
+        self.query_parameters = query_parameters
+
+    def __next__(self):
+        for item in self.build_messages():
+            yield item
+        raise StopIteration
+
+    def __iter__(self):
+        return self.build_messages()
+
+    @abstractmethod
+    def build_messages(self)->Iterator[List[Dict]]:
+        """Build the request.
+
+        Raises:
+            NotImplementedError: The request is not impletion.
+
+        Yields:
+            Iterator[List[Dict]]: Yield request messages.
+        """
+        raise NotImplementedError
+
+    def dataset_line_by_line(self, dataset: str)->Iterator[str]:
+        """Get content line by line of dataset.
+
+        Args:
+            dataset (str): The dataset path.
+
+        Yields:
+            Iterator[str]: Each line of file.
+        """
+        with open(dataset, 'r', encoding='utf-8') as f:
+            for line in f:
+                yield line
+
+    def dataset_json_list(self, dataset: str)->Iterator[Dict]:
+        """Read data from file which is list of requests.
+        Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
+
+        Args:
+            dataset (str): The dataset path.
+
+        Yields:
+            Iterator[Dict]: The each request object.
+        """
+        with open(dataset, 'r', encoding='utf-8') as f:
+            content = f.read()
+            data = json.loads(content)
+            for item in data:
+                yield item

evalscope/perf/datasets/__init__.py
File without changes

evalscope/perf/datasets/line_by_line.py
@@ -0,0 +1,18 @@
+import sys
+from typing import Dict, Iterator, List
+from evalscope.perf.dataset_plugin_base import DatasetPluginBase
+from evalscope.perf.plugin_registry import register_dataset
+from evalscope.perf.query_parameters import QueryParameters
+
+@register_dataset('line_by_line')
+class LineByLineDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    """
+    def __init__(self, query_parameters: QueryParameters):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+            prompt = item.strip()
+            if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+                yield [{'role': 'user', 'content': prompt}]

evalscope/perf/datasets/longalpaca_12k.py
@@ -0,0 +1,20 @@
+import sys
+from typing import Any, Dict, Iterator, List
+from evalscope.perf.dataset_plugin_base import DatasetPluginBase
+
+from evalscope.perf.plugin_registry import register_dataset
+from evalscope.perf.query_parameters import QueryParameters
+
+@register_dataset('longalpaca')
+class LongAlpacaDatasetPlugin(DatasetPluginBase):
+    """Read data from file which is list of requests.
+    Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
+    """
+    def __init__(self, query_parameters: QueryParameters):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        for item in self.dataset_json_list(self.query_parameters.dataset_path):
+            prompt = item['instruction'].strip()
+            if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+                yield [{'role': 'user', 'content': prompt}]

evalscope/perf/datasets/openqa.py
@@ -0,0 +1,22 @@
+from sys import maxsize
+import sys
+from typing import Any, Dict, Iterator, List
+import json
+from evalscope.perf.dataset_plugin_base import DatasetPluginBase
+from evalscope.perf.plugin_registry import register_dataset
+from evalscope.perf.query_parameters import QueryParameters
+
+@register_dataset('openqa')
+class OpenqaDatasetPlugin(DatasetPluginBase):
+    """Read dataset and return prompt.
+    Datasets: https://huggingface.co/datasets/Hello-SimpleAI/HC3-Chinese/blob/main/open_qa.jsonl
+    """
+    def __init__(self, query_parameters: QueryParameters):
+        super().__init__(query_parameters)
+
+    def build_messages(self) -> Iterator[List[Dict]]:
+        for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+            item = json.loads(item)
+            prompt = item['question'].strip()
+            if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+                yield [{'role': 'user', 'content': prompt}]
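
The three dataset plugins above differ only in how they pull a prompt out of each record, and all of them skip prompts whose length falls outside (min_prompt_length, max_prompt_length). Roughly, the inputs they expect look like this; only the field actually read by each plugin matters, the extra fields mirror the referenced public datasets and are assumptions:

```python
# 'line_by_line': a plain-text file, one prompt per line.
line_by_line_record = 'What is the capital of France?'

# 'longalpaca': a JSON file holding a list of objects; only 'instruction' is read.
longalpaca_record = {'instruction': 'Summarize the following paper ...', 'output': '...'}

# 'openqa': a .jsonl file, one JSON object per line; only 'question' is read.
openqa_record = {'question': '什么是机器学习?', 'human_answers': ['...'], 'chatgpt_answers': ['...']}
```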

evalscope/perf/how_to_analysis_result.py
@@ -0,0 +1,24 @@
+import sqlite3
+import base64
+import pickle
+import json
+result_db_path = 'db_name.db'
+con = sqlite3.connect(result_db_path)
+query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
+             FROM result WHERE success='True'"
+# how to save base64.b64encode(pickle.dumps(benchmark_data["request"])).decode("ascii"),
+with con:
+    rows = con.execute(query_sql).fetchall()
+    if len(rows) > 0:
+        for row in rows:
+            request = row[0]
+            responses = row[1]
+            request = base64.b64decode(request)
+            request = pickle.loads(request)
+            responses = base64.b64decode(responses)
+            responses = pickle.loads(responses)
+            response_content = ''
+            for response in responses:
+                response = json.loads(response)
+                response_content += response['choices'][0]['delta']['content']
+            print('prompt: %s, tokens: %s, completion: %s, tokens: %s' % (request['messages'][0]['content'], row[2], response_content, row[3]))
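
This script assumes the stored responses are streamed chat.completion.chunk objects, so it concatenates choices[0].delta.content. If a benchmark were run without streaming, each stored response would be a full completion object; a hedged sketch of that variant of the inner loop, assuming the same table layout:

```python
# Non-stream variant (assumption): read the full message content instead of deltas.
for response in responses:
    response = json.loads(response)
    response_content += response['choices'][0]['message']['content']
```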