evalscope 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  3. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  22. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  23. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  24. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  25. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  26. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  27. evalscope/benchmarks/race/samples.jsonl +5 -0
  28. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  29. evalscope/cli/start_perf.py +8 -11
  30. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  31. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  32. evalscope/metrics/rouge_metric.py +30 -15
  33. evalscope/perf/arguments.py +179 -0
  34. evalscope/perf/benchmark.py +245 -0
  35. evalscope/perf/http_client.py +127 -711
  36. evalscope/perf/main.py +35 -0
  37. evalscope/perf/plugin/__init__.py +2 -0
  38. evalscope/perf/plugin/api/__init__.py +3 -0
  39. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  40. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  41. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  42. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  43. evalscope/perf/plugin/datasets/__init__.py +6 -0
  44. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  45. evalscope/perf/plugin/datasets/custom.py +21 -0
  46. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  47. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  48. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  49. evalscope/perf/plugin/datasets/openqa.py +38 -0
  50. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  51. evalscope/perf/plugin/registry.py +54 -0
  52. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  53. evalscope/perf/utils/benchmark_util.py +135 -0
  54. evalscope/perf/utils/chat_service.py +252 -0
  55. evalscope/perf/utils/db_util.py +200 -0
  56. evalscope/perf/utils/handler.py +46 -0
  57. evalscope/perf/utils/local_server.py +139 -0
  58. evalscope/registry/config/cfg_arena.yaml +77 -0
  59. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  60. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  61. evalscope/registry/config/cfg_single.yaml +78 -0
  62. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  63. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  64. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  65. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  66. evalscope/registry/data/question.jsonl +80 -0
  67. evalscope/third_party/longbench_write/README.md +118 -0
  68. evalscope/third_party/longbench_write/default_task.json +27 -0
  69. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  70. evalscope/third_party/toolbench_static/README.md +118 -0
  71. evalscope/third_party/toolbench_static/config_default.json +15 -0
  72. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  73. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  74. evalscope/utils/logger.py +18 -20
  75. evalscope/utils/utils.py +41 -42
  76. evalscope/version.py +2 -2
  77. evalscope-0.7.0.dist-info/LICENSE +203 -0
  78. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/METADATA +91 -33
  79. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/RECORD +99 -29
  80. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  81. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  82. tests/cli/__init__.py +1 -0
  83. tests/cli/test_run.py +76 -0
  84. tests/perf/__init__.py +1 -0
  85. tests/perf/test_perf.py +96 -0
  86. tests/rag/test_clip_benchmark.py +85 -0
  87. tests/rag/test_mteb.py +136 -0
  88. tests/rag/test_ragas.py +120 -0
  89. tests/swift/__init__.py +1 -0
  90. tests/swift/test_run_swift_eval.py +146 -0
  91. tests/swift/test_run_swift_vlm_eval.py +128 -0
  92. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  93. tests/test_run_all.py +12 -0
  94. tests/vlm/__init__.py +1 -0
  95. tests/vlm/test_vlmeval.py +59 -0
  96. evalscope/perf/_logging.py +0 -32
  97. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  98. evalscope/perf/datasets/openqa.py +0 -22
  99. evalscope/perf/plugin_registry.py +0 -35
  100. evalscope/perf/query_parameters.py +0 -42
  101. evalscope/perf/server_sent_event.py +0 -43
  102. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  103. /evalscope/perf/{datasets → utils}/__init__.py +0 -0
  104. {evalscope-0.6.1.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  105. {evalscope/preprocess → tests}/__init__.py +0 -0
  106. {evalscope/preprocess/tokenizers → tests/rag}/__init__.py +0 -0
evalscope/perf/main.py ADDED
@@ -0,0 +1,35 @@
+ import asyncio
+ import platform
+ from argparse import Namespace
+
+ from evalscope.perf.arguments import Arguments, parse_args
+ from evalscope.perf.benchmark import benchmark
+ from evalscope.perf.utils.handler import add_signal_handlers
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import seed_everything
+
+ logger = get_logger()
+
+
+ def run_perf_benchmark(args):
+     if isinstance(args, dict):
+         args = Arguments(**args)
+     elif isinstance(args, Namespace):
+         args = Arguments.from_args(args)
+     seed_everything(args.seed)
+
+     logger.info('Starting benchmark...')
+     logger.info(args)
+
+     if platform.system() == 'Windows':
+         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+
+     loop = asyncio.get_event_loop()
+     if platform.system() != 'Windows':
+         add_signal_handlers(loop)
+     loop.run_until_complete(benchmark(args))
+
+
+ if __name__ == '__main__':
+     args = Arguments.from_args(parse_args())
+     run_perf_benchmark(args)
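
Note: run_perf_benchmark accepts a plain dict, an argparse Namespace, or an Arguments instance. A minimal sketch of driving it from Python against an OpenAI-compatible endpoint follows; the field names below (model, url, api, dataset, parallel, number) are assumptions about the perf arguments and should be checked against evalscope/perf/arguments.py:

    # Hedged sketch: argument names are assumptions, not verified against arguments.py.
    from evalscope.perf.main import run_perf_benchmark

    task_cfg = {
        'model': 'qwen2-7b-instruct',                        # model name placed into each request
        'url': 'http://127.0.0.1:8000/v1/chat/completions',  # OpenAI-compatible endpoint
        'api': 'openai',                                     # selects OpenaiPlugin from the api registry
        'dataset': 'openqa',                                 # selects a dataset plugin from the dataset registry
        'parallel': 4,                                       # assumed concurrency option
        'number': 20,                                        # assumed total request count
    }
    run_perf_benchmark(task_cfg)  # the dict is converted to Arguments(**task_cfg) before benchmark() runs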
evalscope/perf/plugin/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from evalscope.perf.plugin.api import *
+ from evalscope.perf.plugin.datasets import *
evalscope/perf/plugin/api/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from evalscope.perf.plugin.api.custom_api import CustomPlugin
+ from evalscope.perf.plugin.api.dashscope_api import DashScopeApiPlugin
+ from evalscope.perf.plugin.api.openai_api import OpenaiPlugin
evalscope/perf/{api_plugin_base.py → plugin/api/base.py} RENAMED
@@ -1,14 +1,16 @@
  from abc import abstractmethod
  from typing import Any, Dict, List, Tuple
 
- from evalscope.perf.query_parameters import QueryParameters
+ from evalscope.perf.arguments import Arguments
+
 
  class ApiPluginBase:
+
      def __init__(self, model_path: str) -> None:
          self.model_path = model_path
-
+
      @abstractmethod
-     def build_request(self, messages: List[Dict], param: QueryParameters)->Dict:
+     def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
          """Build a api request body.
 
          Args:
@@ -22,39 +24,36 @@ class ApiPluginBase:
              Dict: The api request body.
          """
          raise NotImplementedError
-
+
      @abstractmethod
-     def parse_responses(self,
-                         responses: List,
-                         request: Any=None,
-                         **kwargs:Any) -> Tuple[int, int]:
+     def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
          """Parser responses and return number of request and response tokens.
 
          Args:
              responses (List[bytes]): List of http response body, for stream output,
-             there are multiple responses, each is bytes, for general only one.
+                 there are multiple responses, each is bytes, for general only one.
              request (Any): The request body.
 
          Returns:
              Tuple: (Number of prompt_tokens and number of completion_tokens).
          """
-         raise NotImplementedError
+         raise NotImplementedError
 
      @staticmethod
      def replace_values(input_json: Any, model: str, prompt: str):
-         if isinstance(input_json, dict):
+         if isinstance(input_json, dict):
              for key, value in input_json.items():
                  if isinstance(value, str):
-                     input_json[key] = value.replace("%m", model).replace("%p", prompt)
-                 else:
-                     ApiPluginBase.replace_values(value, model, prompt)
-         elif isinstance(input_json, list):
+                     input_json[key] = value.replace('%m', model).replace('%p', prompt)
+                 else:
+                     ApiPluginBase.replace_values(value, model, prompt)
+         elif isinstance(input_json, list):
              for idx, item in enumerate(input_json):
                  if isinstance(item, str):
-                     input_json[idx] = item.replace("%m", model).replace("%p", prompt)
+                     input_json[idx] = item.replace('%m', model).replace('%p', prompt)
                  else:
                      ApiPluginBase.replace_values(item, model, prompt)
          elif isinstance(input_json, str):
-             input_json = input_json.replace("%m", model).replace("%p", prompt)
+             input_json = input_json.replace('%m', model).replace('%p', prompt)
          else:
-             pass
+             pass
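
Note: ApiPluginBase defines the two hooks a request plugin must implement, build_request and parse_responses, plus the %m/%p placeholder helper replace_values. A minimal sketch of a third-party plugin on the new module layout follows; the registry name 'my_api' and the fixed token counts are illustrative only:

    from typing import Any, Dict, List, Tuple

    from evalscope.perf.arguments import Arguments
    from evalscope.perf.plugin.api.base import ApiPluginBase
    from evalscope.perf.plugin.registry import register_api


    @register_api('my_api')  # illustrative registry name
    class MyApiPlugin(ApiPluginBase):

        def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
            # A JSON template could use %m / %p and be filled via ApiPluginBase.replace_values().
            return {'model': param.model, 'messages': messages}

        def parse_responses(self, responses: List, request: Any = None, **kwargs: Any) -> Tuple[int, int]:
            # Should return (prompt_tokens, completion_tokens); fixed zeros keep the sketch short.
            return 0, 0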
evalscope/perf/{custom_api.py → plugin/api/custom_api.py} RENAMED
@@ -1,19 +1,26 @@
  from typing import Any, Dict, Iterator, List
+
  import json
- from evalscope.perf.api_plugin_base import ApiPluginBase
  from transformers import AutoTokenizer
- from evalscope.perf.plugin_registry import register_api
- from evalscope.perf.query_parameters import QueryParameters
 
- @register_api("custom")
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.api.base import ApiPluginBase
+ from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_api('custom')
  class CustomPlugin(ApiPluginBase):
      """Support tensorrt-llm triton server
      """
+
      def __init__(self, mode_path: str):
          """Init the plugin
 
          Args:
-             mode_path (str): The model path, we use the tokenizer
+             mode_path (str): The model path, we use the tokenizer
              weight in the model to calculate the number of the
              input and output tokens.
          """
@@ -23,12 +30,12 @@ class CustomPlugin(ApiPluginBase):
          else:
              self.tokenizer = None
 
-     def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+     def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
          """Build the openai format request based on prompt, dataset
 
          Args:
              message (Dict): The basic message to generator query.
-             param (QueryParameters): The query parameters.
+             param (Arguments): The query parameters.
 
          Raises:
              Exception: NotImplemented
@@ -41,8 +48,8 @@ class CustomPlugin(ApiPluginBase):
              ApiPluginBase.replace_values(query, param.model, messages[0]['content'])
              return query
          except Exception as e:
-             print(e)
-             print('Prompt: %s invalidate!'%messages)
+             logger.exception(e)
+             logger.error('Prompt: %s invalidate!' % messages)
              return None
 
      def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
@@ -53,7 +60,7 @@ class CustomPlugin(ApiPluginBase):
 
          Args:
              responses (List[bytes]): List of http response body, for stream output,
-             there are multiple responses, for general only one.
+                 there are multiple responses, for general only one.
              kwargs: (Any): The command line --parameter content.
          Returns:
              Tuple: Return number of prompt token and number of completion tokens.
@@ -63,15 +70,15 @@
          input_tokens = None
          output_tokens = None
          for response in responses:
-             js = json.loads(response)
+             data = json.loads(response)
              # {"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble",
              # "model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"性"}
-             if 'text_output' in js:
+             if 'text_output' in data:
                  if 0 in delta_contents:
-                     delta_contents[0].append(js['text_output'])
+                     delta_contents[0].append(data['text_output'])
                  else:
-                     delta_contents[0] = [js['text_output']]
-         if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+                     delta_contents[0] = [data['text_output']]
+         if input_tokens is None and output_tokens is None and self.tokenizer is not None:
              input_tokens = 0
              output_tokens = 0
              for _, choice_contents in delta_contents.items():
@@ -80,8 +87,7 @@
                  output_tokens += len(self.tokenizer.encode(full_response_content))
          elif input_tokens is None and output_tokens is None: # no usage info get.
              input_tokens = 0
-             output_tokens = 0
-
+             output_tokens = 0
+             logger.warning('No usage info get.')
+
          return input_tokens, output_tokens
-
-
evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} RENAMED
@@ -1,26 +1,30 @@
-
- from sys import maxsize
- import sys
+ import os
  from typing import Any, Dict, Iterator, List
+
  import json
- from evalscope.perf.api_plugin_base import ApiPluginBase
 
- from evalscope.perf.plugin_registry import register_api
- from evalscope.perf.query_parameters import QueryParameters
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.api.base import ApiPluginBase
+ from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
 
- @register_api("dashscope")
+ @register_api('dashscope')
  class DashScopeApiPlugin(ApiPluginBase):
+
      def __init__(self, mode_path: str):
          """Init the plugin
 
          Args:
-             mode_path (str): The model path, we use the tokenizer
+             mode_path (str): The model path, we use the tokenizer
              weight in the model to calculate the number of the
              input and output tokens.
          """
          super().__init__(model_path=mode_path)
-
-     def build_request(self,messages: List[Dict], param: QueryParameters) -> Dict:
+
+     def build_request(self, messages: List[Dict], param: Arguments) -> Dict:
          """Build the openai format request based on prompt, dataset
 
          Args:
@@ -35,16 +39,26 @@ class DashScopeApiPlugin(ApiPluginBase):
          """
          try:
              if param.query_template is not None:
-                 query = json.loads(param.query_template)
+                 if param.query_template.startswith('@'):
+                     file_path = param.query_template[1:]
+                     if os.path.exists(file_path):
+                         with open(file_path, 'r') as file:
+                             query = json.load(file)
+                     else:
+                         raise FileNotFoundError(f'{file_path}')
+                 else:
+                     query = json.loads(param.query_template)
+
                  query['input']['messages'] = messages # replace template content with message.
                  return self.__compose_query_from_parameter(query, param)
              else:
                  query = {'messages': messages}
                  return self.__compose_query_from_parameter(query, param)
          except Exception as e:
-             print(e)
+             logger.exception(e)
              return None
-     def __compose_query_from_parameter(self, payload: Dict, param: QueryParameters):
+
+     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
          payload['model'] = param.model
          if 'parameters' not in payload:
              payload['parameters'] = {}
@@ -73,7 +87,7 @@ class DashScopeApiPlugin(ApiPluginBase):
 
          Args:
              responses (List[bytes]): List of http response body, for stream output,
-             there are multiple responses, for general only one.
+                 there are multiple responses, for general only one.
              kwargs: (Any): The command line --parameter content.
 
          Returns:
evalscope/perf/{openai_api.py → plugin/api/openai_api.py} RENAMED
@@ -1,19 +1,26 @@
+ import os
  from typing import Any, Dict, Iterator, List
+
  import json
- from evalscope.perf.api_plugin_base import ApiPluginBase
  from transformers import AutoTokenizer
- from evalscope.perf.plugin_registry import register_api
- from evalscope.perf.query_parameters import QueryParameters
 
- @register_api("openai")
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.api.base import ApiPluginBase
+ from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_api(['openai', 'local_vllm', 'local'])
  class OpenaiPlugin(ApiPluginBase):
-     """Base of openai interface.
-     """
+     """Base of openai interface."""
+
      def __init__(self, mode_path: str):
          """Init the plugin
 
          Args:
-             mode_path (str): The model path, we use the tokenizer
+             mode_path (str): The model path, we use the tokenizer
              weight in the model to calculate the number of the
              input and output tokens.
          """
@@ -23,11 +30,11 @@ class OpenaiPlugin(ApiPluginBase):
          else:
              self.tokenizer = None
 
-     def build_request(self, messages: List[Dict], param: QueryParameters) -> Dict:
+     def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
          """Build the openai format request based on prompt, dataset
 
          Args:
-             message (Dict): The basic message to generator query.
+             message (List[Dict] | str): The basic message to generator query.
              param (QueryParameters): The query parameters.
 
          Raises:
@@ -38,22 +45,35 @@ class OpenaiPlugin(ApiPluginBase):
          """
          try:
              if param.query_template is not None:
-                 query = json.loads(param.query_template)
+                 if param.query_template.startswith('@'):
+                     file_path = param.query_template[1:]
+                     if os.path.exists(file_path):
+                         with open(file_path, 'r') as file:
+                             query = json.load(file)
+                     else:
+                         raise FileNotFoundError(f'{file_path}')
+                 else:
+                     query = json.loads(param.query_template)
+
                  if 'stream' in query.keys():
                      param.stream = query['stream']
-                 query['messages'] = messages # replace template messages with input messages.
-                 return self.__compose_query_from_parameter(query, param)
+                 # replace template messages with input messages.
+                 query['messages'] = messages
+             elif isinstance(messages, str):
+                 query = {'prompt': messages}
              else:
                  query = {'messages': messages}
-                 return self.__compose_query_from_parameter(query, param)
+             return self.__compose_query_from_parameter(query, param)
          except Exception as e:
-             print(e)
+             logger.exception(e)
              return None
-
-     def __compose_query_from_parameter(self, payload: Dict, param: QueryParameters):
+
+     def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
          payload['model'] = param.model
          if param.max_tokens is not None:
              payload['max_tokens'] = param.max_tokens
+         if param.min_tokens is not None:
+             payload['min_tokens'] = param.min_tokens
          if param.frequency_penalty is not None:
              payload['frequency_penalty'] = param.frequency_penalty
          if param.logprobs is not None:
@@ -66,7 +86,7 @@ class OpenaiPlugin(ApiPluginBase):
              payload['stop'] = param.stop
          if param.stream is not None and param.stream:
              payload['stream'] = param.stream
-             payload['stream_options'] = {"include_usage": True}
+             payload['stream_options'] = {'include_usage': True}
          if param.stop_token_ids is not None:
              payload['stop_token_ids'] = param.stop_token_ids
          if param.temperature is not None:
@@ -83,7 +103,7 @@ class OpenaiPlugin(ApiPluginBase):
 
          Args:
              responses (List[bytes]): List of http response body, for stream output,
-             there are multiple responses, for general only one.
+                 there are multiple responses, for general only one.
              kwargs: (Any): The command line --parameter content.
          Returns:
              Tuple: Return number of prompt token and number of completion tokens.
@@ -96,10 +116,15 @@ class OpenaiPlugin(ApiPluginBase):
              js = json.loads(response)
              if js['object'] == 'chat.completion':
                  for choice in js['choices']:
-                     delta_contents[choice['index']] = [choice['message']['content']]
+                     delta_contents[choice['index']] = [choice['message']['content']]
+                 input_tokens = js['usage']['prompt_tokens']
+                 output_tokens = js['usage']['completion_tokens']
+             elif js['object'] == 'text_completion':
+                 for choice in js['choices']:
+                     delta_contents[choice['index']] = [choice['text']]
                  input_tokens = js['usage']['prompt_tokens']
-                 output_tokens = js['usage']['completion_tokens']
-             else: # 'object' == "chat.completion.chunk":
+                 output_tokens = js['usage']['completion_tokens']
+             elif js['object'] == 'chat.completion.chunk':
                  if 'choices' in js:
                      for choice in js['choices']:
                          if 'delta' in choice and 'index' in choice:
@@ -115,8 +140,8 @@ class OpenaiPlugin(ApiPluginBase):
                  # "choices":[],"usage":{"prompt_tokens":32,"total_tokens":384,"completion_tokens":352}}
                  if 'usage' in js and js['usage']:
                      input_tokens = js['usage']['prompt_tokens']
-                     output_tokens = js['usage']['completion_tokens']
-         if input_tokens is None and output_tokens is None and self.tokenizer is not None:
+                     output_tokens = js['usage']['completion_tokens']
+         if (input_tokens is None and output_tokens is None and self.tokenizer is not None):
              input_tokens = 0
              output_tokens = 0
              for idx, choice_contents in delta_contents.items():
@@ -125,8 +150,7 @@ class OpenaiPlugin(ApiPluginBase):
                  output_tokens += len(self.tokenizer.encode(full_response_content))
          elif input_tokens is None and output_tokens is None: # no usage info get.
              input_tokens = 0
-             output_tokens = 0
-
+             output_tokens = 0
+             logger.warning('No usage info get.')
+
          return input_tokens, output_tokens
-
-
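
Note: both the DashScope and OpenAI plugins now accept a query template either as an inline JSON string or, when prefixed with '@', as a path to a JSON file. A sketch of the file-based form; the template content and the --query-template flag spelling are assumptions to illustrate the '@' handling shown above:

    import json

    # Illustrative template; after loading, the plugin overwrites 'messages' with the
    # prompt from the dataset plugin and sets 'model' from the model argument.
    template = {'messages': [], 'stream': True, 'temperature': 0.7}
    with open('template.json', 'w') as f:
        json.dump(template, f)

    # Then pass the file instead of an inline string, e.g.:
    #   evalscope perf ... --query-template @template.json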
evalscope/perf/plugin/datasets/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from evalscope.perf.plugin.datasets.custom import CustomDatasetPlugin
+ from evalscope.perf.plugin.datasets.flickr8k import FlickrDatasetPlugin
+ from evalscope.perf.plugin.datasets.line_by_line import LineByLineDatasetPlugin
+ from evalscope.perf.plugin.datasets.longalpaca import LongAlpacaDatasetPlugin
+ from evalscope.perf.plugin.datasets.openqa import OpenqaDatasetPlugin
+ from evalscope.perf.plugin.datasets.speed_benchmark import SpeedBenchmarkDatasetPlugin, SpeedBenchmarkLongDatasetPlugin
evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} RENAMED
@@ -1,12 +1,15 @@
- from abc import abstractmethod
  import sys
+ from abc import abstractmethod
  from typing import Any, Dict, Iterator, List, Tuple
+
  import json
 
- from evalscope.perf.query_parameters import QueryParameters
+ from evalscope.perf.arguments import Arguments
+
 
  class DatasetPluginBase:
-     def __init__(self, query_parameters: QueryParameters):
+
+     def __init__(self, query_parameters: Arguments):
          """Build data set plugin
 
          Args:
@@ -21,9 +24,9 @@ class DatasetPluginBase:
 
      def __iter__(self):
          return self.build_messages()
-
+
      @abstractmethod
-     def build_messages(self)->Iterator[List[Dict]]:
+     def build_messages(self) -> Iterator[List[Dict]]:
          """Build the request.
 
          Raises:
@@ -33,8 +36,8 @@ class DatasetPluginBase:
              Iterator[List[Dict]]: Yield request messages.
          """
          raise NotImplementedError
-
-     def dataset_line_by_line(self, dataset: str)->Iterator[str]:
+
+     def dataset_line_by_line(self, dataset: str) -> Iterator[str]:
          """Get content line by line of dataset.
 
          Args:
@@ -46,8 +49,8 @@ class DatasetPluginBase:
          with open(dataset, 'r', encoding='utf-8') as f:
              for line in f:
                  yield line
-
-     def dataset_json_list(self, dataset: str)->Iterator[Dict]:
+
+     def dataset_json_list(self, dataset: str) -> Iterator[Dict]:
          """Read data from file which is list of requests.
          Sample: https://huggingface.co/datasets/Yukang/LongAlpaca-12k
 
@@ -61,4 +64,4 @@
              content = f.read()
              data = json.loads(content)
              for item in data:
-                 yield item
+                 yield item
evalscope/perf/plugin/datasets/custom.py ADDED
@@ -0,0 +1,21 @@
+ from typing import Dict, Iterator, List
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
+
+ @register_dataset('custom')
+ class CustomDatasetPlugin(DatasetPluginBase):
+     """Read dataset and return prompt.
+     """
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
+             prompt = item.strip()
+             if len(prompt) > self.query_parameters.min_prompt_length and len(
+                     prompt) < self.query_parameters.max_prompt_length:
+                 yield [{'role': 'user', 'content': prompt}]
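
Note: the 'custom' plugin (like 'line_by_line') reads the file at dataset_path one prompt per line and keeps only prompts whose length lies strictly between min_prompt_length and max_prompt_length. A tiny illustrative input file, created here in Python; the file name is hypothetical:

    # Hypothetical input file for the 'custom' dataset plugin: one prompt per line.
    with open('prompts.txt', 'w', encoding='utf-8') as f:
        f.write('Explain the difference between latency and throughput.\n')
        f.write('Summarize the plot of Hamlet in two sentences.\n')
    # Each kept line is emitted as [{'role': 'user', 'content': prompt}].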
evalscope/perf/plugin/datasets/flickr8k.py ADDED
@@ -0,0 +1,51 @@
+ import base64
+ from io import BytesIO
+ from typing import Any, Dict, Iterator, List
+
+ from modelscope.msdatasets import MsDataset
+ from PIL import Image
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
+
+ def PIL_to_base64(image: Image.Image) -> str:
+     buffered = BytesIO()
+     image.save(buffered, format='JPEG')
+     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+     return img_str
+
+
+ @register_dataset('flickr8k')
+ class FlickrDatasetPlugin(DatasetPluginBase):
+     """Read dataset and return prompt.
+     Datasets: https://www.modelscope.cn/datasets/clip-benchmark/wds_flickr8k/files
+     """
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')
+
+         for item in dataset:
+             pil_image = item['jpg']
+             base64_iamge = PIL_to_base64(pil_image)
+
+             yield [{
+                 'role':
+                 'user',
+                 'content': [
+                     {
+                         'type': 'text',
+                         'text': 'Describe the image'
+                     },
+                     {
+                         'type': 'image_url',
+                         'image_url': {
+                             'url': f'data:image/jpeg;base64,{base64_iamge}',
+                         }
+                     },
+                 ],
+             }]
evalscope/perf/{datasets → plugin/datasets}/line_by_line.py RENAMED
@@ -1,18 +1,22 @@
  import sys
  from typing import Dict, Iterator, List
- from evalscope.perf.dataset_plugin_base import DatasetPluginBase
- from evalscope.perf.plugin_registry import register_dataset
- from evalscope.perf.query_parameters import QueryParameters
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
 
  @register_dataset('line_by_line')
  class LineByLineDatasetPlugin(DatasetPluginBase):
      """Read dataset and return prompt.
      """
-     def __init__(self, query_parameters: QueryParameters):
+
+     def __init__(self, query_parameters: Arguments):
          super().__init__(query_parameters)
 
      def build_messages(self) -> Iterator[List[Dict]]:
          for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
              prompt = item.strip()
-             if len(prompt) > self.query_parameters.min_prompt_length and len(prompt) < self.query_parameters.max_prompt_length:
+             if len(prompt) > self.query_parameters.min_prompt_length and len(
+                     prompt) < self.query_parameters.max_prompt_length:
                  yield [{'role': 'user', 'content': prompt}]
evalscope/perf/plugin/datasets/longalpaca.py ADDED
@@ -0,0 +1,28 @@
+ from typing import Any, Dict, Iterator, List
+
+ from modelscope import MsDataset
+
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.plugin.datasets.base import DatasetPluginBase
+ from evalscope.perf.plugin.registry import register_dataset
+
+
+ @register_dataset('longalpaca')
+ class LongAlpacaDatasetPlugin(DatasetPluginBase):
+     """Read data from file which is list of requests.
+     Sample: https://www.modelscope.cn/datasets/AI-ModelScope/LongAlpaca-12k/files
+     """
+
+     def __init__(self, query_parameters: Arguments):
+         super().__init__(query_parameters)
+
+     def build_messages(self) -> Iterator[List[Dict]]:
+         if not self.query_parameters.dataset_path:
+             ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
+         else:
+             ds = self.dataset_json_list(self.query_parameters.dataset_path)
+         for item in ds:
+             prompt = item['instruction'].strip()
+             if len(prompt) > self.query_parameters.min_prompt_length and len(
+                     prompt) < self.query_parameters.max_prompt_length:
+                 yield [{'role': 'user', 'content': prompt}]
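
Note: when dataset_path is empty, LongAlpacaDatasetPlugin streams AI-ModelScope/LongAlpaca-12k directly from ModelScope; otherwise it expects a local JSON list whose items carry an 'instruction' field, as read by dataset_json_list(). A sketch of that local-file layout; the file name and example instructions are hypothetical:

    import json

    # records.json: a JSON list mirroring the LongAlpaca-12k layout.
    records = [
        {'instruction': 'Read the following article and answer the question ...'},
        {'instruction': 'Given the contract text below, list the obligations of each party ...'},
    ]
    with open('records.json', 'w', encoding='utf-8') as f:
        json.dump(records, f)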