evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/models/local_model.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ from typing import TYPE_CHECKING, Optional
+
+ from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
+ from evalscope.utils.logger import get_logger
+
+ if TYPE_CHECKING:
+     from evalscope.config import TaskConfig
+
+ logger = get_logger()
+
+
+ class LocalModel:
+
+     def __init__(self,
+                  model_id: str,
+                  model_revision: str = DEFAULT_MODEL_REVISION,
+                  device_map: str = 'auto',
+                  torch_dtype: str = 'auto',
+                  cache_dir: str = None,
+                  **kwargs):
+         from modelscope import AutoModelForCausalLM, AutoTokenizer
+
+         model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
+
+         if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+             torch_dtype = eval(torch_dtype)
+
+         self.model_id = model_id
+         self.model_revision = model_revision
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             trust_remote_code=True,
+             cache_dir=model_cache_dir,
+         )
+
+         self.model = AutoModelForCausalLM.from_pretrained(
+             self.model_id,
+             revision=model_revision,
+             device_map=device_map,
+             trust_remote_code=True,
+             torch_dtype=torch_dtype,
+             cache_dir=model_cache_dir,
+         )
+
+         self.model_cfg = {
+             'model_id': model_id,
+             'device_map': device_map,
+             'torch_dtype': str(torch_dtype),
+         }
+
+
+ def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
+     """Get the base local model for the task. If the task is not checkpoint-based, return None.
+     Avoids loading model multiple times for different datasets.
+     """
+     if task_cfg.eval_type != EvalType.CHECKPOINT:
+         return None
+     else:
+         device_map = task_cfg.model_args.get('device_map', 'auto')
+         cache_dir = task_cfg.model_args.get('cache_dir', None)
+         model_precision = task_cfg.model_args.get('precision', 'torch.float16')
+         model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
+
+         base_model = LocalModel(
+             model_id=task_cfg.model,
+             model_revision=model_revision,
+             device_map=device_map,
+             torch_dtype=model_precision,
+             cache_dir=cache_dir)
+         return base_model
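For orientation, a minimal usage sketch of the new loader (the model ID is a placeholder; loading goes through modelscope and will download weights):

from evalscope.models.local_model import LocalModel

# Placeholder model ID for illustration; any ModelScope causal LM ID works the same way.
model = LocalModel(model_id='Qwen/Qwen2-0.5B-Instruct', torch_dtype='torch.float16')
print(model.model_cfg)  # {'model_id': ..., 'device_map': 'auto', 'torch_dtype': 'torch.float16'}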
evalscope/models/model.py CHANGED
@@ -1,7 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ import random
+ import time
  from abc import ABC, abstractmethod
  from typing import Any
 
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
 
  class BaseModel(ABC):
 
@@ -86,3 +93,137 @@ class ChatBaseModel(BaseModel):
      }
      """
      raise NotImplementedError
+
+
+ class OpenAIModel(ChatBaseModel):
+     """
+     APIs of OpenAI models.
+     Available models: gpt-3.5-turbo, gpt-4
+     """
+
+     MAX_RETRIES = 3
+
+     def __init__(self, model_cfg: dict, **kwargs):
+         super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
+
+         openai_api_key = os.environ.get('OPENAI_API_KEY', None)
+         self.api_key = self.model_cfg.get('api_key', openai_api_key)
+
+         if not self.api_key:
+             logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
+             # raise ValueError(
+             #     'OpenAI API key is not provided, '
+             #     'please set it in environment variable OPENAI_API_KEY')
+
+     def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
+
+         sys_prompt: str = inputs.get('sys_prompt', '')
+         user_prompt: str = inputs.get('user_prompt', '')
+
+         # model_id: str = kwargs.get('model_id', '')
+         temperature: float = kwargs.pop('temperature', 0.2)
+         max_tokens: int = kwargs.pop('max_tokens', 1024)
+         mode: str = kwargs.pop('mode', 'chat.completion')
+
+         logger.info(f'Using OpenAI model_id: {model_id}')
+
+         res = self._predict(
+             model_id=model_id,
+             sys_prompt=sys_prompt,
+             user_prompt=user_prompt,
+             temperature=temperature,
+             max_tokens=max_tokens,
+             mode=mode)
+
+         return res
+
+     def _predict(
+         self,
+         model_id,
+         sys_prompt,
+         user_prompt,
+         temperature,
+         max_tokens,
+         mode: str = 'chat.completion',
+     ) -> dict:
+         import openai
+
+         res = {}
+         openai.api_key = self.api_key
+
+         for i in range(self.MAX_RETRIES):
+             try:
+                 if mode == 'chat.completion':
+                     resp = openai.ChatCompletion.create(
+                         model=model_id,
+                         messages=[{
+                             'role': 'system',
+                             'content': sys_prompt
+                         }, {
+                             'role': 'user',
+                             'content': user_prompt
+                         }],
+                         temperature=temperature,
+                         max_tokens=max_tokens)
+
+                     if resp:
+                         ans_text = resp['choices'][0]['message']['content']
+                         model_id = resp['model']
+                     else:
+                         logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                        f'for input {sys_prompt} {user_prompt}')
+                         ans_text = ''
+                         model_id = ''
+
+                     res['ans_text'] = ans_text
+                     res['model_id'] = model_id
+                 else:
+                     raise ValueError(f'Invalid mode: {mode}')
+
+                 return res
+
+             except Exception as e:
+                 logger.warning(f'OpenAI API call failed: {e}')
+                 time.sleep(3)
+         logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+         return res
+
+
+ class DummyChatModel(ChatBaseModel):
+
+     MODEL_ID = 'dummy_chat_model_0801'
+     REVISION = 'v1.0.0'
+
+     def __init__(self, model_cfg: dict, **kwargs):
+         model_cfg['model_id'] = self.MODEL_ID
+         model_cfg['revision'] = self.REVISION
+         super(DummyChatModel, self).__init__(model_cfg=model_cfg)
+
+     def predict(self, inputs: dict, **kwargs) -> dict:
+
+         debug: bool = False
+         if debug:
+             messages = inputs['messages']
+             history = inputs['history']
+
+             logger.info(f'** messages: {messages}')
+             logger.info(f'** history: {history}')
+
+         choice = random.choice(['A', 'B', 'C', 'D'])
+
+         # Build response
+         res = {
+             'choices': [{
+                 'index': 0,
+                 'message': {
+                     'content': choice,
+                     'role': 'assistant'
+                 }
+             }],
+             'created': time.time(),
+             'model': self.MODEL_ID + '-' + self.REVISION,
+             'object': 'chat.completion',
+             'usage': {}
+         }
+
+         return res
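As a quick illustration of the call shape these classes expect (a sketch only: it assumes OPENAI_API_KEY is set and a pre-1.0 openai client, since the code uses the legacy openai.ChatCompletion interface):

from evalscope.models.model import OpenAIModel

model = OpenAIModel(model_cfg={})  # falls back to the OPENAI_API_KEY environment variable
res = model.predict(
    model_id='gpt-3.5-turbo',
    inputs={'sys_prompt': 'You are a grader.', 'user_prompt': 'Score this answer: ...'},
    temperature=0.2,
)
print(res.get('ans_text'), res.get('model_id'))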
evalscope/models/server_adapter.py ADDED
@@ -0,0 +1,111 @@
+ import requests
+ import time
+ from typing import Optional, Union
+
+ from evalscope.models.base_adapter import BaseModelAdapter
+ from evalscope.utils.chat_service import ChatMessage
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class ServerModelAdapter(BaseModelAdapter):
+     """
+     Server model adapter to request remote API model and generate results.
+     """
+
+     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+         """
+         Args:
+             api_url: The URL of the remote API model.
+             model_id: The ID of the remote API model.
+             api_key: The API key of the remote API model.
+         """
+         self.api_url = api_url
+         self.model_id = model_id
+         self.api_key = api_key
+         self.seed = kwargs.get('seed', None)
+         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
+         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
+
+     def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+         """
+         Model prediction func.
+
+         Args:
+             inputs (Union[str, dict, list]): The input data.
+             infer_cfg (dict): Inference configuration.
+
+         Returns:
+             res (dict): The model prediction results.
+         """
+         infer_cfg = infer_cfg or {}
+
+         # Process inputs
+         if isinstance(inputs, str):
+             query = inputs
+             system_prompt = None
+         elif isinstance(inputs, dict):
+             data: list = inputs['data']
+             if isinstance(data[0], tuple): # for truthful_qa and hellaswag
+                 query = '\n'.join(''.join(item) for item in data)
+                 system_prompt = inputs.get('system_prompt', None)
+             else:
+                 query = data[0]
+                 system_prompt = inputs.get('system_prompt', None)
+         elif isinstance(inputs, list):
+             query = '\n'.join(inputs)
+             system_prompt = None
+         else:
+             raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+         content = self.make_request_content(query, system_prompt)
+         request_json = self.make_request(content, infer_cfg)
+         response = self.send_request(request_json)
+         return response
+
+     def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> dict:
+         """
+         Make request content for API.
+         """
+         if system_prompt is not None:
+             messages = [
+                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
+                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
+             ]
+         else:
+             messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
+         return {'messages': messages}
+
+     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
+         """Make request to remote API."""
+         # Format request JSON according to OpenAI API format
+         do_sample = infer_cfg.get('do_sample', False)
+         temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
+
+         request_json = {
+             **content, 'model': self.model_id,
+             'max_tokens': infer_cfg.get('max_tokens', 2048),
+             'temperature': temperature,
+             'top_p': infer_cfg.get('top_p', 1.0),
+             'n': infer_cfg.get('num_return_sequences', 1),
+             'stop': infer_cfg.get('stop', None)
+         }
+         if self.seed is not None:
+             request_json['seed'] = self.seed
+         logger.debug(f'Request to remote API: {request_json}')
+         return request_json
+
+     def send_request(self, request_json: dict, max_retries: int = 3) -> dict:
+         for attempt in range(max_retries):
+             response = requests.post(
+                 self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
+             if response.status_code == 200:
+                 response_data = response.json()
+                 return response_data
+             logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
+             if attempt < max_retries - 1:
+                 time.sleep(5) # Sleep for 5 seconds before retrying
+             else:
+                 raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
+                                    f'{response.status_code} {response.text}')
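A minimal sketch of driving the new adapter (the endpoint and model name are placeholders; reading choices[0] assumes the server returns an OpenAI-compatible chat completion body):

from evalscope.models.server_adapter import ServerModelAdapter

adapter = ServerModelAdapter(
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    model_id='my-served-model',  # placeholder model name
    api_key='EMPTY',
)
resp = adapter.predict('What is 2 + 2?', infer_cfg={'max_tokens': 64, 'do_sample': False})
print(resp['choices'][0]['message']['content'])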
evalscope/perf/__init__.py ADDED
@@ -0,0 +1 @@
+ from evalscope.perf.main import run_perf_benchmark
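This re-export makes the perf entry point importable as evalscope.perf.run_perf_benchmark. A hedged sketch of a dict-based call (field names are taken from the Arguments dataclass shown below; the endpoint and model are placeholders, and additional fields such as the dataset or concurrency settings may be required in practice):

from evalscope.perf import run_perf_benchmark

# Dict input is converted to Arguments(**task) inside run_perf_benchmark.
task = {
    'url': 'http://127.0.0.1:8877/v1/chat/completions',  # placeholder endpoint
    'api': 'openai',
    'model': 'my-served-model',  # placeholder model name
    'api_key': 'EMPTY',
}
run_perf_benchmark(task)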
evalscope/perf/arguments.py CHANGED
@@ -16,7 +16,7 @@ class Arguments:
      attn_implementation: Optional[str] = None # Attention implementaion, only for local inference
      api: str = 'openai' # API to be used (default: 'openai')
      tokenizer_path: Optional[str] = None # Path to the tokenizer
-     port: str = '8877' # Port number for the local API server
+     port: int = 8877 # Port number for the local API server
 
      # Connection settings
      url: str = 'http://127.0.0.1:8877/v1/chat/completions' # URL for the API connection
@@ -68,6 +68,7 @@ class Arguments:
              model=args.model,
              attn_implementation=args.attn_implementation,
              url=args.url,
+             port=args.port,
              api_key=args.api_key,
              connect_timeout=args.connect_timeout,
              read_timeout=args.read_timeout,
@@ -138,6 +139,7 @@ def add_argument(parser: argparse.ArgumentParser):
 
      # Connection settings
      parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+     parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
      parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
      parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
      parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
evalscope/perf/benchmark.py CHANGED
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
      while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
          try:
              # Attempt to get benchmark data from the queue with a timeout
-             benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=1)
+             benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
              benchmark_data_queue.task_done()
          except asyncio.TimeoutError:
              # If timeout, continue to the next iteration
@@ -195,9 +195,9 @@ async def start_server(args: Arguments) -> bool:
          server.start()
 
          if args.dataset.startswith('speed_benchmark'):
-             args.url = 'http://127.0.0.1:8877/v1/completions'
+             args.url = f'http://127.0.0.1:{args.port}/v1/completions'
          else:
-             args.url = 'http://127.0.0.1:8877/v1/chat/completions'
+             args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'
 
      if not await test_connection(args):
          raise TimeoutError('Test connection failed')
evalscope/perf/main.py CHANGED
@@ -1,5 +1,4 @@
  import asyncio
- import logging
  import os
  import platform
  from argparse import Namespace
@@ -8,7 +7,7 @@ from evalscope.perf.arguments import Arguments, parse_args
  from evalscope.perf.benchmark import benchmark
  from evalscope.perf.utils.db_util import get_output_path
  from evalscope.perf.utils.handler import add_signal_handlers
- from evalscope.utils.logger import get_logger
+ from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.utils.utils import seed_everything
 
  logger = get_logger()
@@ -19,14 +18,13 @@ def run_perf_benchmark(args):
          args = Arguments(**args)
      elif isinstance(args, Namespace):
          args = Arguments.from_args(args)
-     seed_everything(args.seed)
+
+     if args.seed is not None:
+         seed_everything(args.seed)
 
      # Setup logger and output
      args.outputs_dir = get_output_path(args)
-     get_logger(log_file=os.path.join(args.outputs_dir, 'benchmark.log'), force=True)
-
-     if args.debug:
-         get_logger(log_level=logging.DEBUG, force=True)
+     configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
 
      logger.info('Starting benchmark...')
      logger.info(args)
evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -1,5 +1,4 @@
  import json
- from transformers import AutoTokenizer
  from typing import Any, Dict, Iterator, List
 
  from evalscope.perf.arguments import Arguments
@@ -25,6 +24,7 @@ class CustomPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
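The same deferred-import pattern recurs in the openai_api, flickr8k, and longalpaca plugins below: heavyweight dependencies (transformers, modelscope) are imported only inside the code paths that need them, so importing the perf package stays cheap. A generic sketch of the pattern (illustrative class, not part of evalscope):

class LazyTokenizerPlugin:
    """Illustrative only: defer the transformers import until a tokenizer is requested."""

    def __init__(self, model_path: str = None):
        if model_path is not None:
            from transformers import AutoTokenizer  # imported lazily, on first use
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        else:
            self.tokenizer = None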
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,7 +1,6 @@
  import json
  import os
- from transformers import AutoTokenizer
- from typing import Any, Dict, Iterator, List
+ from typing import Any, Dict, Iterator, List, Union
 
  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -25,11 +24,12 @@ class OpenaiPlugin(ApiPluginBase):
          """
          super().__init__(model_path=mode_path)
          if mode_path is not None:
+             from transformers import AutoTokenizer
              self.tokenizer = AutoTokenizer.from_pretrained(mode_path)
          else:
              self.tokenizer = None
 
-     def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
+     def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
          """Build the openai format request based on prompt, dataset
 
          Args:
@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):
 
      def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
          """Parser responses and return number of request and response tokens.
-         sample of the output delta:
-         {"id":"4","object":"chat.completion.chunk","created":1714030870,"model":"llama3","choices":[{"index":0,"delta":{"role":"assistant","content":""},"logprobs":null,"finish_reason":null}]}
+         Only one response for non-stream, multiple responses for stream.
+         """
 
+         # when stream, the last response is the full usage
+         # when non-stream, the last response is the first response
+         last_response_js = json.loads(responses[-1])
+         if 'usage' in last_response_js and last_response_js['usage']:
+             input_tokens = last_response_js['usage']['prompt_tokens']
+             output_tokens = last_response_js['usage']['completion_tokens']
+             return input_tokens, output_tokens
 
-         Args:
-             responses (List[bytes]): List of http response body, for stream output,
-                 there are multiple responses, for general only one.
-             kwargs: (Any): The command line --parameter content.
-         Returns:
-             Tuple: Return number of prompt token and number of completion tokens.
-         """
-         full_response_content = ''
+         # no usage information in the response, parse the response to get the tokens
          delta_contents = {}
-         input_tokens = None
-         output_tokens = None
          for response in responses:
              js = json.loads(response)
-             if js['object'] == 'chat.completion':
-                 for choice in js['choices']:
-                     delta_contents[choice['index']] = [choice['message']['content']]
-                 input_tokens = js['usage']['prompt_tokens']
-                 output_tokens = js['usage']['completion_tokens']
-             elif js['object'] == 'text_completion':
-                 for choice in js['choices']:
-                     delta_contents[choice['index']] = [choice['text']]
-                 input_tokens = js['usage']['prompt_tokens']
-                 output_tokens = js['usage']['completion_tokens']
-             elif js['object'] == 'chat.completion.chunk':
-                 if 'choices' in js:
-                     for choice in js['choices']:
-                         if 'delta' in choice and 'index' in choice:
-                             delta = choice['delta']
-                             idx = choice['index']
-                             if 'content' in delta:
-                                 delta_content = delta['content']
-                                 if idx in delta_contents:
-                                     delta_contents[idx].append(delta_content)
-                                 else:
-                                     delta_contents[idx] = [delta_content]
-                 # usage in chunk: {"id":"","object":"chat.completion.chunk","created":1718269986,"model":"llama3",
-                 # "choices":[],"usage":{"prompt_tokens":32,"total_tokens":384,"completion_tokens":352}}
-                 if 'usage' in js and js['usage']:
-                     input_tokens = js['usage']['prompt_tokens']
-                     output_tokens = js['usage']['completion_tokens']
-         if (input_tokens is None and output_tokens is None and self.tokenizer is not None):
-             input_tokens = 0
-             output_tokens = 0
+             if 'object' in js:
+                 self.__process_response_object(js, delta_contents)
+             else:
+                 self.__process_no_object(js, delta_contents)
+
+         input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+         return input_tokens, output_tokens
+
+     def __process_response_object(self, js, delta_contents):
+         if js['object'] == 'chat.completion':
+             for choice in js['choices']:
+                 delta_contents[choice['index']] = [choice['message']['content']]
+         elif js['object'] == 'text_completion':
+             for choice in js['choices']:
+                 delta_contents[choice['index']] = [choice['text']]
+         elif js['object'] == 'chat.completion.chunk':
+             for choice in js.get('choices', []):
+                 if 'delta' in choice and 'index' in choice:
+                     delta = choice['delta']
+                     idx = choice['index']
+                     if 'content' in delta:
+                         delta_content = delta['content']
+                         delta_contents.setdefault(idx, []).append(delta_content)
+
+     def __process_no_object(self, js, delta_contents):
+         # assume the response is a single choice
+         for choice in js['choices']:
+             if 'delta' in choice:
+                 delta = choice['delta']
+                 idx = choice['index']
+                 if 'content' in delta:
+                     delta_content = delta['content']
+                     delta_contents.setdefault(idx, []).append(delta_content)
+             else:
+                 delta_contents[choice['index']] = [choice['message']['content']]
+
+     def __calculate_tokens_from_content(self, request, delta_contents):
+         input_tokens = output_tokens = 0
+         if self.tokenizer is not None:
              for idx, choice_contents in delta_contents.items():
-                 full_response_content = ''.join([m for m in choice_contents])
+                 full_response_content = ''.join(choice_contents)
                  input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                  output_tokens += len(self.tokenizer.encode(full_response_content))
-         elif input_tokens is None and output_tokens is None: # no usage info get.
-             input_tokens = 0
-             output_tokens = 0
+         else:
              logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
          return input_tokens, output_tokens
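To make the new control flow concrete, a small illustrative driver for parse_responses (the JSON bodies are made-up OpenAI-style stream chunks; with usage present in the last chunk the tokenizer is never consulted):

import json

# Two simulated stream chunks; the final chunk carries the usage block the parser looks for.
responses = [
    json.dumps({'object': 'chat.completion.chunk',
                'choices': [{'index': 0, 'delta': {'content': 'Hello'}}]}),
    json.dumps({'object': 'chat.completion.chunk', 'choices': [],
                'usage': {'prompt_tokens': 5, 'completion_tokens': 7}}),
]
# plugin = OpenaiPlugin(mode_path=None)            # tokenizer not needed when usage is present
# assert plugin.parse_responses(responses) == (5, 7)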
evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -1,6 +1,5 @@
  import base64
  from io import BytesIO
- from modelscope.msdatasets import MsDataset
  from PIL import Image
  from typing import Any, Dict, Iterator, List
 
@@ -26,6 +25,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
          super().__init__(query_parameters)
 
      def build_messages(self) -> Iterator[List[Dict]]:
+         from modelscope.msdatasets import MsDataset
          dataset = MsDataset.load('clip-benchmark/wds_flickr8k', split='test')
 
          for item in dataset:
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -1,4 +1,3 @@
- from modelscope import MsDataset
  from typing import Any, Dict, Iterator, List
 
  from evalscope.perf.arguments import Arguments
@@ -17,6 +16,7 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
 
      def build_messages(self) -> Iterator[List[Dict]]:
          if not self.query_parameters.dataset_path:
+             from modelscope import MsDataset
              ds = MsDataset.load('AI-ModelScope/LongAlpaca-12k', subset_name='default', split='train')
          else:
              ds = self.dataset_json_list(self.query_parameters.dataset_path)
evalscope/perf/plugin/registry.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Any, List, Type
+ from typing import Any, List, Type, Union
 
 
  class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
          return self.get_class(name)
 
 
- def register_dataset(name: str | List[str]):
+ def register_dataset(name: Union[str, List[str]]):
 
      def class_decorator(cls: Type):
          if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
      return class_decorator
 
 
- def register_api(name: str | List[str]):
+ def register_api(name: Union[str, List[str]]):
 
      def class_decorator(cls: Type):
          if isinstance(name, str):
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -116,19 +116,19 @@ class BenchmarkMetrics:
 
      def create_message(self, default_ndigits=3):
          message = {
-             'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+             'Time taken for tests (s)': round(self.total_time, default_ndigits),
              'Number of concurrency': self.concurrency,
              'Total requests': int(self.n_total_queries),
              'Succeed requests': self.n_succeed_queries,
              'Failed requests': self.n_failed_queries,
+             'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
              'Average QPS': round(self.qps, default_ndigits),
              'Average latency (s)': round(self.avg_latency, default_ndigits),
              'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
              'Average time per output token (s)': round(self.avg_time_per_token, 5),
-             'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-             'Average package per request': round(self.n_avg_chunks, default_ndigits),
-             'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
              'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
              'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+             'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+             'Average package per request': round(self.n_avg_chunks, default_ndigits),
          }
          return message