evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (234)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +6 -2
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +47 -51
  60. evalscope/backend/rag_eval/utils/embedding.py +13 -12
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +33 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +154 -96
  139. evalscope/constants.py +50 -32
  140. evalscope/evaluator/evaluator.py +97 -377
  141. evalscope/evaluator/rating_eval.py +12 -33
  142. evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
  143. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  144. evalscope/metrics/code_metric.py +3 -9
  145. evalscope/metrics/math_accuracy.py +3 -6
  146. evalscope/metrics/metrics.py +21 -21
  147. evalscope/metrics/rouge_metric.py +11 -25
  148. evalscope/models/__init__.py +1 -2
  149. evalscope/models/api/openai_api.py +40 -29
  150. evalscope/models/custom/__init__.py +0 -1
  151. evalscope/models/custom/custom_model.py +3 -3
  152. evalscope/models/dummy_chat_model.py +7 -8
  153. evalscope/models/model_adapter.py +89 -156
  154. evalscope/models/openai_model.py +20 -20
  155. evalscope/perf/arguments.py +16 -3
  156. evalscope/perf/benchmark.py +9 -11
  157. evalscope/perf/http_client.py +3 -8
  158. evalscope/perf/main.py +8 -1
  159. evalscope/perf/plugin/api/custom_api.py +1 -2
  160. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  161. evalscope/perf/plugin/api/openai_api.py +3 -4
  162. evalscope/perf/plugin/datasets/base.py +1 -2
  163. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  164. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  165. evalscope/perf/plugin/datasets/openqa.py +1 -2
  166. evalscope/perf/plugin/registry.py +3 -3
  167. evalscope/perf/utils/analysis_result.py +1 -2
  168. evalscope/perf/utils/benchmark_util.py +5 -6
  169. evalscope/perf/utils/db_util.py +77 -30
  170. evalscope/perf/utils/local_server.py +21 -13
  171. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  172. evalscope/registry/tasks/arc.yaml +2 -3
  173. evalscope/registry/tasks/bbh.yaml +3 -4
  174. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  175. evalscope/registry/tasks/ceval.yaml +3 -3
  176. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  177. evalscope/registry/tasks/cmmlu.yaml +3 -3
  178. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  179. evalscope/registry/tasks/general_qa.yaml +1 -1
  180. evalscope/registry/tasks/gsm8k.yaml +2 -2
  181. evalscope/registry/tasks/mmlu.yaml +3 -3
  182. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  183. evalscope/run.py +153 -381
  184. evalscope/run_arena.py +21 -25
  185. evalscope/summarizer.py +27 -40
  186. evalscope/third_party/longbench_write/README.md +99 -42
  187. evalscope/third_party/longbench_write/default_task.json +1 -1
  188. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  189. evalscope/third_party/longbench_write/eval.py +29 -27
  190. evalscope/third_party/longbench_write/infer.py +16 -104
  191. evalscope/third_party/longbench_write/longbench_write.py +5 -4
  192. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  193. evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
  194. evalscope/third_party/longbench_write/utils.py +0 -1
  195. evalscope/third_party/toolbench_static/eval.py +14 -15
  196. evalscope/third_party/toolbench_static/infer.py +48 -69
  197. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  198. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  199. evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
  200. evalscope/tools/combine_reports.py +27 -34
  201. evalscope/tools/rewrite_eval_results.py +15 -47
  202. evalscope/utils/__init__.py +1 -1
  203. evalscope/utils/arena_utils.py +18 -48
  204. evalscope/{perf/utils → utils}/chat_service.py +4 -5
  205. evalscope/utils/completion_parsers.py +3 -8
  206. evalscope/utils/io_utils.py +162 -0
  207. evalscope/utils/logger.py +17 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +5 -306
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
  212. evalscope-0.8.1.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +53 -15
  214. tests/perf/test_perf.py +6 -1
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. tests/vlm/test_vlmeval.py +3 -2
  222. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  224. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  225. evalscope/cache.py +0 -98
  226. evalscope/models/template.py +0 -1446
  227. evalscope/run_ms.py +0 -140
  228. evalscope/utils/task_cfg_parser.py +0 -10
  229. evalscope/utils/task_utils.py +0 -22
  230. evalscope-0.7.2.dist-info/RECORD +0 -286
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  234. {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
evalscope/perf/arguments.py CHANGED
@@ -1,18 +1,22 @@
 import argparse
+import json
+import os
 import sys
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-import json
+from evalscope.constants import DEFAULT_WORK_DIR


 @dataclass
 class Arguments:
     # Model and API
-    model: str  # Model identifier
+    model: str  # Model name or path
+    model_id: Optional[str] = None  # Model identifier
     attn_implementation: Optional[str] = None  # Attention implementaion, only for local inference
     api: str = 'openai'  # API to be used (default: 'openai')
     tokenizer_path: Optional[str] = None  # Path to the tokenizer
+    port: int = 8877  # Port number for the local API server

     # Connection settings
     url: str = 'http://127.0.0.1:8877/v1/chat/completions'  # URL for the API connection
@@ -32,6 +36,9 @@ class Arguments:
     wandb_api_key: Optional[str] = None  # WandB API key for logging
     name: Optional[str] = None  # Name for the run

+    # Output settings
+    outputs_dir: str = DEFAULT_WORK_DIR
+
     # Prompt settings
     max_prompt_length: int = sys.maxsize  # Maximum length of the prompt
     min_prompt_length: int = 0  # Minimum length of the prompt
@@ -57,7 +64,6 @@ class Arguments:

     @staticmethod
     def from_args(args):
-
         return Arguments(
             model=args.model,
             attn_implementation=args.attn_implementation,
@@ -72,6 +78,7 @@ class Arguments:
             headers=args.headers,
             wandb_api_key=args.wandb_api_key,
             name=args.name,
+            outputs_dir=args.outputs_dir,
             debug=args.debug,
             tokenizer_path=args.tokenizer_path,
             api=args.api,
@@ -98,6 +105,7 @@ class Arguments:
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+        self.model_id = os.path.basename(self.model)

     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
@@ -130,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):

     # Connection settings
     parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
+    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
     parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
     parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
     parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
@@ -152,6 +161,9 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')

+    # Output settings
+    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+
     # Dataset settings
     parser.add_argument('--dataset', type=str, default='openqa', help='Specify the dataset')
     parser.add_argument('--dataset-path', type=str, required=False, help='Path to the dataset file')
@@ -170,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--stream', action='store_true', help='Stream output with SSE', default=None)
     parser.add_argument('--temperature', type=float, help='The sample temperature', default=None)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
+
     # yapf: enable

evalscope/perf/benchmark.py CHANGED
@@ -1,16 +1,15 @@
 import asyncio
 import copy
+import json
+import numpy as np
 import os
 import platform
 import sqlite3
 import threading
 import time
 from http import HTTPStatus
-from typing import List
-
-import json
-import numpy as np
 from tqdm import tqdm
+from typing import List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.http_client import AioHttpClient, test_connection
@@ -138,17 +137,17 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     api_plugin_class = ApiRegistry(args.api)
     api_plugin = api_plugin_class(args.tokenizer_path)

-    result_db_path = get_result_db_path(args.name, args.model)
+    result_db_path = get_result_db_path(args)
     # Initialize wandb
     if args.wandb_api_key:
-        import wandb
         import datetime
+        import wandb
         os.environ['WANDB_SILENT'] = 'true'
-        os.environ['WANDB_DIR'] = './outputs'
+        os.environ['WANDB_DIR'] = args.outputs_dir

         wandb.login(key=args.wandb_api_key)
         current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
-        name = args.name if args.name else f'{args.model}_{current_time}'
+        name = args.name if args.name else f'{args.model_id}_{current_time}'
         wandb.init(project='perf_benchmark', name=name, config=args.to_dict())

     with sqlite3.connect(result_db_path) as con:
@@ -196,10 +195,9 @@ async def start_server(args: Arguments) -> bool:
     server.start()

     if args.dataset.startswith('speed_benchmark'):
-        args.url = 'http://127.0.0.1:8877/v1/completions'
+        args.url = f'http://127.0.0.1:{args.port}/v1/completions'
     else:
-        args.url = 'http://127.0.0.1:8877/v1/chat/completions'
-        args.model = os.path.basename(args.model)
+        args.url = f'http://127.0.0.1:{args.port}/v1/chat/completions'

     if not await test_connection(args):
         raise TimeoutError('Test connection failed')
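
Illustrative sketch (not part of the diff): how the new perf arguments above fit together. It assumes Arguments can be constructed directly with keyword arguments and that the model_id assignment shown in arguments.py runs during dataclass initialization; the port-based URL mirrors the rewritten start_server logic in benchmark.py.

# Hypothetical usage of the new 0.8.1 perf arguments (sketch only).
from evalscope.perf.arguments import Arguments

args = Arguments(
    model='/models/Qwen2-7B-Instruct',  # hypothetical local path
    api='local',                        # serve the model locally instead of calling a remote API
    port=8878,                          # new in 0.8.1: configurable port for the local server
    outputs_dir='./perf_outputs',       # new in 0.8.1: root directory for logs and the result DB
)
# If the model_id assignment runs in __post_init__, it is the basename of the model path:
print(args.model_id)  # expected: 'Qwen2-7B-Instruct'
# start_server() now derives the endpoint from args.port instead of a hard-coded 8877:
print(f'http://127.0.0.1:{args.port}/v1/chat/completions')
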
evalscope/perf/http_client.py CHANGED
@@ -1,12 +1,10 @@
+import aiohttp
 import asyncio
-import logging
+import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Tuple

-import aiohttp
-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.local_server import ServerSentEvent
 from evalscope.utils.logger import get_logger
@@ -21,7 +19,6 @@ class AioHttpClient:
         args: Arguments,
     ):
         self.url = args.url
-        self.debug = args.debug
         self.headers = {'user-agent': 'modelscope_bench', **(args.headers or {})}
         self.read_timeout = args.read_timeout
         self.connect_timeout = args.connect_timeout
@@ -31,9 +28,7 @@ class AioHttpClient:
                 connect=self.connect_timeout,
                 sock_read=self.read_timeout),
             connector=aiohttp.TCPConnector(limit=1),
-            trace_configs=[self._create_trace_config()] if self.debug else [])
-        if self.debug:
-            get_logger(log_level=logging.DEBUG)
+            trace_configs=[self._create_trace_config()] if args.debug else [])

     def _create_trace_config(self):
         trace_config = aiohttp.TraceConfig()
evalscope/perf/main.py CHANGED
@@ -1,11 +1,14 @@
 import asyncio
+import logging
+import os
 import platform
 from argparse import Namespace

 from evalscope.perf.arguments import Arguments, parse_args
 from evalscope.perf.benchmark import benchmark
+from evalscope.perf.utils.db_util import get_output_path
 from evalscope.perf.utils.handler import add_signal_handlers
-from evalscope.utils.logger import get_logger
+from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

 logger = get_logger()
@@ -18,6 +21,10 @@ def run_perf_benchmark(args):
     args = Arguments.from_args(args)
     seed_everything(args.seed)

+    # Setup logger and output
+    args.outputs_dir = get_output_path(args)
+    configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))
+
     logger.info('Starting benchmark...')
     logger.info(args)

evalscope/perf/plugin/api/custom_api.py CHANGED
@@ -1,7 +1,6 @@
-from typing import Any, Dict, Iterator, List
-
 import json
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
evalscope/perf/plugin/api/dashscope_api.py CHANGED
@@ -1,8 +1,7 @@
+import json
 import os
 from typing import Any, Dict, Iterator, List

-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
 from evalscope.perf.plugin.registry import register_api
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,8 +1,7 @@
-import os
-from typing import Any, Dict, Iterator, List
-
 import json
+import os
 from transformers import AutoTokenizer
+from typing import Any, Dict, Iterator, List, Union

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.base import ApiPluginBase
@@ -30,7 +29,7 @@ class OpenaiPlugin(ApiPluginBase):
         else:
             self.tokenizer = None

-    def build_request(self, messages: List[Dict] | str, param: Arguments) -> Dict:
+    def build_request(self, messages: Union[List[Dict], str], param: Arguments) -> Dict:
         """Build the openai format request based on prompt, dataset

         Args:
evalscope/perf/plugin/datasets/base.py CHANGED
@@ -1,9 +1,8 @@
+import json
 import sys
 from abc import abstractmethod
 from typing import Any, Dict, Iterator, List, Tuple

-import json
-
 from evalscope.perf.arguments import Arguments


evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -1,9 +1,8 @@
 import base64
 from io import BytesIO
-from typing import Any, Dict, Iterator, List
-
 from modelscope.msdatasets import MsDataset
 from PIL import Image
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
evalscope/perf/plugin/datasets/longalpaca.py CHANGED
@@ -1,6 +1,5 @@
-from typing import Any, Dict, Iterator, List
-
 from modelscope import MsDataset
+from typing import Any, Dict, Iterator, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
evalscope/perf/plugin/datasets/openqa.py CHANGED
@@ -1,8 +1,7 @@
+import json
 import subprocess
 from typing import Any, Dict, Iterator, List

-import json
-
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.datasets.base import DatasetPluginBase
 from evalscope.perf.plugin.registry import register_dataset
evalscope/perf/plugin/registry.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Any, List, Type
+from typing import Any, List, Type, Union


 class PluginRegistry:
@@ -20,7 +20,7 @@ class PluginRegistry:
         return self.get_class(name)


-def register_dataset(name: str | List[str]):
+def register_dataset(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
@@ -35,7 +35,7 @@ def register_dataset(name: str | List[str]):
     return class_decorator


-def register_api(name: str | List[str]):
+def register_api(name: Union[str, List[str]]):

     def class_decorator(cls: Type):
         if isinstance(name, str):
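
For context, a hedged sketch of how these registry decorators are applied (the plugin names and classes below are hypothetical, not from the diff). Switching the annotations to Union[str, List[str]] keeps the module importable on Python < 3.10, where the `str | List[str]` syntax raises a TypeError when the function is defined.

# Hypothetical registration sketch; real plugins subclass DatasetPluginBase / ApiPluginBase.
from evalscope.perf.plugin.registry import register_api, register_dataset


@register_dataset(['my_openqa', 'my_openqa_zh'])  # accepts a single name or a list of aliases
class MyDatasetPlugin:
    """Placeholder dataset plugin used only to illustrate the decorator."""


@register_api('my_api')
class MyApiPlugin:
    """Placeholder API plugin used only to illustrate the decorator."""
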
evalscope/perf/utils/analysis_result.py CHANGED
@@ -1,9 +1,8 @@
 import base64
+import json
 import pickle
 import sqlite3

-import json
-
 result_db_path = '/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/qwen2.5_benchmark_20241111_160543.db'
 con = sqlite3.connect(result_db_path)
 query_sql = "SELECT request, response_messages, prompt_tokens, completion_tokens \
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -1,9 +1,8 @@
 import time
+import torch
 from dataclasses import dataclass, field
 from typing import Any, List, Optional, Tuple

-import torch
-
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -117,19 +116,19 @@ class BenchmarkMetrics:

     def create_message(self, default_ndigits=3):
         message = {
-            'Time taken for tests (senconds)': round(self.total_time, default_ndigits),
+            'Time taken for tests (s)': round(self.total_time, default_ndigits),
             'Number of concurrency': self.concurrency,
             'Total requests': int(self.n_total_queries),
             'Succeed requests': self.n_succeed_queries,
             'Failed requests': self.n_failed_queries,
+            'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
             'Average QPS': round(self.qps, default_ndigits),
             'Average latency (s)': round(self.avg_latency, default_ndigits),
             'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
             'Average time per output token (s)': round(self.avg_time_per_token, 5),
-            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
-            'Average package per request': round(self.n_avg_chunks, default_ndigits),
-            'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits),
             'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits),
             'Average output tokens per request': round(self.avg_completion_tokens, default_ndigits),
+            'Average package latency (s)': round(self.avg_chunk_time, default_ndigits),
+            'Average package per request': round(self.n_avg_chunks, default_ndigits),
         }
         return message
evalscope/perf/utils/db_util.py CHANGED
@@ -1,12 +1,12 @@
 import base64
+import json
 import os
 import pickle
 import sqlite3
 import sys
 from datetime import datetime
-
-import json
 from tabulate import tabulate
+from typing import Dict, List

 from evalscope.perf.arguments import Arguments
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
@@ -88,15 +88,19 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)
     cursor.execute(query, common_columns)


-def get_result_db_path(name, model):
+def get_output_path(args: Arguments) -> str:
     current_time = datetime.now().strftime('%Y%m%d_%H%M%S')
-    output_dir = './outputs'
-    result_db_path = os.path.join(output_dir, f'{name or model}_perf', current_time, 'benchmark_data.db')
+    output_path = os.path.join(args.outputs_dir, current_time, f'{args.name or args.model_id}')
+    if not os.path.exists(output_path):
+        os.makedirs(output_path, exist_ok=True)
+    logger.info(f'Save the result to: {output_path}')
+    return output_path
+

-    if not os.path.exists(os.path.dirname(result_db_path)):
-        os.makedirs(os.path.dirname(result_db_path), exist_ok=True)
+def get_result_db_path(args: Arguments):
+    result_db_path = os.path.join(args.outputs_dir, 'benchmark_data.db')

-    logger.info(f'Save the result to: {result_db_path}')
+    logger.info(f'Save the data base to: {result_db_path}')
     if os.path.exists(result_db_path):
         logger.warning('The db file exists, delete it and start again!.')
         sys.exit(1)
@@ -104,44 +108,87 @@ def get_result_db_path(name, model):
     return result_db_path


-def get_percentile_results(result_db_path: str):
+def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
+    """
+    Calculate the percentiles for a specific list of data.

-    def percentile_results(rows, index, percentiles):
-        results = {}
-        n_success_queries = len(rows)
-        for percentile in percentiles:
+    :param data: List of values for a specific metric.
+    :param percentiles: List of percentiles to calculate.
+    :return: Dictionary of calculated percentiles.
+    """
+    results = {}
+    n_success_queries = len(data)
+    data.sort()
+    for percentile in percentiles:
+        try:
             idx = int(n_success_queries * percentile / 100)
-            row = rows[idx]
-            value = row[index] if row[index] is not None else float('inf')
+            value = data[idx] if data[idx] is not None else float('nan')
             results[percentile] = round(value, 4)
-        return results
+        except IndexError:
+            results[percentile] = float('nan')
+    return results
+
+
+def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
+    """
+    Compute and return quantiles for various metrics from the database results.
+
+    :param result_db_path: Path to the SQLite database file.
+    :return: Dictionary of percentiles for various metrics.
+    """
+
+    def inter_token_latencies(chunk_times_json: str) -> List[float]:
+        try:
+            chunk_times = json.loads(chunk_times_json)
+            return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]
+        except (json.JSONDecodeError, TypeError) as e:
+            logger.error(f'Error parsing chunk times: {e}')
+            return []

     query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, '
                  'n_chunks, chunk_time, prompt_tokens, completion_tokens '
-                 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC')
+                 'FROM result WHERE success=1')
+
     percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99]

     with sqlite3.connect(result_db_path) as con:
         rows = con.execute(query_sql).fetchall()

-    if len(rows) <= len(percentiles):
+    if len(rows) < len(percentiles):
         logger.info('Too little data to calculate quantiles!')
         return {}

-    # Calculate percentiles for first chunk latency and latency
-    first_chunk_latency_index = 5
-    latency_index = 4
+    # Define index variables for columns
+    CHUNK_TIMES_INDEX = 1
+    LATENCY_INDEX = 4
+    FIRST_CHUNK_LATENCY_INDEX = 5
+    PROMPT_TOKENS_INDEX = 8
+    COMPLETION_TOKENS_INDEX = 9
+
+    # Prepare data for each metric
+    inter_token_latencies_all = []
+    for row in rows:
+        inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX]))
+
+    metrics = {
+        'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
+        'TPOT (s)':
+        inter_token_latencies_all,
+        'Latency (s)': [row[LATENCY_INDEX] for row in rows],
+        'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
+        'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
+        'Throughput(tokens/s)':
+        [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
+         for row in rows]
+    }

-    first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles)
-    rows.sort(key=lambda x: x[latency_index])
-    latency_results = percentile_results(rows, latency_index, percentiles)
+    # Calculate percentiles for each metric
+    results = {'Percentile': [f'{p}%' for p in percentiles]}
+    for metric_name, data in metrics.items():
+        metric_percentiles = calculate_percentiles(data, percentiles)
+        results[metric_name] = [metric_percentiles[p] for p in percentiles]

-    # Prepare data for tabulation
-    return {
-        'Percentile': [f'{p}%' for p in percentiles],
-        'First Chunk Latency (s)': [first_chunk_latency_results[p] for p in percentiles],
-        'Latency (s)': [latency_results[p] for p in percentiles]
-    }
+    return results


 def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str):
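
A standalone worked example of the per-metric percentile logic above (sketch only, not an evalscope import): TPOT values are the gaps between consecutive chunk arrival times, and each metric's percentile is the value at index n * p / 100 of its sorted list.

# Minimal, self-contained sketch of the 0.8.1 percentile computation.
import json
from typing import Dict, List


def inter_token_latencies(chunk_times_json: str) -> List[float]:
    # Gaps between consecutive chunk arrival times -> per-token latencies (TPOT)
    chunk_times = json.loads(chunk_times_json)
    return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])]


def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]:
    # Same index rule as above: the value at position n * p / 100 of the sorted list
    data = sorted(data)
    n = len(data)
    return {p: round(data[int(n * p / 100)], 4) for p in percentiles}


tpot = inter_token_latencies('[0.00, 0.12, 0.25, 0.41, 0.60]')
print(calculate_percentiles(tpot, [50, 90]))  # -> {50: 0.16, 90: 0.19}
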
evalscope/perf/utils/local_server.py CHANGED
@@ -1,16 +1,15 @@
 import os
 import subprocess
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-
 import torch
 import uvicorn
+from contextlib import asynccontextmanager
+from dataclasses import dataclass
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from sse_starlette.sse import EventSourceResponse

 from evalscope.perf.arguments import Arguments
-from evalscope.perf.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
+from evalscope.utils.chat_service import ChatCompletionRequest, ChatService, ModelList, TextCompletionRequest
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -66,9 +65,9 @@ async def lifespan(app: FastAPI):
     torch.cuda.empty_cache()


-def create_app(args) -> FastAPI:
+def create_app(model, attn_implementation=None) -> FastAPI:
     app = FastAPI(lifespan=lifespan)
-    chat_service = ChatService(model_path=args.model, attn_implementation=args.attn_implementation)
+    chat_service = ChatService(model_path=model, attn_implementation=attn_implementation)

     app.add_middleware(
         CORSMiddleware,
@@ -98,18 +97,27 @@ def create_app(args) -> FastAPI:

 def start_app(args: Arguments):
     if args.api == 'local':
-        app = create_app(args)
-        uvicorn.run(app, host='0.0.0.0', port=8877, workers=1)
+        app = create_app(args.model, args.attn_implementation)
+        uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
-
+        os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        # yapf: disable
         proc = subprocess.Popen([
-            'python', '-m', 'vllm.entrypoints.openai.api_server', '--model', args.model, '--served-model-name',
-            os.path.basename(args.model), '--tensor-parallel-size',
-            str(torch.cuda.device_count()), '--max-model-len', '32768', '--gpu-memory-utilization', '0.9', '--host',
-            '0.0.0.0', '--port', '8877', '--disable-log-requests', '--disable-log-stats'
+            'python', '-m', 'vllm.entrypoints.openai.api_server',
+            '--model', args.model,
+            '--served-model-name', args.model,
+            '--tensor-parallel-size', str(torch.cuda.device_count()),
+            '--max-model-len', '32768',
+            '--gpu-memory-utilization', '0.9',
+            '--host', '0.0.0.0',
+            '--port', str(args.port),
+            '--trust-remote-code',
+            '--disable-log-requests',
+            '--disable-log-stats',
         ])
+        # yapf: enable
         import atexit

         def on_exit():
evalscope/registry/config/cfg_arena_zhihu.yaml CHANGED
@@ -21,7 +21,7 @@ answers_gen:
     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
     revision: NULL # revision of model, default is NULL
     precision: torch.float16
-    enable: true # enable or disable this model
+    enable: true # enable or disable this model
     template_type: default-generation
     generation_config:
       do_sample: true
evalscope/registry/tasks/arc.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -22,8 +22,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - arc
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null
evalscope/registry/tasks/bbh.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/registry/tasks/bbh_mini.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - bbh
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/registry/tasks/ceval.yaml CHANGED
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision: default
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub: ModelScope # `Local` or `ModelScope`
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null