evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0

evalscope/perf/benchmark.py
@@ -189,7 +189,8 @@ async def benchmark(args: Arguments) -> Tuple[Dict, Dict]:
  await connect_test(args, api_plugin)
  # start statistic benchmark metric
  statistic_benchmark_metric_task = asyncio.create_task(
- statistic_benchmark_metric(benchmark_data_queue, args, api_plugin))
+ statistic_benchmark_metric(benchmark_data_queue, args, api_plugin)
+ )
  # start send request
  semaphore = asyncio.Semaphore(args.parallel)
  send_request_tasks: List[asyncio.Task] = []

evalscope/perf/http_client.py
@@ -26,7 +26,8 @@ class AioHttpClient:
  self.api_plugin = api_plugin
  self.client = aiohttp.ClientSession(
  timeout=aiohttp.ClientTimeout(connect=self.connect_timeout, sock_read=self.read_timeout),
- trace_configs=[self._create_trace_config()] if args.debug else [])
+ trace_configs=[self._create_trace_config()] if args.debug else []
+ )

  async def __aenter__(self):
  pass
@@ -105,7 +106,8 @@ async def test_connection(args: Arguments, api_plugin: 'ApiPluginBase') -> bool:
  while True:
  try:
  is_error, state_code, response_data = await asyncio.wait_for(
- attempt_connection(), timeout=args.connect_timeout)
+ attempt_connection(), timeout=args.connect_timeout
+ )
  if not is_error:
  logger.info('Test connection successful.')
  return True

evalscope/perf/plugin/api/custom_api.py
@@ -153,7 +153,8 @@ class CustomPlugin(ApiPluginBase):

  # If no usage information and no tokenizer, raise an error
  raise ValueError(
- 'Cannot determine token counts: no usage information in response and no tokenizer provided.')
+ 'Cannot determine token counts: no usage information in response and no tokenizer provided.'
+ )

  except Exception as e:
  logger.error(f'Error parsing responses: {e}')
@@ -186,8 +187,7 @@ class CustomPlugin(ApiPluginBase):
  data = json.dumps(body, ensure_ascii=False)

  # Send the request
- async with client_session.request(
- 'POST', url=url, data=data, headers=headers) as response: # noqa: E125
+ async with client_session.request('POST', url=url, data=data, headers=headers) as response: # noqa: E125
  # Get the status code
  status_code = response.status

@@ -244,6 +244,7 @@ if __name__ == '__main__':
  api='custom', # Use the custom API plugin registered above
  dataset='openqa',
  number=1,
- max_tokens=10)
+ max_tokens=10
+ )

  run_perf_benchmark(args)

evalscope/perf/plugin/api/openai_api.py
@@ -159,13 +159,15 @@ class OpenaiPlugin(DefaultApiPlugin):
  input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
  output_tokens += len(self.tokenizer.encode(full_response_content))
  else:
- raise ValueError('Error: Unable to retrieve usage information\n\n'
- 'This error occurs when:\n'
- '1. The API response does not contain usage data, AND\n'
- '2. No tokenizer has been specified or found.\n\n'
- 'To resolve this issue, do ONE of the following:\n'
- "a) Ensure that the API you're using supports and returns usage information, OR\n"
- 'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
- 'If you continue to experience issues, '
- 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .')
+ raise ValueError(
+ 'Error: Unable to retrieve usage information\n\n'
+ 'This error occurs when:\n'
+ '1. The API response does not contain usage data, AND\n'
+ '2. No tokenizer has been specified or found.\n\n'
+ 'To resolve this issue, do ONE of the following:\n'
+ "a) Ensure that the API you're using supports and returns usage information, OR\n"
+ 'b) Specify a tokenizer using the `--tokenizer-path` parameter.\n\n'
+ 'If you continue to experience issues, '
+ 'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
+ )
  return input_tokens, output_tokens

evalscope/perf/plugin/datasets/custom.py
@@ -17,7 +17,8 @@ class CustomDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/flickr8k.py
@@ -24,5 +24,5 @@ class FlickrDatasetPlugin(DatasetPluginBase):
  text = item['txt']
  base64_image = PIL_to_base64(pil_image)

- message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
  yield [message]

evalscope/perf/plugin/datasets/kontext_bench.py
@@ -24,5 +24,5 @@ class KontextDatasetPlugin(DatasetPluginBase):
  text = item['instruction']
  base64_image = PIL_to_base64(pil_image)

- message = self.create_message(text=text, image_url=f'data:image/jpeg;base64,{base64_image}')
+ message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
  yield [message]

evalscope/perf/plugin/datasets/line_by_line.py
@@ -18,7 +18,8 @@ class LineByLineDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  prompt = item.strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/longalpaca.py
@@ -23,7 +23,8 @@ class LongAlpacaDatasetPlugin(DatasetPluginBase):
  for item in ds:
  prompt = item['instruction'].strip()
  if len(prompt) > self.query_parameters.min_prompt_length and len(
- prompt) < self.query_parameters.max_prompt_length:
+ prompt
+ ) < self.query_parameters.max_prompt_length:
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/plugin/datasets/openqa.py
@@ -27,8 +27,10 @@ class OpenqaDatasetPlugin(DatasetPluginBase):
  for item in self.dataset_line_by_line(self.query_parameters.dataset_path):
  item = json.loads(item)
  prompt = item['question'].strip()
- if (len(prompt) > self.query_parameters.min_prompt_length
- and len(prompt) < self.query_parameters.max_prompt_length):
+ if (
+ len(prompt) > self.query_parameters.min_prompt_length
+ and len(prompt) < self.query_parameters.max_prompt_length
+ ):
  if self.query_parameters.apply_chat_template:
  message = self.create_message(prompt)
  yield [message]

evalscope/perf/utils/benchmark_util.py
@@ -33,8 +33,8 @@ class BenchmarkData:
  if len(self.chunk_times) > 1:
  self.first_chunk_latency = self.chunk_times[0] - self.start_time
  # remove the first chunk time from the total latency
- self.time_per_output_token = (self.query_latency - self.first_chunk_latency) / (
- self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
+ self.time_per_output_token = (self.query_latency - self.first_chunk_latency
+ ) / (self.completion_tokens - 1) if self.completion_tokens > 1 else 0.0
  self.inter_chunk_latency = [t2 - t1 for t1, t2 in zip(self.chunk_times[:-1], self.chunk_times[1:])]
  else:
  self.first_chunk_latency = self.query_latency
@@ -126,11 +126,13 @@ class BenchmarkMetrics:
  self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
  self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
  self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
- self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
- + self.n_total_completion_tokens) / self.total_time
+ self.avg_total_token_per_seconds = (
+ self.n_total_prompt_tokens + self.n_total_completion_tokens
+ ) / self.total_time
  self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
  self.avg_inter_token_latency = sum(self.n_total_inter_token_latency) / len(
- self.n_total_inter_token_latency) if self.n_total_inter_token_latency else 0.0
+ self.n_total_inter_token_latency
+ ) if self.n_total_inter_token_latency else 0.0
  self.qps = self.n_succeed_queries / self.total_time
  except ZeroDivisionError as e:
  logger.exception(e)

evalscope/perf/utils/db_util.py
@@ -56,7 +56,8 @@ def transpose_results(data):


  def create_result_table(cursor):
- cursor.execute(f'''CREATE TABLE IF NOT EXISTS result(
+ cursor.execute(
+ f'''CREATE TABLE IF NOT EXISTS result(
  {DatabaseColumns.REQUEST} TEXT,
  {DatabaseColumns.START_TIME} REAL,
  {DatabaseColumns.CHUNK_TIMES} TEXT,
@@ -69,7 +70,8 @@ def create_result_table(cursor):
  {DatabaseColumns.COMPLETION_TOKENS} INTEGER,
  {DatabaseColumns.MAX_GPU_MEMORY_COST} REAL,
  {DatabaseColumns.TIME_PER_OUTPUT_TOKEN} REAL
- )''')
+ )'''
+ )


  def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData):
@@ -89,9 +91,10 @@ def insert_benchmark_data(cursor: sqlite3.Cursor, benchmark_data: BenchmarkData)

  if benchmark_data.success:
  # Add additional columns for success case
- additional_columns = (benchmark_data.query_latency, benchmark_data.first_chunk_latency,
- benchmark_data.prompt_tokens, benchmark_data.completion_tokens,
- benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token)
+ additional_columns = (
+ benchmark_data.query_latency, benchmark_data.first_chunk_latency, benchmark_data.prompt_tokens,
+ benchmark_data.completion_tokens, benchmark_data.max_gpu_memory_cost, benchmark_data.time_per_output_token
+ )
  query = f"""INSERT INTO result(
  {DatabaseColumns.REQUEST}, {DatabaseColumns.START_TIME}, {DatabaseColumns.CHUNK_TIMES},
  {DatabaseColumns.SUCCESS}, {DatabaseColumns.RESPONSE_MESSAGES}, {DatabaseColumns.COMPLETED_TIME},
@@ -124,7 +127,7 @@ def get_result_db_path(args: Arguments):

  logger.info(f'Save the data base to: {result_db_path}')
  if os.path.exists(result_db_path):
- logger.warning('The db file exists, delete it and start again!.')
+ logger.error(f'The db file {result_db_path} exists, delete it and start again!.')
  sys.exit(1)

  return result_db_path

evalscope/perf/utils/local_server.py
@@ -1,6 +1,5 @@
  import os
  import subprocess
- import torch
  import uvicorn
  from contextlib import asynccontextmanager
  from dataclasses import dataclass
@@ -61,8 +60,12 @@ class ServerSentEvent(object):
  @asynccontextmanager
  async def lifespan(app: FastAPI):
  yield
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
+ try:
+ import torch
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ except ImportError:
+ pass


  def create_app(model, attn_implementation=None) -> FastAPI:
@@ -102,6 +105,8 @@ def start_app(args: Arguments):
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)

  elif args.api == 'local_vllm':
+ import torch
+
  os.environ['VLLM_USE_MODELSCOPE'] = 'True'
  os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
  os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

evalscope/perf/utils/rich_display.py
@@ -32,8 +32,9 @@ def analyze_results(all_results):
  avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
  avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
  p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
- success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
- / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+ success_rate = (
+ total_metrics.get(Metrics.SUCCEED_REQUESTS, 0) / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)
+ ) * 100
  avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
  p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]

@@ -55,12 +56,13 @@ def analyze_results(all_results):
  f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
  ])

- total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
- Metrics.SUCCEED_REQUESTS, 0)
+ total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST,
+ 0) * total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
  total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
  except Exception as e:
  logger.warning(
- f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+ f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}"
+ )
  continue

  if not summary:
@@ -138,7 +140,8 @@ def print_summary(all_results, model_name):
  f'{float(row[8]):.3f}', # Average TPOT
  f'{float(row[9]):.3f}', # P99 TPOT
  row[6], # Success Rate
- style=row_style)
+ style=row_style
+ )
  except ValueError as e:
  console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
  continue
@@ -156,8 +159,9 @@ def print_summary(all_results, model_name):
  perf_info.add_column('Value', style='green', width=40)

  perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
- perf_info.add_row('Lowest Latency',
- f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+ perf_info.add_row(
+ 'Lowest Latency', f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)'
+ )

  console.print('\n')
  console.print(perf_info)
@@ -166,7 +170,8 @@ def print_summary(all_results, model_name):
  recommendations = []
  if best_rps_idx == len(summary) - 1:
  recommendations.append(
- 'The system seems not to have reached its performance bottleneck, try higher concurrency')
+ 'The system seems not to have reached its performance bottleneck, try higher concurrency'
+ )
  elif best_rps_idx == 0:
  recommendations.append('Consider lowering concurrency, current load may be too high')
  else:
@@ -175,7 +180,8 @@ def print_summary(all_results, model_name):
  success_rate = float(summary[-1][6][:-1])
  if success_rate < 95:
  recommendations.append(
- 'Success rate is low at high concurrency, check system resources or reduce concurrency')
+ 'Success rate is low at high concurrency, check system resources or reduce concurrency'
+ )

  recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
  console.print(recommend_text)

evalscope/report/__init__.py
@@ -6,7 +6,7 @@ from evalscope.utils.import_utils import _LazyModule
  if TYPE_CHECKING:
  from .combinator import gen_table, get_data_frame, get_report_list
  from .generator import ReportGenerator
- from .utils import Category, Report, ReportKey, Subset
+ from .report import Category, Report, ReportKey, Subset

  else:
  _import_structure = {
@@ -19,7 +19,7 @@ else:
  'generator': [
  'ReportGenerator',
  ],
- 'utils': [
+ 'report': [
  'Category',
  'Report',
  'ReportKey',

evalscope/report/combinator.py
@@ -6,7 +6,7 @@ import pandas as pd
  from tabulate import tabulate
  from typing import List, Tuple

- from evalscope.report.utils import Report
+ from evalscope.report.report import Report
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -32,25 +32,30 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:
  return report_list


- def get_data_frame(report_list: List[Report],
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> pd.DataFrame:
+ def get_data_frame(
+ report_list: List[Report],
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> pd.DataFrame:
  tables = []
  for report in report_list:
  df = report.to_dataframe(
  flatten_metrics=flatten_metrics,
  flatten_categories=flatten_categories,
- add_overall_metric=add_overall_metric)
+ add_overall_metric=add_overall_metric
+ )
  tables.append(df)
  return pd.concat(tables, ignore_index=True)


- def gen_table(reports_path_list: list[str] = None,
- report_list: list[Report] = None,
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> str:
+ def gen_table(
+ reports_path_list: list[str] = None,
+ report_list: list[Report] = None,
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> str:
  """
  Generates a formatted table from a list of report paths or Report objects.

@@ -78,7 +83,8 @@ def gen_table(reports_path_list: list[str] = None,
  report_list,
  flatten_metrics=flatten_metrics,
  flatten_categories=flatten_categories,
- add_overall_metric=add_overall_metric)
+ add_overall_metric=add_overall_metric
+ )
  return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


evalscope/report/generator.py
@@ -3,16 +3,18 @@ from pandas import DataFrame
  from typing import TYPE_CHECKING

  from evalscope.constants import DataCollection
- from evalscope.report.utils import *
+ from evalscope.report.report import *

  if TYPE_CHECKING:
- from evalscope.benchmarks import DataAdapter
+ from evalscope.api.benchmark import DataAdapter
+ from evalscope.api.metric import AggScore
+ from evalscope.benchmarks import DataAdapter as OldDataAdapter


  class ReportGenerator:

  @staticmethod
- def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
+ def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'OldDataAdapter', **kwargs) -> Report:
  """
  Generate a report for a specific dataset based on provided subset scores.

@@ -59,7 +61,9 @@ class ReportGenerator:
  score=score_item['score'],
  num=score_item['num'],
  metric_name=score_item['metric_name'],
- categories=tuple(categories)))
+ categories=tuple(categories)
+ )
+ )
  df = pd.DataFrame(subsets)
  return df

@@ -83,7 +87,8 @@ class ReportGenerator:
  dataset_name=dataset_name,
  model_name=model_name,
  dataset_description=data_adapter.description,
- dataset_pretty_name=data_adapter.pretty_name)
+ dataset_pretty_name=data_adapter.pretty_name
+ )
  return report

  @staticmethod
@@ -101,4 +106,94 @@ class ReportGenerator:
  name=DataCollection.NAME,
  metrics=[Metric(name='Average', categories=categories)],
  dataset_name=all_dataset_name,
- model_name=model_name)
+ model_name=model_name
+ )
+
+ @staticmethod
+ def generate_report(
+ score_dict: Dict[str, List['AggScore']],
+ model_name: str,
+ data_adapter: 'DataAdapter',
+ add_aggregation_name: bool = True
+ ) -> Report:
+ """
+ Generate a report for a specific dataset based on provided subset scores.
+
+ Args:
+ subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+ ```
+ {
+ 'subset_name': [
+ AggScore={'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+ AggScore={'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+ ],
+ ...
+ }
+ ```
+ data_adapter (DataAdapter): An adapter object for data handling.
+
+ Returns:
+ Report: A structured report object containing metrics, categories, and subsets.
+
+ >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
+ """ # noqa: E501
+
+ dataset_name = data_adapter.name
+ category_map = data_adapter.category_map
+ report_name = f'{model_name}@{dataset_name}'
+
+ def flatten_subset() -> DataFrame:
+ """
+ Flatten subset score map to a DataFrame.
+
+ Example:
+ name score num categories metric_name
+ 0 ARC-Easy 0.5 2 [default] AverageAccuracy
+ 1 ARC-Challenge 0.5 2 [default] AverageAccuracy
+ """
+ subsets = []
+ for subset_name, agg_scores in score_dict.items():
+ for agg_score_item in agg_scores:
+ categories = category_map.get(subset_name, ['default'])
+ if add_aggregation_name and agg_score_item.aggregation_name:
+ metric_name = f'{agg_score_item.aggregation_name}_{agg_score_item.metric_name}'
+ else:
+ metric_name = agg_score_item.metric_name
+
+ if isinstance(categories, str):
+ categories = [categories]
+ subsets.append(
+ dict(
+ name=subset_name,
+ score=agg_score_item.score,
+ num=agg_score_item.num,
+ metric_name=metric_name,
+ categories=tuple(categories)
+ )
+ )
+ df = pd.DataFrame(subsets)
+ return df
+
+ df = flatten_subset()
+
+ metrics_list = []
+ for metric_name, group_metric in df.groupby('metric_name', sort=False):
+ categories = []
+ for category_name, group_category in group_metric.groupby('categories'):
+ subsets = []
+ for _, row in group_category.iterrows():
+ subsets.append(Subset(name=row['name'], score=row['score'], num=row['num']))
+
+ categories.append(Category(name=category_name, subsets=subsets))
+
+ metrics_list.append(Metric(name=metric_name, categories=categories))
+
+ report = Report(
+ name=report_name,
+ metrics=metrics_list,
+ dataset_name=dataset_name,
+ model_name=model_name,
+ dataset_description=data_adapter.description,
+ dataset_pretty_name=data_adapter.pretty_name
+ )
+ return report

evalscope/report/{utils.py → report.py}
@@ -152,10 +152,12 @@ class Report:
  data = json.load(f)
  return cls.from_dict(data)

- def to_dataframe(self,
- flatten_metrics: bool = True,
- flatten_categories: bool = True,
- add_overall_metric: bool = False) -> pd.DataFrame:
+ def to_dataframe(
+ self,
+ flatten_metrics: bool = True,
+ flatten_categories: bool = True,
+ add_overall_metric: bool = False
+ ) -> pd.DataFrame:
  """
  Convert the report to a pandas DataFrame.
  Args:
@@ -201,8 +203,8 @@ class Report:
  # multi-level aggregation for categories
  max_depth = df_categories[ReportKey.category_name].apply(len).max()
  for level in range(max_depth):
- df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[ReportKey.category_name].apply(
- lambda x: x[level] if len(x) > level else None)
+ df_categories[f'{ReportKey.category_prefix}{level}'] = df_categories[
+ ReportKey.category_name].apply(lambda x: x[level] if len(x) > level else None)

  df_categories.drop(columns=[ReportKey.category_name], inplace=True)
  return df_categories