evalscope 0.13.2__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (214)
  1. evalscope/arguments.py +2 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +4 -4
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/__init__.py +2 -2
  12. evalscope/benchmarks/aigc/__init__.py +0 -0
  13. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  14. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. evalscope/benchmarks/data_adapter.py +21 -10
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope/benchmarks/live_code_bench/testing_util.py +369 -550
  37. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  38. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  39. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  41. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  42. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  43. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  44. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  45. evalscope/benchmarks/utils.py +7 -16
  46. evalscope/cli/start_app.py +1 -1
  47. evalscope/collections/evaluator.py +20 -6
  48. evalscope/config.py +8 -4
  49. evalscope/constants.py +11 -0
  50. evalscope/evaluator/evaluator.py +2 -2
  51. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  52. evalscope/metrics/__init__.py +49 -4
  53. evalscope/metrics/llm_judge.py +1 -1
  54. evalscope/metrics/named_metrics.py +13 -0
  55. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  56. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  57. evalscope/metrics/t2v_metrics/constants.py +12 -0
  58. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  59. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  60. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  61. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  62. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  63. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  64. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  65. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  66. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  67. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  68. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  69. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  70. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  71. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  72. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  73. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  74. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  75. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  76. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  77. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  139. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  140. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  141. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  142. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  143. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  144. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  145. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  146. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  147. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  148. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  149. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  154. evalscope/metrics/t2v_metrics/score.py +78 -0
  155. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  156. evalscope/models/__init__.py +50 -14
  157. evalscope/models/adapters/__init__.py +17 -0
  158. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  159. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  160. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  161. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  162. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  163. evalscope/models/adapters/t2i_adapter.py +76 -0
  164. evalscope/models/custom/__init__.py +2 -1
  165. evalscope/models/custom/dummy_model.py +11 -13
  166. evalscope/models/local_model.py +82 -33
  167. evalscope/models/model.py +2 -42
  168. evalscope/models/register.py +26 -0
  169. evalscope/perf/arguments.py +24 -5
  170. evalscope/perf/benchmark.py +28 -42
  171. evalscope/perf/http_client.py +2 -3
  172. evalscope/perf/plugin/api/custom_api.py +1 -1
  173. evalscope/perf/plugin/api/openai_api.py +2 -2
  174. evalscope/perf/plugin/datasets/custom.py +4 -1
  175. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  176. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  177. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  178. evalscope/perf/plugin/datasets/openqa.py +4 -1
  179. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  180. evalscope/perf/utils/benchmark_util.py +14 -8
  181. evalscope/perf/utils/db_util.py +9 -3
  182. evalscope/perf/utils/log_utils.py +41 -0
  183. evalscope/report/__init__.py +1 -0
  184. evalscope/report/app.py +128 -78
  185. evalscope/report/app_arguments.py +11 -0
  186. evalscope/report/generator.py +1 -1
  187. evalscope/run.py +10 -3
  188. evalscope/summarizer.py +2 -1
  189. evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope/utils/chat_service.py +2 -2
  191. evalscope/utils/import_utils.py +66 -0
  192. evalscope/utils/utils.py +48 -29
  193. evalscope/version.py +2 -2
  194. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/METADATA +37 -15
  195. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/RECORD +209 -96
  196. tests/aigc/__init__.py +1 -0
  197. tests/aigc/test_t2i.py +87 -0
  198. tests/cli/test_all.py +4 -4
  199. tests/cli/test_collection.py +2 -1
  200. tests/cli/test_run.py +19 -12
  201. tests/perf/test_perf.py +3 -3
  202. tests/rag/test_clip_benchmark.py +0 -1
  203. tests/rag/test_mteb.py +37 -8
  204. tests/rag/test_ragas.py +29 -26
  205. tests/vlm/test_vlmeval.py +37 -1
  206. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  207. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  208. evalscope/metrics/code_metric.py +0 -98
  209. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  210. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  211. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  212. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  213. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  214. {evalscope-0.13.2.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -11,7 +11,7 @@ logger = get_logger()
  @dataclass
  class BenchmarkData:
  request: Any = None
- start_time: float = field(default_factory=time.perf_counter)
+ start_time: float = 0.0
  completed_time: float = 0.0
  chunk_times: List[float] = field(default_factory=list)
  success: bool = False
@@ -32,13 +32,13 @@ class BenchmarkData:
  self.query_latency = self.completed_time - self.start_time
  if len(self.chunk_times) > 1:
  self.first_chunk_latency = self.chunk_times[0] - self.start_time
- self.n_chunks = len(self.chunk_times) - 2
+ self.n_chunks = len(self.chunk_times) - 2 # remove last and first chunk
  self.n_chunks_time = self.chunk_times[-2] - self.chunk_times[0]
  else:
  self.first_chunk_latency = self.query_latency
  self.n_chunks = 1
  self.n_chunks_time = self.query_latency
- self.time_per_output_token = self.query_latency / self.completion_tokens
+ self.time_per_output_token = self.n_chunks_time / self.completion_tokens

  def _calculate_tokens(self, api_plugin):
  self.prompt_tokens, self.completion_tokens = \
@@ -73,7 +73,9 @@ class BenchmarkMetrics:
  avg_chunk_time: float = -1
  avg_prompt_tokens: float = -1
  avg_completion_tokens: float = -1
- avg_token_per_seconds: float = -1
+ avg_input_token_per_seconds: float = -1
+ avg_output_token_per_seconds: float = -1
+ avg_total_token_per_seconds: float = -1
  avg_time_per_token: float = -1
  qps: float = -1

@@ -111,22 +113,26 @@ class BenchmarkMetrics:
  self.avg_chunk_time = self.total_chunks_time / self.n_total_chunks
  self.avg_prompt_tokens = self.n_total_prompt_tokens / self.n_succeed_queries
  self.avg_completion_tokens = self.n_total_completion_tokens / self.n_succeed_queries
- self.avg_token_per_seconds = self.n_total_completion_tokens / self.total_time
+ self.avg_input_token_per_seconds = self.n_total_prompt_tokens / self.total_first_chunk_latency
+ self.avg_output_token_per_seconds = self.n_total_completion_tokens / self.total_time
+ self.avg_total_token_per_seconds = (self.n_total_prompt_tokens
+ + self.n_total_completion_tokens) / self.total_time
  self.avg_time_per_token = self.n_time_per_output_token / self.n_succeed_queries
  self.qps = self.n_succeed_queries / self.total_time
  except ZeroDivisionError as e:
  logger.exception(e)
  return

- def create_message(self, default_ndigits=3):
+ def create_message(self, default_ndigits=4):
  message = {
  'Time taken for tests (s)': round(self.total_time, default_ndigits),
  'Number of concurrency': self.concurrency,
  'Total requests': int(self.n_total_queries),
  'Succeed requests': self.n_succeed_queries,
  'Failed requests': self.n_failed_queries,
- 'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits),
- 'Average QPS': round(self.qps, default_ndigits),
+ 'Output token throughput (tok/s)': round(self.avg_output_token_per_seconds, default_ndigits),
+ 'Total token throughput (tok/s)': round(self.avg_total_token_per_seconds, default_ndigits),
+ 'Request throughput (req/s)': round(self.qps, default_ndigits),
  'Average latency (s)': round(self.avg_latency, default_ndigits),
  'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits),
  'Average time per output token (s)': round(self.avg_time_per_token, default_ndigits),
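To make the renamed throughput fields easier to follow, here is a minimal sketch of the aggregate formulas exactly as they appear in the hunk above (names shortened; the example numbers are illustrative, not taken from any real run):

    def throughput_summary(prompt_tokens: int, completion_tokens: int,
                           total_time: float, total_first_chunk_latency: float,
                           n_succeed: int) -> dict:
        # Mirrors BenchmarkMetrics: output/total throughput divide by total wall time,
        # input throughput divides by the summed time-to-first-token.
        return {
            'Output token throughput (tok/s)': completion_tokens / total_time,
            'Total token throughput (tok/s)': (prompt_tokens + completion_tokens) / total_time,
            'Request throughput (req/s)': n_succeed / total_time,
            'Input token throughput (tok/s)': prompt_tokens / total_first_chunk_latency,
        }

    print(throughput_summary(12000, 4000, total_time=20.0, total_first_chunk_latency=8.0, n_succeed=40))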
evalscope/perf/utils/db_util.py CHANGED
@@ -165,6 +165,7 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:
  CHUNK_TIMES_INDEX = 1
  LATENCY_INDEX = 4
  FIRST_CHUNK_LATENCY_INDEX = 5
+ CHUNK_TIME_INDEX = 7
  PROMPT_TOKENS_INDEX = 8
  COMPLETION_TOKENS_INDEX = 9

@@ -175,14 +176,19 @@ def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]:

  metrics = {
  'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows],
- 'TPOT (s)':
+ 'ITL (s)':
  inter_token_latencies_all,
+ 'TPOT (s)':
+ [(row[CHUNK_TIME_INDEX] / row[COMPLETION_TOKENS_INDEX]) if row[COMPLETION_TOKENS_INDEX] > 0 else float('nan')
+ for row in rows],
  'Latency (s)': [row[LATENCY_INDEX] for row in rows],
  'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows],
  'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows],
- 'Throughput(tokens/s)':
+ 'Output throughput(tok/s)':
  [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan')
- for row in rows]
+ for row in rows],
+ 'Total throughput(tok/s)': [((row[PROMPT_TOKENS_INDEX] + row[COMPLETION_TOKENS_INDEX])
+ / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') for row in rows]
  }

  # Calculate percentiles for each metric
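As a rough guide to the per-request columns added above, a small sketch of how one row's TPOT and throughput values are derived; treating the CHUNK_TIME column as the streaming (post-first-token) time is an assumption, and the numbers are illustrative:

    # One request's raw measurements (illustrative values)
    latency = 2.4              # LATENCY_INDEX column: end-to-end latency (s)
    chunk_time = 2.1           # CHUNK_TIME_INDEX column: assumed decode/streaming time (s)
    prompt_tokens = 512        # PROMPT_TOKENS_INDEX column
    completion_tokens = 128    # COMPLETION_TOKENS_INDEX column

    tpot = chunk_time / completion_tokens if completion_tokens > 0 else float('nan')
    output_throughput = completion_tokens / latency if latency > 0 else float('nan')
    total_throughput = (prompt_tokens + completion_tokens) / latency if latency > 0 else float('nan')
    print(f'TPOT={tpot:.4f} s, output={output_throughput:.2f} tok/s, total={total_throughput:.2f} tok/s')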
evalscope/perf/utils/log_utils.py ADDED
@@ -0,0 +1,41 @@
+ import os
+
+ from evalscope.perf.arguments import Arguments
+
+
+ def init_wandb(args: Arguments) -> None:
+ """
+ Initialize WandB for logging.
+ """
+ # Initialize wandb if the api key is provided
+ import datetime
+ try:
+ import wandb
+ except ImportError:
+ raise RuntimeError('Cannot import wandb. Please install it with command: \n pip install wandb')
+ os.environ['WANDB_SILENT'] = 'true'
+ os.environ['WANDB_DIR'] = args.outputs_dir
+
+ wandb.login(key=args.wandb_api_key)
+ current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+ name = args.name if args.name else f'{args.model_id}_{current_time}'
+ wandb.init(project='perf_benchmark', name=name, config=args.to_dict())
+
+
+ def init_swanlab(args: Arguments) -> None:
+ import datetime
+ try:
+ import swanlab
+ except ImportError:
+ raise RuntimeError('Cannot import swanlab. Please install it with command: \n pip install swanlab')
+ os.environ['SWANLAB_LOG_DIR'] = args.outputs_dir
+ if not args.swanlab_api_key == 'local':
+ swanlab.login(api_key=args.swanlab_api_key)
+ current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+ name = args.name if args.name else f'{args.model_id}_{current_time}'
+ swanlab.config.update({'framework': '📏evalscope'})
+ swanlab.init(
+ project='perf_benchmark',
+ name=name,
+ config=args.to_dict(),
+ mode='local' if args.swanlab_api_key == 'local' else None)
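A hedged usage sketch for these new helpers; it assumes `args` is an already-populated evalscope.perf.arguments.Arguments instance carrying the wandb_api_key / swanlab_api_key fields referenced in the code, and the wrapper function name is hypothetical:

    from evalscope.perf.utils.log_utils import init_swanlab, init_wandb

    def maybe_init_trackers(args) -> None:
        # Mirrors the "initialize wandb if the api key is provided" intent noted above.
        if getattr(args, 'wandb_api_key', None):
            init_wandb(args)    # logs the run to the 'perf_benchmark' wandb project
        if getattr(args, 'swanlab_api_key', None):
            init_swanlab(args)  # an api key of 'local' switches swanlab to local mode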
evalscope/report/__init__.py CHANGED
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ from evalscope.report.app_arguments import add_argument
  from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
  from evalscope.report.generator import ReportGenerator
  from evalscope.report.utils import Category, Report, ReportKey, Subset
evalscope/report/app.py CHANGED
@@ -11,7 +11,7 @@ from dataclasses import dataclass
  from typing import Any, List, Union

  from evalscope.constants import DataCollection
- from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
+ from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
  from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.version import __version__
@@ -22,6 +22,23 @@ PLOTLY_THEME = 'plotly_dark'
  REPORT_TOKEN = '@@'
  MODEL_TOKEN = '::'
  DATASET_TOKEN = ', '
+ LATEX_DELIMITERS = [{
+ 'left': '$$',
+ 'right': '$$',
+ 'display': True
+ }, {
+ 'left': '$',
+ 'right': '$',
+ 'display': False
+ }, {
+ 'left': '\\(',
+ 'right': '\\)',
+ 'display': False
+ }, {
+ 'left': '\\[',
+ 'right': '\\]',
+ 'display': True
+ }]


  def scan_for_report_folders(root_path):
@@ -44,7 +61,7 @@ def scan_for_report_folders(root_path):
  continue
  datasets = []
  for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
- datasets.append(os.path.basename(dataset_item).split('.')[0])
+ datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
  datasets = DATASET_TOKEN.join(datasets)
  reports.append(
  f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')
@@ -234,6 +251,18 @@ def convert_html_tags(text):
  return text


+ def convert_markdown_image(text):
+ if not os.path.isfile(text):
+ return text
+ # Convert the image path to a markdown image tag
+ if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
+ text = os.path.abspath(text)
+ image_tag = f'![image](gradio_api/file={text})'
+ logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
+ return image_tag
+ return text
+
+
  def process_string(string: str, max_length: int = 2048) -> str:
  string = convert_html_tags(string) # for display labels e.g. `<think>`
  if len(string) > max_length:
@@ -253,17 +282,17 @@ def process_model_prediction(item: Any):


  def normalize_score(score):
- if isinstance(score, bool):
- return 1.0 if score else 0.0
- elif isinstance(score, dict):
- for key in score:
- return float(score[key])
- return 0.0
- else:
- try:
- return float(score)
- except (ValueError, TypeError):
+ try:
+ if isinstance(score, bool):
+ return 1.0 if score else 0.0
+ elif isinstance(score, dict):
+ for key in score:
+ return float(score[key])
  return 0.0
+ else:
+ return float(score)
+ except (ValueError, TypeError):
+ return 0.0


  def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
@@ -285,7 +314,7 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
  'Input': raw_input,
  'Generated': raw_pred_answer,
  'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
- 'Pred': parsed_pred_answer if parsed_pred_answer != raw_pred_answer else '*Same as Generated*',
+ 'Pred': parsed_pred_answer,
  'Score': score,
  'NScore': normalize_score(score)
  }
@@ -295,22 +324,6 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
  return df_subset


- def get_table_data(data_review_df: pd.DataFrame, page: int = 1, rows_per_page: int = 1) -> pd.DataFrame:
- if data_review_df is None:
- return pd.DataFrame(), None
-
- logger.debug(f'page: {page}, rows_per_page: {rows_per_page}')
- start = (page - 1) * rows_per_page
- end = start + rows_per_page
- df_subset = data_review_df.iloc[start:end].copy()
- df_subset['Input'] = df_subset['Input'].map(process_model_prediction).astype(str)
- df_subset['Generated'] = df_subset['Generated'].map(process_model_prediction).astype(str)
- df_subset['Pred'] = df_subset['Pred'].map(process_model_prediction).astype(str)
- df_subset['Score'] = df_subset['Score'].map(process_model_prediction).astype(str)
- styler = style_df(df_subset, columns=['NScore'])
- return df_subset, styler
-
-
  @dataclass
  class SidebarComponents:
  root_path: gr.Textbox
@@ -457,7 +470,11 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  'page': {
  'zh': '页码',
  'en': 'Page'
- }
+ },
+ 'score_threshold': {
+ 'zh': '分数阈值',
+ 'en': 'Score Threshold'
+ },
  }

  # Update the UI components with localized labels
@@ -489,37 +506,53 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  gr.Markdown(f'### {locale_dict["model_prediction"][lang]}')
  subset_select = gr.Dropdown(
  label=locale_dict['select_subset'][lang], choices=[], show_label=True, interactive=True)
+
  with gr.Row():
  answer_mode_radio = gr.Radio(
  label=locale_dict['answer_mode'][lang], choices=['All', 'Pass', 'Fail'], value='All', interactive=True)
- page_number = gr.Number(
- value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
- answer_mode_counts = gr.Markdown('', label='Counts')
+ score_threshold = gr.Number(value=0.99, label=locale_dict['score_threshold'][lang], interactive=True)
+
  data_review_df = gr.State(None)
  filtered_review_df = gr.State(None)
- data_review_table = gr.DataFrame(
- value=None,
- datatype=['markdown', 'markdown', 'markdown', 'markdown', 'markdown', 'number'],
- # column_widths=['500px', '500px'],
- wrap=True,
- latex_delimiters=[{
- 'left': '$$',
- 'right': '$$',
- 'display': True
- }, {
- 'left': '$',
- 'right': '$',
- 'display': False
- }, {
- 'left': '\\(',
- 'right': '\\)',
- 'display': False
- }, {
- 'left': '\\[',
- 'right': '\\]',
- 'display': True
- }],
- max_height=600)
+
+ # show statistics
+ with gr.Row(variant='panel'):
+ with gr.Column():
+ gr.Markdown('### *Counts*')
+ answer_mode_counts = gr.Markdown('')
+ with gr.Column():
+ page_number = gr.Number(
+ value=1, label=locale_dict['page'][lang], minimum=1, maximum=1, step=1, interactive=True)
+
+ # show data review table
+ with gr.Row(variant='panel'):
+ with gr.Column():
+ gr.Markdown('### *Score*')
+ score_text = gr.Markdown(
+ '', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+ with gr.Column():
+ gr.Markdown('### *Normalized Score*')
+ nscore = gr.Markdown('', elem_id='score_text', latex_delimiters=LATEX_DELIMITERS)
+
+ with gr.Row(variant='panel'):
+ with gr.Column():
+ gr.Markdown('### *Gold*')
+ gold_text = gr.Markdown(
+ '', elem_id='gold_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+ with gr.Column():
+ gr.Markdown('### *Pred*')
+ pred_text = gr.Markdown(
+ '', elem_id='pred_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+
+ with gr.Row(variant='panel'):
+ with gr.Column():
+ gr.Markdown('### *Input*')
+ input_text = gr.Markdown(
+ '', elem_id='input_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)
+ with gr.Column():
+ gr.Markdown('### *Generated*')
+ generated_text = gr.Markdown(
+ '', elem_id='generated_text', latex_delimiters=LATEX_DELIMITERS, show_copy_button=True)

  @report_name.change(
  inputs=[sidebar.root_path, report_name],
@@ -561,15 +594,15 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  return data_review_df, 1

  @gr.on(
- triggers=[data_review_df.change, answer_mode_radio.change],
- inputs=[data_review_df, answer_mode_radio],
+ triggers=[data_review_df.change, answer_mode_radio.change, score_threshold.change],
+ inputs=[data_review_df, answer_mode_radio, score_threshold],
  outputs=[filtered_review_df, page_number, answer_mode_counts])
- def filter_data(data_review_df, answer_mode):
+ def filter_data(data_review_df, answer_mode, score_threshold):
  if data_review_df is None:
  return None, gr.update(value=1, maximum=1), ''

  all_count = len(data_review_df)
- pass_df = data_review_df[data_review_df['NScore'] >= 0.99]
+ pass_df = data_review_df[data_review_df['NScore'] >= score_threshold]
  pass_count = len(pass_df)
  fail_count = all_count - pass_count

@@ -578,7 +611,7 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  if answer_mode == 'Pass':
  filtered_df = pass_df
  elif answer_mode == 'Fail':
- filtered_df = data_review_df[data_review_df['NScore'] < 0.99]
+ filtered_df = data_review_df[data_review_df['NScore'] < score_threshold]
  else:
  filtered_df = data_review_df

@@ -588,13 +621,33 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):

  @gr.on(
  triggers=[filtered_review_df.change, page_number.change],
- inputs=[filtered_review_df, page_number],
- outputs=[data_review_table])
- def update_table(filtered_df, page_number):
- if filtered_df is None:
- return gr.update(value=None)
- subset_df, styler = get_table_data(filtered_df, page_number)
- return styler
+ inputs=[filtered_review_df, page_number, score_threshold],
+ outputs=[input_text, generated_text, gold_text, pred_text, score_text, nscore])
+ def update_table_components(filtered_df, page_number, score_threshold):
+ if filtered_df is None or len(filtered_df) == 0:
+ return '', '', '', '', '', ''
+
+ # Get single row data for the current page
+ start = (page_number - 1)
+ if start >= len(filtered_df):
+ return '', '', '', '', '', ''
+
+ row = filtered_df.iloc[start]
+
+ # Process the data for display
+ input_md = process_model_prediction(row['Input'])
+ generated_md = process_model_prediction(row['Generated'])
+ gold_md = process_model_prediction(row['Gold'])
+ pred_md = convert_markdown_image(process_model_prediction(row['Pred']))
+ score_md = process_model_prediction(row['Score'])
+ nscore_val = float(row['NScore']) if not pd.isna(row['NScore']) else 0.0
+
+ if nscore_val >= score_threshold:
+ nscore_val = f'<div style="background-color:rgb(45,104, 62); padding:10px;">{nscore_val}</div>'
+ else:
+ nscore_val = f'<div style="background-color:rgb(151, 31, 44); padding:10px;">{nscore_val}</div>'
+
+ return input_md, generated_md, gold_md, pred_md, score_md, nscore_val

  return SingleModelComponents(report_name=report_name)

@@ -696,16 +749,13 @@ def create_app(args: argparse.Namespace):
  text = '<' if new_visible else '>'
  return gr.update(visible=new_visible), new_visible, gr.update(value=text)

- demo.launch(share=args.share, server_name=args.server_name, server_port=args.server_port, debug=args.debug)
-
-
- def add_argument(parser: argparse.ArgumentParser):
- parser.add_argument('--share', action='store_true', help='Share the app.')
- parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
- parser.add_argument('--server-port', type=int, default=None, help='The server port.')
- parser.add_argument('--debug', action='store_true', help='Debug the app.')
- parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
- parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+ demo.launch(
+ share=args.share,
+ server_name=args.server_name,
+ server_port=args.server_port,
+ debug=args.debug,
+ allowed_paths=args.allowed_paths,
+ )


  if __name__ == '__main__':
evalscope/report/app_arguments.py ADDED
@@ -0,0 +1,11 @@
+ import argparse
+
+
+ def add_argument(parser: argparse.ArgumentParser):
+ parser.add_argument('--share', action='store_true', help='Share the app.')
+ parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+ parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+ parser.add_argument('--debug', action='store_true', help='Debug the app.')
+ parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+ parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+ parser.add_argument('--allowed-paths', nargs='+', default=['/'], help='The outputs dir.')
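A minimal sketch of how the extracted add_argument() helper fits together with create_app() from the app.py diff above; the exact CLI wiring inside evalscope is assumed:

    import argparse

    from evalscope.report import add_argument
    from evalscope.report.app import create_app

    parser = argparse.ArgumentParser()
    add_argument(parser)                        # registers --share, --server-name, --allowed-paths, ...
    args = parser.parse_args(['--lang', 'en'])  # illustrative arguments
    create_app(args)                            # launches the Gradio report viewer with these settings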
evalscope/report/generator.py CHANGED
@@ -48,7 +48,7 @@ class ReportGenerator:
  df = flatten_subset()

  metrics_list = []
- for metric_name, group_metric in df.groupby('metric_name'):
+ for metric_name, group_metric in df.groupby('metric_name', sort=False):
  categories = []
  for category_name, group_category in group_metric.groupby('categories'):
  subsets = []
evalscope/run.py CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):

  outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)

+ # Unify the output directory structure
  if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
  task_cfg.eval_config['time_str'] = run_time
  elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
  task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+ elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+ from evalscope.backend.rag_eval import Tools
+ if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+ task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+ elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+ task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
  return outputs


@@ -146,10 +153,10 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
  data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
  return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)

- # Initialize model adapter
- model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
- # Initialize data adapter
+ # Initialize data adapter first to update config
  data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+ # Initialize model adapter
+ model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)

  # update task_cfg.dataset_args
  task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
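For reference, a hedged sketch of the eval_config shape that the new RAG_EVAL branch of setup_work_directory() rewrites; the key names come from the hunk above, while the tool and path values are illustrative:

    eval_config = {
        'tool': 'MTEB',                        # compared case-insensitively against Tools.MTEB
        'eval': {
            'output_folder': 'outputs/mteb',   # overwritten with task_cfg.work_dir for MTEB
            # for CLIP_BENCHMARK the rewritten key is 'output_dir' instead
        },
    }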
evalscope/summarizer.py CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
  summary_res: dict = csv_to_list(summary_file_path)[0]
  elif summary_file_path.endswith('json'):
  summary_res: dict = json_to_dict(summary_file_path)
- file_name = os.path.basename(summary_file_path).split('.')[0]
+ base_name = os.path.basename(summary_file_path)
+ file_name = os.path.splitext(base_name)[0]
  final_res_list.append({file_name: summary_res})

  elif eval_backend == EvalBackend.THIRD_PARTY:
evalscope/third_party/thinkbench/eval.py CHANGED
@@ -357,7 +357,7 @@ judge_config = dict(
  )

  distill_qwen_config = dict(
- report_path = './outputs/20250218_180219',
+ report_path = '../eval-scope/outputs/20250218_180219',
  model_name = 'DeepSeek-R1-Distill-Qwen-7B',
  tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
  dataset_name = 'math_500',
@@ -367,7 +367,7 @@ distill_qwen_config = dict(
  )

  math_qwen_config = dict(
- report_path = './outputs/20250219_202358',
+ report_path = '../eval-scope/outputs/20250219_202358',
  model_name = 'Qwen2.5-Math-7B-Instruct',
  tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
  dataset_name = 'math_500',
@@ -377,7 +377,7 @@ math_qwen_config = dict(
  )

  r1_config = dict(
- report_path = './outputs/20250307_000404',
+ report_path = '../eval-scope/outputs/20250307_000404',
  model_name = 'deepseek-r1',
  tokenizer_path = 'deepseek-ai/DeepSeek-R1',
  dataset_name = 'math_500',
@@ -387,7 +387,7 @@ r1_config = dict(
  )

  qwq_preview_config = dict(
- report_path = './outputs/20250221_105911',
+ report_path = '../eval-scope/outputs/20250221_105911',
  model_name = 'qwq-32b-preview',
  tokenizer_path = 'Qwen/QwQ-32B-Preview',
  dataset_name = 'math_500',
@@ -397,7 +397,7 @@ qwq_preview_config = dict(
  )

  qwq_config = dict(
- report_path = './outputs/20250306_181550',
+ report_path = '../eval-scope/outputs/20250306_181550',
  model_name = 'QwQ-32B',
  tokenizer_path = 'Qwen/QwQ-32B',
  dataset_name = 'math_500',
@@ -407,7 +407,7 @@ qwq_config = dict(
  )

  distill_qwen_32b = dict(
- report_path = './outputs/20250306_235951',
+ report_path = '../eval-scope/outputs/20250306_235951',
  model_name = 'deepseek-r1-distill-qwen-32b',
  tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
  dataset_name = 'math_500',
@@ -416,14 +416,26 @@ distill_qwen_32b = dict(
  judge_config=judge_config
  )

+ qwen3_32b_think = dict(
+ report_path = '../eval-scope/outputs/20250428_151817',
+ model_name = 'Qwen3-32B',
+ tokenizer_path = 'Qwen/Qwen3-32B',
+ dataset_name = 'math_500',
+ subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+ split_strategies='separator',
+ judge_config=judge_config
+ )
+
  if __name__ == '__main__':
  # run_task(distill_qwen_config, count=80)
  # run_task(math_qwen_config)
  # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
  # run_task(r1_config, max_tokens=20000, count=200, workers=128)
  # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+ run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
  # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)

  # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
  # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
- combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+ # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+ combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')
evalscope/utils/chat_service.py CHANGED
@@ -64,10 +64,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):

  class ChatCompletionResponse(BaseModel):
  model: str
- object: Literal['chat.completion', 'chat.completion.chunk']
+ object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
  choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
  created: Optional[int] = Field(default_factory=lambda: int(time.time()))
- usage: Optional[Usage]
+ usage: Optional[Usage] = None


  class TextCompletionRequest(BaseModel):
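A small hedged sketch of what the relaxed schema now accepts (assumes evalscope 0.15.0 is installed; the model name and choice payload are illustrative):

    from evalscope.utils.chat_service import ChatCompletionResponse

    resp = ChatCompletionResponse(
        model='my-t2i-model',                 # hypothetical model name
        object='images.generations',          # newly allowed literal value
        choices=[{'index': 0, 'url': 'file:///tmp/image.png'}],  # free-form, matches the Any branch
        # usage omitted: it now defaults to None instead of being a required field
    )
    print(resp.object, resp.usage)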