evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of evalscope might be problematic; see the registry's release advisory for more details.

Files changed (181)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
  24. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  25. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  26. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  27. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  29. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  30. evalscope/benchmarks/utils.py +7 -16
  31. evalscope/cli/start_app.py +1 -1
  32. evalscope/collections/evaluator.py +16 -4
  33. evalscope/config.py +7 -3
  34. evalscope/constants.py +11 -0
  35. evalscope/evaluator/evaluator.py +9 -3
  36. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  37. evalscope/metrics/__init__.py +49 -4
  38. evalscope/metrics/llm_judge.py +1 -1
  39. evalscope/metrics/named_metrics.py +13 -0
  40. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  41. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  42. evalscope/metrics/t2v_metrics/constants.py +12 -0
  43. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  44. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  53. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  60. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  61. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  62. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  139. evalscope/metrics/t2v_metrics/score.py +78 -0
  140. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  141. evalscope/models/__init__.py +50 -14
  142. evalscope/models/adapters/__init__.py +17 -0
  143. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  144. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  145. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  146. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  147. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  148. evalscope/models/adapters/t2i_adapter.py +76 -0
  149. evalscope/models/custom/__init__.py +2 -1
  150. evalscope/models/custom/dummy_model.py +11 -13
  151. evalscope/models/local_model.py +82 -33
  152. evalscope/models/model.py +2 -42
  153. evalscope/models/register.py +26 -0
  154. evalscope/perf/benchmark.py +4 -3
  155. evalscope/perf/main.py +4 -2
  156. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  157. evalscope/perf/utils/benchmark_util.py +2 -2
  158. evalscope/perf/utils/db_util.py +16 -8
  159. evalscope/report/__init__.py +1 -0
  160. evalscope/report/app.py +117 -67
  161. evalscope/report/app_arguments.py +11 -0
  162. evalscope/report/generator.py +1 -1
  163. evalscope/run.py +3 -3
  164. evalscope/third_party/thinkbench/eval.py +19 -7
  165. evalscope/utils/chat_service.py +2 -2
  166. evalscope/utils/import_utils.py +66 -0
  167. evalscope/utils/utils.py +12 -4
  168. evalscope/version.py +2 -2
  169. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
  170. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
  171. tests/aigc/__init__.py +1 -0
  172. tests/aigc/test_t2i.py +87 -0
  173. tests/cli/test_run.py +20 -7
  174. tests/perf/test_perf.py +6 -3
  175. evalscope/metrics/code_metric.py +0 -98
  176. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  177. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
  179. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
  180. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
  181. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/report/app_arguments.py ADDED
@@ -0,0 +1,11 @@
+ import argparse
+
+
+ def add_argument(parser: argparse.ArgumentParser):
+     parser.add_argument('--share', action='store_true', help='Share the app.')
+     parser.add_argument('--server-name', type=str, default='0.0.0.0', help='The server name.')
+     parser.add_argument('--server-port', type=int, default=None, help='The server port.')
+     parser.add_argument('--debug', action='store_true', help='Debug the app.')
+     parser.add_argument('--lang', type=str, default='zh', help='The locale.', choices=['zh', 'en'])
+     parser.add_argument('--outputs', type=str, default='./outputs', help='The outputs dir.')
+     parser.add_argument('--allowed-paths', nargs='+', default=['/'], help='The outputs dir.')
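The new app_arguments.py module exposes a single add_argument(parser) helper that registers the report app's CLI flags. A minimal usage sketch, assuming the import path shown in the file list above; the wiring below is illustrative rather than the package's actual entry point:

    import argparse

    from evalscope.report.app_arguments import add_argument

    parser = argparse.ArgumentParser(description='EvalScope report app')
    add_argument(parser)

    # Hyphenated flags become underscored attributes on the parsed namespace.
    args = parser.parse_args(['--lang', 'en', '--server-port', '8080'])
    print(args.lang, args.server_port, args.share)  # -> en 8080 False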
evalscope/report/generator.py CHANGED
@@ -48,7 +48,7 @@ class ReportGenerator:
  df = flatten_subset()
 
  metrics_list = []
- for metric_name, group_metric in df.groupby('metric_name'):
+ for metric_name, group_metric in df.groupby('metric_name', sort=False):
      categories = []
      for category_name, group_category in group_metric.groupby('categories'):
          subsets = []
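The only functional change here is sort=False, which keeps metrics in their order of first appearance instead of sorting the group keys alphabetically. A small self-contained illustration of the pandas behaviour (toy data, not evalscope's):

    import pandas as pd

    df = pd.DataFrame({'metric_name': ['Pass@1', 'AverageBLEU', 'Pass@1'],
                       'score': [0.5, 0.3, 0.7]})

    # The default groupby sorts the keys alphabetically.
    print([name for name, _ in df.groupby('metric_name')])              # ['AverageBLEU', 'Pass@1']
    # sort=False preserves first-appearance order, which the report generator now relies on.
    print([name for name, _ in df.groupby('metric_name', sort=False)])  # ['Pass@1', 'AverageBLEU']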
evalscope/run.py CHANGED
@@ -153,10 +153,10 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt
      data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
      return EvaluatorCollection(task_cfg, data_adapter, outputs, base_model)
 
- # Initialize model adapter
- model_adapter = initialize_model_adapter(task_cfg, benchmark, base_model)
- # Initialize data adapter
+ # Initialize data adapter first to update config
  data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+ # Initialize model adapter
+ model_adapter = initialize_model_adapter(task_cfg, data_adapter, base_model)
 
  # update task_cfg.dataset_args
  task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
evalscope/third_party/thinkbench/eval.py CHANGED
@@ -357,7 +357,7 @@ judge_config = dict(
  )
 
  distill_qwen_config = dict(
-     report_path = './outputs/20250218_180219',
+     report_path = '../eval-scope/outputs/20250218_180219',
      model_name = 'DeepSeek-R1-Distill-Qwen-7B',
      tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
      dataset_name = 'math_500',
@@ -367,7 +367,7 @@ distill_qwen_config = dict(
  )
 
  math_qwen_config = dict(
-     report_path = './outputs/20250219_202358',
+     report_path = '../eval-scope/outputs/20250219_202358',
      model_name = 'Qwen2.5-Math-7B-Instruct',
      tokenizer_path = 'Qwen/Qwen2.5-Math-7B-Instruct',
      dataset_name = 'math_500',
@@ -377,7 +377,7 @@ math_qwen_config = dict(
  )
 
  r1_config = dict(
-     report_path = './outputs/20250307_000404',
+     report_path = '../eval-scope/outputs/20250307_000404',
      model_name = 'deepseek-r1',
      tokenizer_path = 'deepseek-ai/DeepSeek-R1',
      dataset_name = 'math_500',
@@ -387,7 +387,7 @@ r1_config = dict(
  )
 
  qwq_preview_config = dict(
-     report_path = './outputs/20250221_105911',
+     report_path = '../eval-scope/outputs/20250221_105911',
      model_name = 'qwq-32b-preview',
      tokenizer_path = 'Qwen/QwQ-32B-Preview',
      dataset_name = 'math_500',
@@ -397,7 +397,7 @@ qwq_preview_config = dict(
  )
 
  qwq_config = dict(
-     report_path = './outputs/20250306_181550',
+     report_path = '../eval-scope/outputs/20250306_181550',
      model_name = 'QwQ-32B',
      tokenizer_path = 'Qwen/QwQ-32B',
      dataset_name = 'math_500',
@@ -407,7 +407,7 @@ qwq_config = dict(
  )
 
  distill_qwen_32b = dict(
-     report_path = './outputs/20250306_235951',
+     report_path = '../eval-scope/outputs/20250306_235951',
      model_name = 'deepseek-r1-distill-qwen-32b',
      tokenizer_path = 'deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
      dataset_name = 'math_500',
@@ -416,14 +416,26 @@ distill_qwen_32b = dict(
      judge_config=judge_config
  )
 
+ qwen3_32b_think = dict(
+     report_path = '../eval-scope/outputs/20250428_151817',
+     model_name = 'Qwen3-32B',
+     tokenizer_path = 'Qwen/Qwen3-32B',
+     dataset_name = 'math_500',
+     subsets = ['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
+     split_strategies='separator',
+     judge_config=judge_config
+ )
+
  if __name__ == '__main__':
      # run_task(distill_qwen_config, count=80)
      # run_task(math_qwen_config)
      # run_task(qwq_preview_config, max_tokens=20000, count=200, workers=128)
      # run_task(r1_config, max_tokens=20000, count=200, workers=128)
      # run_task(qwq_config, max_tokens=20000, count=200, workers=128)
+     run_task(qwen3_32b_think, max_tokens=20000, count=200, workers=128)
      # run_task(distill_qwen_32b, max_tokens=20000, count=200, workers=128)
 
      # combine_results([qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics.png')
      # combine_results([qwq_config, r1_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_3models.png')
-     combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+     # combine_results([distill_qwen_config, math_qwen_config, qwq_config, r1_config, qwq_preview_config, distill_qwen_32b], output_path='outputs/model_comparison_metrics_6models.png')
+     combine_results([qwq_config, r1_config, distill_qwen_32b, qwen3_32b_think], output_path='outputs/model_comparison_metrics_4models.png')
evalscope/utils/chat_service.py CHANGED
@@ -64,10 +64,10 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 
  class ChatCompletionResponse(BaseModel):
      model: str
-     object: Literal['chat.completion', 'chat.completion.chunk']
+     object: Literal['chat.completion', 'chat.completion.chunk', 'images.generations']
      choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
      created: Optional[int] = Field(default_factory=lambda: int(time.time()))
-     usage: Optional[Usage]
+     usage: Optional[Usage] = None
 
 
  class TextCompletionRequest(BaseModel):
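Two small but meaningful changes: the object field now also accepts 'images.generations', so the same response model can carry text-to-image results, and usage gains an explicit = None default so the field is genuinely optional at construction time (under Pydantic v2 an Optional[...] annotation without a default is still a required field). A standalone sketch of that pattern, using placeholder models rather than the module's own classes:

    from typing import Optional

    from pydantic import BaseModel

    class Usage(BaseModel):
        prompt_tokens: int = 0
        completion_tokens: int = 0

    class DemoResponse(BaseModel):
        model: str
        usage: Optional[Usage] = None  # without '= None', Pydantic v2 would treat this as required

    resp = DemoResponse(model='demo')  # validates even though no usage is supplied
    print(resp.usage)                  # None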
evalscope/utils/import_utils.py ADDED
@@ -0,0 +1,66 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2023-present the HuggingFace Inc. team.
+
+ import importlib
+ import os
+ from itertools import chain
+ from types import ModuleType
+ from typing import Any
+
+ from .logger import get_logger
+
+ logger = get_logger() # pylint: disable=invalid-name
+
+
+ class _LazyModule(ModuleType):
+     """
+     Module class that surfaces all objects but only performs associated imports when the objects are requested.
+     """
+
+     # Very heavily inspired by optuna.integration._IntegrationModule
+     # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+     def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+         super().__init__(name)
+         self._modules = set(import_structure.keys())
+         self._class_to_module = {}
+         for key, values in import_structure.items():
+             for value in values:
+                 self._class_to_module[value] = key
+         # Needed for autocompletion in an IDE
+         self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+         self.__file__ = module_file
+         self.__spec__ = module_spec
+         self.__path__ = [os.path.dirname(module_file)]
+         self._objects = {} if extra_objects is None else extra_objects
+         self._name = name
+         self._import_structure = import_structure
+
+     # Needed for autocompletion in an IDE
+     def __dir__(self):
+         result = super().__dir__()
+         # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+         # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+         for attr in self.__all__:
+             if attr not in result:
+                 result.append(attr)
+         return result
+
+     def __getattr__(self, name: str) -> Any:
+         if name in self._objects:
+             return self._objects[name]
+         if name in self._modules:
+             value = self._get_module(name)
+         elif name in self._class_to_module.keys():
+             module = self._get_module(self._class_to_module[name])
+             value = getattr(module, name)
+         else:
+             raise AttributeError(f'module {self.__name__} has no attribute {name}')
+
+         setattr(self, name, value)
+         return value
+
+     def _get_module(self, module_name: str):
+         return importlib.import_module('.' + module_name, self.__name__)
+
+     def __reduce__(self):
+         return self.__class__, (self._name, self.__file__, self._import_structure)
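The new _LazyModule follows the HuggingFace-style lazy-import pattern: a package __init__.py replaces its own entry in sys.modules with the proxy, so submodules are only imported when one of their attributes is first accessed. A hypothetical sketch of such an __init__.py; the entries in _import_structure are placeholders, not evalscope's actual registry:

    import sys

    from evalscope.utils.import_utils import _LazyModule

    # Map each submodule name to the public names it provides (illustrative entries only).
    _import_structure = {
        'chat_adapter': ['SomeChatAdapter'],
        'server_adapter': ['SomeServerAdapter'],
    }

    # Replace this package module with the lazy proxy; attribute access triggers the real import.
    sys.modules[__name__] = _LazyModule(__name__, __file__, _import_structure, module_spec=__spec__)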
evalscope/utils/utils.py CHANGED
@@ -76,16 +76,16 @@ def dict_torch_dtype_to_str(d: Dict[str, Any]) -> dict:
  class ResponseParser:
 
      @staticmethod
-     def parse_first_capital(text: str) -> str:
+     def parse_first_capital(text: str, options: list[str]) -> str:
          for t in text:
-             if t.isupper():
+             if t.isupper() and (t in options):
                  return t
          return ''
 
      @staticmethod
-     def parse_last_capital(text: str) -> str:
+     def parse_last_capital(text: str, options: list[str]) -> str:
          for t in text[::-1]:
-             if t.isupper():
+             if t.isupper() and (t in options):
                  return t
          return ''
 
@@ -155,6 +155,10 @@ class ResponseParser:
      for i in options:
          if i in outputs:
              return i
+     # If no match found, try to find the last capital letter in the text
+     last_capital = ResponseParser.parse_last_capital(text, options)
+     if last_capital:
+         return last_capital
      return 'No valid option found'
 
      @staticmethod
@@ -183,6 +187,10 @@ class ResponseParser:
      matches = regex.search(text)
      if matches:
          return matches.group(1)
+     # If no match found, try to find the last capital letter in the text
+     last_capital = ResponseParser.parse_last_capital(text, options)
+     if last_capital:
+         return last_capital
      return 'No valid option found'
 
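Both capital-letter helpers now take the candidate options, and the option extractors fall back to the last option-matching capital letter when neither an exact option string nor the expected answer pattern is found. A quick usage sketch of the new behaviour, assuming the methods remain static methods importable from evalscope.utils.utils:

    from evalscope.utils.utils import ResponseParser

    options = ['A', 'B', 'C', 'D']

    # Scans the text from the end and returns the first capital letter that is a valid option,
    # so stray capitals such as the leading "I" are ignored.
    print(ResponseParser.parse_last_capital('I would go with (B) here.', options))  # -> 'B'
    print(ResponseParser.parse_last_capital('no letter at all', options))           # -> ''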
 
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.14.0'
- __release_datetime__ = '2025-04-10 20:00:00'
+ __version__ = '0.15.1'
+ __release_datetime__ = '2025-04-30 12:00:00'
{evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.14.0
+ Version: 0.15.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -28,8 +28,9 @@ Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
  Requires-Dist: openai
  Requires-Dist: pandas
+ Requires-Dist: pillow
  Requires-Dist: pyarrow
- Requires-Dist: pyyaml
+ Requires-Dist: pyyaml>=5.1
  Requires-Dist: requests
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
@@ -39,9 +40,16 @@ Requires-Dist: seaborn
  Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: torch
+ Requires-Dist: torchvision
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: word2number
+ Provides-Extra: aigc
+ Requires-Dist: diffusers; extra == "aigc"
+ Requires-Dist: iopath; extra == "aigc"
+ Requires-Dist: omegaconf; extra == "aigc"
+ Requires-Dist: open-clip-torch; extra == "aigc"
+ Requires-Dist: opencv-python; extra == "aigc"
  Provides-Extra: all
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
@@ -55,8 +63,9 @@ Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
+ Requires-Dist: pillow; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pyyaml; extra == "all"
+ Requires-Dist: pyyaml>=5.1; extra == "all"
  Requires-Dist: requests; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
@@ -66,6 +75,7 @@ Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: torch; extra == "all"
+ Requires-Dist: torchvision; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: word2number; extra == "all"
@@ -86,6 +96,11 @@ Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
+ Requires-Dist: diffusers; extra == "all"
+ Requires-Dist: iopath; extra == "all"
+ Requires-Dist: omegaconf; extra == "all"
+ Requires-Dist: open-clip-torch; extra == "all"
+ Requires-Dist: opencv-python; extra == "all"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -199,6 +214,8 @@ Please scan the QR code below to join our community groups:
 
  ## 🎉 News
 
+ - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
+ - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)