evalscope 0.13.2.tar.gz → 0.15.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (483)
  1. {evalscope-0.13.2/evalscope.egg-info → evalscope-0.15.0}/PKG-INFO +37 -15
  2. {evalscope-0.13.2 → evalscope-0.15.0}/README.md +11 -4
  3. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/arguments.py +2 -1
  4. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/__init__.py +1 -1
  5. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/backend_manager.py +21 -5
  6. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  7. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  8. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  9. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  10. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/utils/embedding.py +49 -3
  11. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/utils/llm.py +4 -4
  12. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  13. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/__init__.py +2 -2
  14. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/base.py +56 -0
  15. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  16. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  17. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  18. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  19. evalscope-0.15.0/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  20. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  21. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  22. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  23. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/arc/arc_adapter.py +2 -2
  24. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  25. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  26. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  27. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  28. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  29. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/data_adapter.py +21 -10
  30. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  31. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  32. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  33. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  35. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +5 -4
  36. evalscope-0.15.0/evalscope/benchmarks/live_code_bench/testing_util.py +540 -0
  37. evalscope-0.15.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  38. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  39. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -8
  40. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  44. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/utils.py +7 -16
  45. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/start_app.py +1 -1
  46. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/collections/evaluator.py +20 -6
  47. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/config.py +8 -4
  48. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/constants.py +11 -0
  49. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/evaluator/evaluator.py +2 -2
  50. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  51. evalscope-0.15.0/evalscope/metrics/__init__.py +50 -0
  52. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/llm_judge.py +1 -1
  53. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/named_metrics.py +13 -0
  54. evalscope-0.15.0/evalscope/metrics/t2v_metrics/__init__.py +66 -0
  55. evalscope-0.15.0/evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  56. evalscope-0.15.0/evalscope/metrics/t2v_metrics/constants.py +12 -0
  57. evalscope-0.15.0/evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  58. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  59. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  60. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  61. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  62. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  63. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  64. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  65. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  66. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  67. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  68. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  69. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  70. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  71. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  72. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  73. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  74. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/model.py +45 -0
  75. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  76. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  77. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  78. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  79. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  80. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  81. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  82. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  83. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  84. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  85. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  86. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  87. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  88. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  89. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  90. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  91. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  92. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  93. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  94. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  95. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  96. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  97. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  98. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  99. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  100. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  101. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  102. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  103. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  104. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  105. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  106. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  107. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  108. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  109. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  110. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  111. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  112. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  113. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  114. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  115. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  116. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  117. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  118. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  119. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  120. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  121. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  122. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  123. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  124. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  125. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  126. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  127. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  128. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  129. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  130. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  131. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  132. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  133. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  134. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  135. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  136. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  137. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  138. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  139. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  140. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  141. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  142. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  143. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  144. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  145. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  146. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  147. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  148. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  149. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  150. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  151. evalscope-0.15.0/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  152. evalscope-0.15.0/evalscope/metrics/t2v_metrics/score.py +78 -0
  153. evalscope-0.15.0/evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  154. evalscope-0.15.0/evalscope/models/__init__.py +53 -0
  155. evalscope-0.15.0/evalscope/models/adapters/__init__.py +17 -0
  156. {evalscope-0.13.2/evalscope/models → evalscope-0.15.0/evalscope/models/adapters}/base_adapter.py +17 -17
  157. {evalscope-0.13.2/evalscope/models → evalscope-0.15.0/evalscope/models/adapters}/chat_adapter.py +10 -7
  158. {evalscope-0.13.2/evalscope/models → evalscope-0.15.0/evalscope/models/adapters}/choice_adapter.py +2 -6
  159. {evalscope-0.13.2/evalscope/models → evalscope-0.15.0/evalscope/models/adapters}/custom_adapter.py +2 -4
  160. {evalscope-0.13.2/evalscope/models → evalscope-0.15.0/evalscope/models/adapters}/server_adapter.py +1 -3
  161. evalscope-0.15.0/evalscope/models/adapters/t2i_adapter.py +76 -0
  162. evalscope-0.15.0/evalscope/models/custom/__init__.py +4 -0
  163. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/models/custom/dummy_model.py +11 -13
  164. evalscope-0.15.0/evalscope/models/local_model.py +128 -0
  165. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/models/model.py +2 -42
  166. evalscope-0.15.0/evalscope/models/register.py +54 -0
  167. evalscope-0.15.0/evalscope/perf/__init__.py +0 -0
  168. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/arguments.py +24 -5
  169. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/benchmark.py +28 -42
  170. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/http_client.py +2 -3
  171. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/api/custom_api.py +1 -1
  172. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/api/openai_api.py +2 -2
  173. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/custom.py +4 -1
  174. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  175. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  176. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  177. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/openqa.py +4 -1
  178. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  179. evalscope-0.15.0/evalscope/perf/utils/__init__.py +0 -0
  180. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/utils/benchmark_util.py +14 -8
  181. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/utils/db_util.py +9 -3
  182. evalscope-0.15.0/evalscope/perf/utils/log_utils.py +41 -0
  183. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/report/__init__.py +1 -0
  184. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/report/app.py +128 -78
  185. evalscope-0.15.0/evalscope/report/app_arguments.py +11 -0
  186. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/report/generator.py +1 -1
  187. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/run.py +10 -3
  188. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/summarizer.py +2 -1
  189. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/eval.py +19 -7
  190. evalscope-0.15.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  191. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/chat_service.py +2 -2
  192. evalscope-0.15.0/evalscope/utils/import_utils.py +66 -0
  193. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/utils.py +48 -29
  194. evalscope-0.15.0/evalscope/version.py +4 -0
  195. {evalscope-0.13.2 → evalscope-0.15.0/evalscope.egg-info}/PKG-INFO +37 -15
  196. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/SOURCES.txt +124 -10
  197. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/requires.txt +26 -10
  198. evalscope-0.15.0/requirements/aigc.txt +4 -0
  199. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/framework.txt +3 -0
  200. evalscope-0.15.0/requirements/rag.txt +7 -0
  201. {evalscope-0.13.2 → evalscope-0.15.0}/setup.py +2 -0
  202. evalscope-0.15.0/tests/aigc/test_t2i.py +87 -0
  203. {evalscope-0.13.2 → evalscope-0.15.0}/tests/cli/test_all.py +4 -4
  204. {evalscope-0.13.2 → evalscope-0.15.0}/tests/cli/test_collection.py +2 -1
  205. {evalscope-0.13.2 → evalscope-0.15.0}/tests/cli/test_run.py +19 -12
  206. {evalscope-0.13.2 → evalscope-0.15.0}/tests/perf/test_perf.py +3 -3
  207. evalscope-0.15.0/tests/rag/__init__.py +0 -0
  208. {evalscope-0.13.2 → evalscope-0.15.0}/tests/rag/test_clip_benchmark.py +0 -1
  209. {evalscope-0.13.2 → evalscope-0.15.0}/tests/rag/test_mteb.py +37 -8
  210. {evalscope-0.13.2 → evalscope-0.15.0}/tests/rag/test_ragas.py +29 -26
  211. evalscope-0.15.0/tests/vlm/__init__.py +1 -0
  212. {evalscope-0.13.2 → evalscope-0.15.0}/tests/vlm/test_vlmeval.py +37 -1
  213. evalscope-0.13.2/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  214. evalscope-0.13.2/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  215. evalscope-0.13.2/evalscope/benchmarks/live_code_bench/testing_util.py +0 -721
  216. evalscope-0.13.2/evalscope/metrics/__init__.py +0 -5
  217. evalscope-0.13.2/evalscope/metrics/code_metric.py +0 -98
  218. evalscope-0.13.2/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  219. evalscope-0.13.2/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  220. evalscope-0.13.2/evalscope/models/__init__.py +0 -17
  221. evalscope-0.13.2/evalscope/models/custom/__init__.py +0 -3
  222. evalscope-0.13.2/evalscope/models/local_model.py +0 -79
  223. evalscope-0.13.2/evalscope/models/register.py +0 -28
  224. evalscope-0.13.2/evalscope/version.py +0 -4
  225. evalscope-0.13.2/requirements/rag.txt +0 -7
  226. {evalscope-0.13.2 → evalscope-0.15.0}/LICENSE +0 -0
  227. {evalscope-0.13.2 → evalscope-0.15.0}/MANIFEST.in +0 -0
  228. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/__init__.py +0 -0
  229. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/__init__.py +0 -0
  230. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/base.py +0 -0
  231. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/__init__.py +0 -0
  232. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  233. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  234. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  235. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  236. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  237. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  238. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  239. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  240. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  241. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  242. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  243. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  244. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  245. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  246. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  247. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  248. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  249. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  250. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  251. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  252. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  253. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  254. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  255. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  256. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  257. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  258. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  259. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  260. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  261. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  262. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  263. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  264. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  265. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  266. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  267. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  268. {evalscope-0.13.2/evalscope/benchmarks/aime → evalscope-0.15.0/evalscope/benchmarks/aigc}/__init__.py +0 -0
  269. {evalscope-0.13.2/evalscope/benchmarks/alpaca_eval → evalscope-0.15.0/evalscope/benchmarks/aigc/t2i}/__init__.py +0 -0
  270. {evalscope-0.13.2/evalscope/benchmarks/arena_hard → evalscope-0.15.0/evalscope/benchmarks/aime}/__init__.py +0 -0
  271. {evalscope-0.13.2/evalscope/benchmarks/chinese_simple_qa → evalscope-0.15.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
  272. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  273. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  274. {evalscope-0.13.2/evalscope/benchmarks/data_collection → evalscope-0.15.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
  275. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/arena_hard/utils.py +0 -0
  276. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  277. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  278. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  279. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  280. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  281. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  282. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  283. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  284. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  285. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  286. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  287. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  288. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  289. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  290. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  291. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  292. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  293. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  294. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  295. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  296. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  297. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  298. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  299. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  300. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  301. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  302. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  303. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  304. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  305. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/benchmark.py +0 -0
  306. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  307. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  308. {evalscope-0.13.2/evalscope/benchmarks/general_mcq → evalscope-0.15.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  309. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  310. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  311. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  312. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  313. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  314. {evalscope-0.13.2/evalscope/benchmarks/gpqa → evalscope-0.15.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  315. {evalscope-0.13.2/evalscope/benchmarks/ifeval → evalscope-0.15.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  316. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  317. {evalscope-0.13.2/evalscope/benchmarks/iquiz → evalscope-0.15.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  318. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  319. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  320. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  321. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  322. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  323. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  324. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  325. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  326. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  327. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  328. {evalscope-0.13.2/evalscope/benchmarks/live_code_bench → evalscope-0.15.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  329. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  330. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  331. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  332. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  333. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  334. {evalscope-0.13.2/evalscope/benchmarks/math_500 → evalscope-0.15.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  335. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  336. {evalscope-0.13.2/evalscope/benchmarks/mmlu_pro → evalscope-0.15.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  337. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  338. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  339. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  340. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  341. {evalscope-0.13.2/evalscope/benchmarks/mmlu_redux → evalscope-0.15.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  342. {evalscope-0.13.2/evalscope/benchmarks/musr → evalscope-0.15.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  343. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  344. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  345. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  346. {evalscope-0.13.2/evalscope/benchmarks/process_bench → evalscope-0.15.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  347. {evalscope-0.13.2/evalscope/benchmarks/simple_qa → evalscope-0.15.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  348. {evalscope-0.13.2/evalscope/benchmarks/super_gpqa → evalscope-0.15.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  349. {evalscope-0.13.2/evalscope/perf → evalscope-0.15.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  350. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  351. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  352. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/race/__init__.py +0 -0
  353. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/race/race.py +0 -0
  354. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  355. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  356. {evalscope-0.13.2/evalscope/perf/utils → evalscope-0.15.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  357. {evalscope-0.13.2/evalscope/third_party/thinkbench/tools → evalscope-0.15.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  358. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  359. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  360. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  361. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  362. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  363. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  364. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  365. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  366. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  367. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  368. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  369. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/__init__.py +0 -0
  370. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/base.py +0 -0
  371. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/cli.py +0 -0
  372. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/start_eval.py +0 -0
  373. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/start_perf.py +0 -0
  374. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/cli/start_server.py +0 -0
  375. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/collections/__init__.py +0 -0
  376. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/collections/sampler.py +0 -0
  377. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/collections/schema.py +0 -0
  378. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/evaluator/__init__.py +0 -0
  379. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/evaluator/rating_eval.py +0 -0
  380. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  381. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  382. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  383. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/math_parser.py +0 -0
  384. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/metrics.py +0 -0
  385. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/metrics/rouge_metric.py +0 -0
  386. {evalscope-0.13.2/tests/rag → evalscope-0.15.0/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  387. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/models/custom/custom_model.py +0 -0
  388. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/main.py +0 -0
  389. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/__init__.py +0 -0
  390. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  391. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/api/base.py +0 -0
  392. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  393. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  394. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  395. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  396. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/plugin/registry.py +0 -0
  397. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/utils/analysis_result.py +0 -0
  398. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/utils/handler.py +0 -0
  399. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/perf/utils/local_server.py +0 -0
  400. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/__init__.py +0 -0
  401. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  402. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  403. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  404. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  405. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  406. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  407. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  408. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  409. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/data/question.jsonl +0 -0
  410. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/arc.yaml +0 -0
  411. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  412. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  413. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  414. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  415. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  416. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  417. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  418. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  419. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  420. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  421. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/report/combinator.py +0 -0
  422. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/report/utils.py +0 -0
  423. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/run_arena.py +0 -0
  424. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/__init__.py +0 -0
  425. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/README.md +0 -0
  426. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  427. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  428. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  429. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  430. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  431. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  432. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  433. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  434. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  435. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  436. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  437. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  438. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  439. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  440. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  441. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  442. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/infer.py +0 -0
  443. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  444. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  445. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  446. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  447. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  448. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  449. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  450. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  451. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  452. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  453. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  454. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  455. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  456. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  457. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/__init__.py +0 -0
  458. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/arena_utils.py +0 -0
  459. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/completion_parsers.py +0 -0
  460. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/filters.py +0 -0
  461. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/io_utils.py +0 -0
  462. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/logger.py +0 -0
  463. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope/utils/model_utils.py +0 -0
  464. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/dependency_links.txt +0 -0
  465. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/entry_points.txt +0 -0
  466. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/not-zip-safe +0 -0
  467. {evalscope-0.13.2 → evalscope-0.15.0}/evalscope.egg-info/top_level.txt +0 -0
  468. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/app.txt +0 -0
  469. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/docs.txt +0 -0
  470. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/opencompass.txt +0 -0
  471. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/perf.txt +0 -0
  472. {evalscope-0.13.2 → evalscope-0.15.0}/requirements/vlmeval.txt +0 -0
  473. {evalscope-0.13.2 → evalscope-0.15.0}/requirements.txt +0 -0
  474. {evalscope-0.13.2 → evalscope-0.15.0}/setup.cfg +0 -0
  475. {evalscope-0.13.2 → evalscope-0.15.0}/tests/__init__.py +0 -0
  476. {evalscope-0.13.2/tests/cli → evalscope-0.15.0/tests/aigc}/__init__.py +0 -0
  477. {evalscope-0.13.2/tests/perf → evalscope-0.15.0/tests/cli}/__init__.py +0 -0
  478. {evalscope-0.13.2/tests/swift → evalscope-0.15.0/tests/perf}/__init__.py +0 -0
  479. {evalscope-0.13.2/tests/vlm → evalscope-0.15.0/tests/swift}/__init__.py +0 -0
  480. {evalscope-0.13.2 → evalscope-0.15.0}/tests/swift/test_run_swift_eval.py +0 -0
  481. {evalscope-0.13.2 → evalscope-0.15.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  482. {evalscope-0.13.2 → evalscope-0.15.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  483. {evalscope-0.13.2 → evalscope-0.15.0}/tests/test_run_all.py +0 -0

{evalscope-0.13.2/evalscope.egg-info → evalscope-0.15.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.13.2
+ Version: 0.15.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -26,8 +26,10 @@ Requires-Dist: latex2sympy2
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
+ Requires-Dist: omegaconf
  Requires-Dist: openai
  Requires-Dist: pandas
+ Requires-Dist: pillow
  Requires-Dist: pyarrow
  Requires-Dist: pyyaml
  Requires-Dist: requests
@@ -39,6 +41,7 @@ Requires-Dist: seaborn
  Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: torch
+ Requires-Dist: torchvision
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: word2number
@@ -47,12 +50,12 @@ Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
  Provides-Extra: rag
- Requires-Dist: langchain<0.3.0; extra == "rag"
- Requires-Dist: langchain-community<0.3.0; extra == "rag"
- Requires-Dist: langchain-core<0.3.0; extra == "rag"
- Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+ Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+ Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+ Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+ Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
  Requires-Dist: mteb==1.19.4; extra == "rag"
- Requires-Dist: ragas==0.2.9; extra == "rag"
+ Requires-Dist: ragas==0.2.14; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: perf
  Requires-Dist: aiohttp; extra == "perf"
@@ -64,6 +67,11 @@ Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
+ Provides-Extra: aigc
+ Requires-Dist: diffusers; extra == "aigc"
+ Requires-Dist: iopath; extra == "aigc"
+ Requires-Dist: open_clip_torch; extra == "aigc"
+ Requires-Dist: opencv-python; extra == "aigc"
  Provides-Extra: all
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
@@ -75,8 +83,10 @@ Requires-Dist: latex2sympy2; extra == "all"
  Requires-Dist: matplotlib; extra == "all"
  Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
+ Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
+ Requires-Dist: pillow; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
  Requires-Dist: pyyaml; extra == "all"
  Requires-Dist: requests; extra == "all"
@@ -88,17 +98,18 @@ Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: torch; extra == "all"
+ Requires-Dist: torchvision; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
- Requires-Dist: langchain<0.3.0; extra == "all"
- Requires-Dist: langchain-community<0.3.0; extra == "all"
- Requires-Dist: langchain-core<0.3.0; extra == "all"
- Requires-Dist: langchain-openai<0.3.0; extra == "all"
+ Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+ Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+ Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+ Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
- Requires-Dist: ragas==0.2.9; extra == "all"
+ Requires-Dist: ragas==0.2.14; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
@@ -108,6 +119,10 @@ Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
  Requires-Dist: gradio==5.4.0; extra == "all"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
+ Requires-Dist: diffusers; extra == "all"
+ Requires-Dist: iopath; extra == "all"
+ Requires-Dist: open_clip_torch; extra == "all"
+ Requires-Dist: opencv-python; extra == "all"

  <p align="center">
  <br>
@@ -121,7 +136,7 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  </p>

  <p align="center">
- <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+ <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
  <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
  <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -199,6 +214,10 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
+ - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
+ - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+ - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -212,15 +231,14 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+ <details><summary>More</summary>
+
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
- <details><summary>More</summary>
-
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -503,6 +521,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i

  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)

+ **Supports swanlab for recording results**
+
+ ![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
  **Supports Speed Benchmark**

  It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
@@ -10,7 +10,7 @@
10
10
  </p>
11
11
 
12
12
  <p align="center">
13
- <img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
13
+ <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
14
14
  <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
15
15
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
16
16
  <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -88,6 +88,10 @@ Please scan the QR code below to join our community groups:
88
88
 
89
89
  ## 🎉 News
90
90
 
91
+ - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
92
+ - 🔥 **[2025.04.27]** Support for text-to-image evaluation: 8 metrics (including MPS and HPSv2.1Score) and benchmarks such as EvalMuse and GenAI-Bench are supported. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
93
+ - 🔥 **[2025.04.10]** The model service stress-testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking).
94
+ - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
91
95
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
92
96
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
93
97
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -101,15 +105,14 @@ Please scan the QR code below to join our community groups:
101
105
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including the AIME24, MATH-500, and GPQA-Diamond datasets; refer to the [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html). Also added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
102
106
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
103
107
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
108
+ <details><summary>More</summary>
109
+
104
110
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
105
111
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
106
112
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
107
113
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
108
114
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
109
115
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
110
-
111
- <details><summary>More</summary>
112
-
113
116
  - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
114
117
  - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
115
118
  - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -392,6 +395,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
392
395
 
393
396
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
394
397
 
398
+ **Supports swanlab for recording results**
399
+
400
+ ![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
401
+
395
402
  **Supports Speed Benchmark**
396
403
 
397
404
  It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
@@ -1,7 +1,7 @@
1
1
  import argparse
2
2
  import json
3
3
 
4
- from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType
4
+ from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType
5
5
 
6
6
 
7
7
  class ParseStrArgsAction(argparse.Action):
@@ -35,6 +35,7 @@ def add_argument(parser: argparse.ArgumentParser):
35
35
  parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
36
36
  parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
37
37
  parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
38
+ parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The task type of the model: text or image generation.') # noqa: E501
38
39
 
39
40
  # Template-related arguments
40
41
  parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
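The new `--model-task` flag selects between text and image generation when resolving the model. Below is a minimal, stand-alone sketch of how the flag parses, assuming `ModelTask.TEXT_GENERATION` and `ModelTask.IMAGE_GENERATION` are the string constants `'text_generation'` and `'image_generation'`; the model id is only a placeholder, and this is not the evalscope CLI itself.

```python
import argparse

# Stand-alone sketch mirroring the new --model-task argument.
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=False, help='Model id or local model dir.')
parser.add_argument(
    '--model-task',
    type=str,
    default='text_generation',                        # assumed value of ModelTask.TEXT_GENERATION
    choices=['text_generation', 'image_generation'],  # assumed value of ModelTask.IMAGE_GENERATION
    help='The task type of the model being evaluated.')

args = parser.parse_args(['--model', 'stabilityai/stable-diffusion-xl-base-1.0',  # placeholder id
                          '--model-task', 'image_generation'])
print(args.model_task)  # -> image_generation
```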
@@ -1,4 +1,4 @@
1
- from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager
1
+ from evalscope.backend.rag_eval.backend_manager import RAGEvalBackendManager, Tools
2
2
  from evalscope.backend.rag_eval.utils.clip import VisionModel
3
3
  from evalscope.backend.rag_eval.utils.embedding import EmbeddingModel
4
4
  from evalscope.backend.rag_eval.utils.llm import LLM, ChatOpenAI, LocalLLM
@@ -8,6 +8,12 @@ from evalscope.utils.logger import get_logger
8
8
  logger = get_logger()
9
9
 
10
10
 
11
+ class Tools:
12
+ MTEB = 'mteb'
13
+ RAGAS = 'ragas'
14
+ CLIP_BENCHMARK = 'clip_benchmark'
15
+
16
+
11
17
  class RAGEvalBackendManager(BackendManager):
12
18
 
13
19
  def __init__(self, config: Union[str, dict], **kwargs):
@@ -47,9 +53,19 @@ class RAGEvalBackendManager(BackendManager):
47
53
  from evalscope.backend.rag_eval.ragas.tasks import generate_testset
48
54
 
49
55
  if testset_args is not None:
50
- generate_testset(TestsetGenerationArguments(**testset_args))
56
+ if isinstance(testset_args, dict):
57
+ generate_testset(TestsetGenerationArguments(**testset_args))
58
+ elif isinstance(testset_args, TestsetGenerationArguments):
59
+ generate_testset(testset_args)
60
+ else:
61
+ raise ValueError('testset_generation arguments must be a dict or a TestsetGenerationArguments instance.')
51
62
  if eval_args is not None:
52
- rag_eval(EvaluationArguments(**eval_args))
63
+ if isinstance(eval_args, dict):
64
+ rag_eval(EvaluationArguments(**eval_args))
65
+ elif isinstance(eval_args, EvaluationArguments):
66
+ rag_eval(eval_args)
67
+ else:
68
+ raise ValueError('eval arguments must be a dict or an EvaluationArguments instance.')
53
69
 
54
70
  @staticmethod
55
71
  def run_clip_benchmark(args):
@@ -59,17 +75,17 @@ class RAGEvalBackendManager(BackendManager):
59
75
 
60
76
  def run(self, *args, **kwargs):
61
77
  tool = self.config_d.pop('tool')
62
- if tool.lower() == 'mteb':
78
+ if tool.lower() == Tools.MTEB:
63
79
  self._check_env('mteb')
64
80
  model_args = self.config_d['model']
65
81
  eval_args = self.config_d['eval']
66
82
  self.run_mteb(model_args, eval_args)
67
- elif tool.lower() == 'ragas':
83
+ elif tool.lower() == Tools.RAGAS:
68
84
  self._check_env('ragas')
69
85
  testset_args = self.config_d.get('testset_generation', None)
70
86
  eval_args = self.config_d.get('eval', None)
71
87
  self.run_ragas(testset_args, eval_args)
72
- elif tool.lower() == 'clip_benchmark':
88
+ elif tool.lower() == Tools.CLIP_BENCHMARK:
73
89
  self._check_env('webdataset')
74
90
  self.run_clip_benchmark(self.config_d['eval'])
75
91
  else:
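The `run` dispatch above keys off the `tool` field of the backend config. The following is a hedged sketch of routing a config dict through the manager directly; the exact keys under `model` and `eval` depend on the corresponding argument dataclasses for each tool, and the values shown are placeholders.

```python
# Illustrative only: field names under 'model' and 'eval' are assumptions.
from evalscope.backend.rag_eval import RAGEvalBackendManager, Tools

config = {
    'tool': Tools.MTEB,                                           # or Tools.RAGAS / Tools.CLIP_BENCHMARK
    'model': [{'model_name_or_path': 'BAAI/bge-small-zh-v1.5'}],  # placeholder embedding model args
    'eval': {'tasks': ['TNews']},                                 # placeholder evaluation args
}
RAGEvalBackendManager(config=config).run()
```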
@@ -20,6 +20,12 @@ class ModelArguments:
20
20
  encode_kwargs: dict = field(default_factory=lambda: {'show_progress_bar': True, 'batch_size': 32})
21
21
  hub: str = 'modelscope' # modelscope or huggingface
22
22
 
23
+ # for API embedding model
24
+ model_name: Optional[str] = None
25
+ api_base: Optional[str] = None
26
+ api_key: Optional[str] = None
27
+ dimensions: Optional[int] = None
28
+
23
29
  def to_dict(self) -> Dict[str, Any]:
24
30
  return {
25
31
  'model_name_or_path': self.model_name_or_path,
@@ -31,6 +37,10 @@ class ModelArguments:
31
37
  'config_kwargs': self.config_kwargs,
32
38
  'encode_kwargs': self.encode_kwargs,
33
39
  'hub': self.hub,
40
+ 'model_name': self.model_name,
41
+ 'api_base': self.api_base,
42
+ 'api_key': self.api_key,
43
+ 'dimensions': self.dimensions,
34
44
  }
35
45
 
36
46
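With the new fields, an OpenAI-API-compatible embedding service can be described directly in `ModelArguments`. A short sketch, assuming the dataclass is importable from `evalscope.backend.rag_eval.cmteb.arguments`; the endpoint, key, and model name are placeholders.

```python
from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments

api_model = ModelArguments(
    model_name_or_path='',                    # unused when an API model is configured (assumption)
    model_name='text-embedding-v3',           # served embedding model name (placeholder)
    api_base='http://127.0.0.1:8000/v1',      # any OpenAI-compatible endpoint (placeholder)
    api_key='EMPTY',
    dimensions=1024,
)
print(api_model.to_dict()['model_name'])      # -> text-embedding-v3
```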
 
@@ -21,7 +21,6 @@ class TestsetGenerationArguments:
21
21
  """
22
22
  generator_llm: Dict = field(default_factory=dict)
23
23
  embeddings: Dict = field(default_factory=dict)
24
- distribution: str = field(default_factory=lambda: {'simple': 0.5, 'multi_context': 0.4, 'reasoning': 0.1})
25
24
  # For LLM based evaluation
26
25
  # available: ['english', 'hindi', 'marathi', 'chinese', 'spanish', 'amharic', 'arabic',
27
26
  # 'armenian', 'bulgarian', 'urdu', 'russian', 'polish', 'persian', 'dutch', 'danish',
@@ -67,9 +67,14 @@ def get_persona(llm, kg, language):
67
67
 
68
68
 
69
69
  def load_data(file_path):
70
- from langchain_community.document_loaders import UnstructuredFileLoader
70
+ import nltk
71
+ from langchain_unstructured import UnstructuredLoader
71
72
 
72
- loader = UnstructuredFileLoader(file_path, mode='single')
73
+ try:
+     nltk.data.find('taggers/averaged_perceptron_tagger_eng')
+ except LookupError:
+     # nltk.data.find raises LookupError when the tagger data is missing;
+     # download it on first use
+     nltk.download('averaged_perceptron_tagger_eng')
76
+
77
+ loader = UnstructuredLoader(file_path)
73
78
  data = loader.load()
74
79
  return data
75
80
 
@@ -2,7 +2,6 @@ import asyncio
2
2
  import os
3
3
  from ragas.llms import BaseRagasLLM
4
4
  from ragas.prompt import PromptMixin, PydanticPrompt
5
- from ragas.utils import RAGAS_SUPPORTED_LANGUAGE_CODES
6
5
  from typing import List
7
6
 
8
7
  from evalscope.utils.logger import get_logger
@@ -16,10 +15,6 @@ async def translate_prompt(
16
15
  llm: BaseRagasLLM,
17
16
  adapt_instruction: bool = False,
18
17
  ):
19
- if target_lang not in RAGAS_SUPPORTED_LANGUAGE_CODES:
20
- logger.warning(f'{target_lang} is not in supported language: {list(RAGAS_SUPPORTED_LANGUAGE_CODES)}')
21
- return
22
-
23
18
  if not issubclass(type(prompt_user), PromptMixin):
24
19
  logger.info(f"{prompt_user} is not a PromptMixin, don't translate it")
25
20
  return
@@ -1,10 +1,12 @@
1
1
  import os
2
2
  import torch
3
3
  from langchain_core.embeddings import Embeddings
4
+ from langchain_openai.embeddings import OpenAIEmbeddings
4
5
  from sentence_transformers import models
5
6
  from sentence_transformers.cross_encoder import CrossEncoder
6
7
  from sentence_transformers.SentenceTransformer import SentenceTransformer
7
8
  from torch import Tensor
9
+ from tqdm import tqdm
8
10
  from typing import Dict, List, Optional, Union
9
11
 
10
12
  from evalscope.backend.rag_eval.utils.tools import download_model
@@ -18,10 +20,10 @@ class BaseModel(Embeddings):
18
20
 
19
21
  def __init__(
20
22
  self,
21
- model_name_or_path: str,
23
+ model_name_or_path: str = '',
22
24
  max_seq_length: int = 512,
23
25
  prompt: str = '',
24
- revision: Optional[str] = None,
26
+ revision: Optional[str] = 'master',
25
27
  **kwargs,
26
28
  ):
27
29
  self.model_name_or_path = model_name_or_path
@@ -139,7 +141,7 @@ class CrossEncoderModel(BaseModel):
139
141
  max_length=self.max_seq_length,
140
142
  )
141
143
 
142
- def predict(self, sentences: List[List[str]], **kwargs) -> List[List[float]]:
144
+ def predict(self, sentences: List[List[str]], **kwargs) -> Tensor:
143
145
  self.encode_kwargs.update(kwargs)
144
146
 
145
147
  if len(sentences[0]) == 3: # Note: For mteb retrieval task
@@ -154,6 +156,46 @@ class CrossEncoderModel(BaseModel):
154
156
  return embeddings
155
157
 
156
158
 
159
+ class APIEmbeddingModel(BaseModel):
160
+
161
+ def __init__(self, **kwargs):
162
+ self.model_name = kwargs.get('model_name')
163
+ self.openai_api_base = kwargs.get('api_base')
164
+ self.openai_api_key = kwargs.get('api_key')
165
+ self.dimensions = kwargs.get('dimensions')
166
+
167
+ self.model = OpenAIEmbeddings(
168
+ model=self.model_name,
169
+ openai_api_base=self.openai_api_base,
170
+ openai_api_key=self.openai_api_key,
171
+ dimensions=self.dimensions,
172
+ check_embedding_ctx_length=False)
173
+
174
+ super().__init__(model_name_or_path=self.model_name, **kwargs)
175
+
176
+ self.batch_size = self.encode_kwargs.get('batch_size', 10)
177
+
178
+ def encode(self, texts: Union[str, List[str]], **kwargs) -> Tensor:
179
+ if isinstance(texts, str):
180
+ texts = [texts]
181
+
182
+ embeddings: List[List[float]] = []
183
+ for i in tqdm(range(0, len(texts), self.batch_size)):
184
+ response = self.model.embed_documents(texts[i:i + self.batch_size], chunk_size=self.batch_size)
185
+ embeddings.extend(response)
186
+ return torch.tensor(embeddings)
187
+
188
+ def encode_queries(self, queries, **kwargs):
189
+ return self.encode(queries, **kwargs)
190
+
191
+ def encode_corpus(self, corpus, **kwargs):
192
+ if isinstance(corpus[0], dict):
193
+ input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
194
+ else:
195
+ input_texts = corpus
196
+ return self.encode(input_texts, **kwargs)
197
+
198
+
157
199
  class EmbeddingModel:
158
200
  """Custom embeddings"""
159
201
 
@@ -165,6 +207,10 @@ class EmbeddingModel:
165
207
  revision: Optional[str] = 'master',
166
208
  **kwargs,
167
209
  ):
210
+ if kwargs.get('model_name'):
211
+ # If model_name is provided, use OpenAIEmbeddings
212
+ return APIEmbeddingModel(**kwargs)
213
+
168
214
  # If model path does not exist and hub is 'modelscope', download the model
169
215
  if not os.path.exists(model_name_or_path) and hub == HubType.MODELSCOPE:
170
216
  model_name_or_path = download_model(model_name_or_path, revision)
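A hedged usage sketch of the new `APIEmbeddingModel` against an OpenAI-compatible embedding endpoint; the URL, key, and model name are placeholders, and a reachable service is required for this to actually run.

```python
from evalscope.backend.rag_eval.utils.embedding import APIEmbeddingModel

emb = APIEmbeddingModel(
    model_name='text-embedding-v3',           # served embedding model (placeholder)
    api_base='http://127.0.0.1:8000/v1',      # OpenAI-compatible endpoint (placeholder)
    api_key='EMPTY',
    dimensions=1024,
)
vectors = emb.encode(['what is RAG?', 'retrieval augmented generation'])
print(vectors.shape)                          # expected: torch.Size([2, 1024])
```

The `EmbeddingModel` factory shown above takes the same route automatically whenever `model_name` is present in its keyword arguments.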
@@ -2,7 +2,7 @@ import os
2
2
  from langchain_core.callbacks.manager import CallbackManagerForLLMRun
3
3
  from langchain_core.language_models.llms import LLM as BaseLLM
4
4
  from langchain_openai import ChatOpenAI
5
- from modelscope.utils.hf_util import GenerationConfig
5
+ from transformers.generation.configuration_utils import GenerationConfig
6
6
  from typing import Any, Dict, Iterator, List, Mapping, Optional
7
7
 
8
8
  from evalscope.constants import DEFAULT_MODEL_REVISION
@@ -16,9 +16,9 @@ class LLM:
16
16
  api_base = kw.get('api_base', None)
17
17
  if api_base:
18
18
  return ChatOpenAI(
19
- model_name=kw.get('model_name', ''),
20
- openai_api_base=api_base,
21
- openai_api_key=kw.get('api_key', 'EMPTY'),
19
+ model=kw.get('model_name', ''),
20
+ base_url=api_base,
21
+ api_key=kw.get('api_key', 'EMPTY'),
22
22
  )
23
23
  else:
24
24
  return LocalLLM(**kw)
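The constructor now uses the current `langchain_openai` keyword names (`model`, `base_url`, `api_key`) instead of the deprecated `model_name` / `openai_api_base` / `openai_api_key`. A minimal sketch against an OpenAI-compatible endpoint; the model name, URL, and key are placeholders.

```python
from langchain_openai import ChatOpenAI

judge = ChatOpenAI(
    model='qwen2.5-7b-instruct',           # served model name (placeholder)
    base_url='http://127.0.0.1:8000/v1',   # OpenAI-compatible endpoint (placeholder)
    api_key='EMPTY',
)
print(judge.invoke('Reply with the single word: ok').content)
```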
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ import os
2
3
  import subprocess
3
4
  from functools import partial
4
5
  from typing import Optional, Union
@@ -66,8 +67,9 @@ class VLMEvalKitBackendManager(BackendManager):
66
67
  del remain_cfg['name'] # remove not used args
67
68
  del remain_cfg['type'] # remove not used args
68
69
 
69
- self.valid_models.update({model_type: partial(model_class, model=model_type, **remain_cfg)})
70
- new_model_names.append(model_type)
70
+ norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
71
+ self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
72
+ new_model_names.append(norm_model_type)
71
73
  else:
72
74
  remain_cfg = copy.deepcopy(model_cfg)
73
75
  del remain_cfg['name'] # remove not used args
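The normalization above turns an arbitrary model id (which may contain path separators, colons, or dots) into a safe registry key. A small sketch of the same string handling; the model id is hypothetical.

```python
import os

model_type = 'Qwen/Qwen2.5-VL-7B-Instruct:latest'   # hypothetical model id
norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
print(norm_model_type)  # -> Qwen2_5-VL-7B-Instruct-latest
```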
@@ -10,8 +10,8 @@ from evalscope.utils import get_logger
10
10
  logger = get_logger()
11
11
 
12
12
  # Using glob to find all files matching the pattern
13
- pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
14
- files = glob.glob(pattern, recursive=False)
13
+ pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
14
+ files = glob.glob(pattern, recursive=True)
15
15
 
16
16
  for file_path in files:
17
17
  if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
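With the `**` wildcard and `recursive=True`, adapters nested one level deeper (such as the new `aigc/t2i/*_adapter.py` files) are also discovered. A small sketch of the pattern behaviour; the directory root is illustrative.

```python
import glob
import os

# Matches both benchmarks/<name>/<name>_adapter.py and deeper paths
# such as benchmarks/aigc/t2i/evalmuse_adapter.py.
benchmarks_dir = os.path.join('evalscope', 'benchmarks')            # placeholder root
pattern = os.path.join(benchmarks_dir, '*', '**', '*_adapter.py')
for file_path in glob.glob(pattern, recursive=True):
    print(file_path)
```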
@@ -0,0 +1,56 @@
1
+ from typing import List, Optional, Union
2
+
3
+ from evalscope.benchmarks import DataAdapter
4
+ from evalscope.metrics import mean, metric_registry
5
+ from evalscope.utils.logger import get_logger
6
+
7
+ logger = get_logger()
8
+
9
+
10
+ class T2IBaseAdapter(DataAdapter):
11
+
12
+ def __init__(self, **kwargs):
13
+
14
+ super().__init__(**kwargs)
15
+
16
+ logger.info(f'Initializing metrics: {self.metric_list}')
17
+ self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
18
+
19
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
20
+ # dummy prompt for general t2i
21
+ return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
22
+
23
+ def get_gold_answer(self, input_d: dict) -> str:
24
+ # dummy gold answer for general t2i
25
+ return input_d.get('prompt', '')
26
+
27
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
28
+ # dummy parse pred result for general t2i
29
+ return result or raw_input_d.get('image_path', '')
30
+
31
+ def match(self, gold: str, pred: str) -> dict:
32
+ # dummy match for general t2i
33
+ # pred is the image path, gold is the prompt
34
+ res = {}
35
+ for metric_name, metric_func in self.metrics.items():
36
+ score = metric_func(images=[pred], texts=[gold])[0][0]
37
+ if isinstance(score, dict):
38
+ for k, v in score.items():
39
+ res[f'{metric_name}_{k}'] = v.cpu().item()
40
+ else:
41
+ res[metric_name] = score.cpu().item()  # move the tensor score to CPU and convert to a Python float
42
+ return res
43
+
44
+ def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
45
+ """
46
+ Compute the mean of each metric over all samples.
47
+
48
+ Args:
49
+ review_res_list: [score1, score2, ...]
50
+
51
+ Returns:
52
+ avg_res: List[dict]
53
+
54
+ """
55
+ items = super().compute_dict_metric(review_res_list, **kwargs)
56
+ return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
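`match` flattens whatever each metric returns: a scalar tensor becomes one entry, while a dict of per-aspect tensors becomes one entry per key. A hedged sketch of that flattening with plain floats standing in for tensors; the metric names and scores are made up.

```python
def flatten_scores(metric_name, score):
    """Mirror of the aggregation in T2IBaseAdapter.match, using floats instead of tensors."""
    res = {}
    if isinstance(score, dict):
        for k, v in score.items():
            res[f'{metric_name}_{k}'] = v
    else:
        res[metric_name] = score
    return res

print(flatten_scores('PickScore', 0.21))
# -> {'PickScore': 0.21}
print(flatten_scores('FGA_BLIP2Score', {'overall': 3.4, 'dog (animal)': 0.8}))
# -> {'FGA_BLIP2Score_overall': 3.4, 'FGA_BLIP2Score_dog (animal)': 0.8}
```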
@@ -0,0 +1,77 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os.path
3
+ from collections import defaultdict
4
+ from typing import List, Optional, Union
5
+
6
+ from evalscope.benchmarks import Benchmark
7
+ from evalscope.constants import OutputType
8
+ from evalscope.metrics import mean
9
+ from evalscope.utils.io_utils import jsonl_to_list
10
+ from evalscope.utils.logger import get_logger
11
+ from .base import T2IBaseAdapter
12
+
13
+ logger = get_logger()
14
+
15
+
16
+ @Benchmark.register(
17
+ name='evalmuse',
18
+ dataset_id='AI-ModelScope/T2V-Eval-Prompts',
19
+ model_adapter=OutputType.IMAGE_GENERATION,
20
+ output_types=[OutputType.IMAGE_GENERATION],
21
+ subset_list=['EvalMuse'],
22
+ metric_list=['FGA_BLIP2Score'],
23
+ few_shot_num=0,
24
+ train_split=None,
25
+ eval_split='test',
26
+ )
27
+ class EvalMuseAdapter(T2IBaseAdapter):
28
+
29
+ def __init__(self, **kwargs):
30
+ super().__init__(**kwargs)
31
+
32
+ def load(self, **kwargs) -> dict:
33
+ if os.path.isfile(self.dataset_id):
34
+ data_list = jsonl_to_list(self.dataset_id)
35
+ data_dict = {self.subset_list[0]: {'test': data_list}}
36
+ return data_dict
37
+ else:
38
+ return super().load(**kwargs)
39
+
40
+ def get_gold_answer(self, input_d: dict) -> dict:
41
+ # return prompt and elements dict
42
+ return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
43
+
44
+ def match(self, gold: dict, pred: str) -> dict:
45
+ # dummy match for general t2i
46
+ # pred is the image path, gold is the prompt
47
+ res = {}
48
+ for metric_name, metric_func in self.metrics.items():
49
+ if metric_name == 'FGA_BLIP2Score':
50
+ # For FGA_BLIP2Score, we need to pass the dictionary
51
+ score = metric_func(images=[pred], texts=[gold])[0][0]
52
+ else:
53
+ score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
54
+ if isinstance(score, dict):
55
+ for k, v in score.items():
56
+ res[f'{metric_name}:{k}'] = v.cpu().item()
57
+ else:
58
+ res[metric_name] = score.cpu().item()
59
+ return res
60
+
61
+ def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
62
+ """
63
+ compute weighted mean of the bleu score of all samples
64
+ """
65
+ items = super().compute_dict_metric(review_res_list, **kwargs)
66
+ # add statistics for each metric
67
+ new_items = defaultdict(list)
68
+ for metric_name, value_list in items.items():
69
+ if 'FGA_BLIP2Score' in metric_name and '(' in metric_name: # FGA_BLIP2Score element score
70
+ metrics_prefix = metric_name.split(':')[0]
71
+ category = metric_name.rpartition('(')[-1].split(')')[0]
72
+ new_items[f'{metrics_prefix}:{category}'].extend(value_list)
73
+ else:
74
+ new_items[metric_name].extend(value_list)
75
+
76
+ # calculate mean for each metric
77
+ return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
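The grouping above relies on the element key format `metric:element (category)`: everything inside the parentheses becomes the aggregation bucket. A small sketch of that string handling with made-up tag names and scores.

```python
from collections import defaultdict

# Mirrors the key regrouping in EvalMuseAdapter.compute_metric; values are made up.
items = {
    'FGA_BLIP2Score:overall': [3.4, 3.1],
    'FGA_BLIP2Score:dog (animal)': [0.8],
    'FGA_BLIP2Score:red (color)': [0.6, 0.9],
}
new_items = defaultdict(list)
for metric_name, value_list in items.items():
    if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:   # element-level score
        metrics_prefix = metric_name.split(':')[0]
        category = metric_name.rpartition('(')[-1].split(')')[0]
        new_items[f'{metrics_prefix}:{category}'].extend(value_list)
    else:
        new_items[metric_name].extend(value_list)

print(dict(new_items))
# -> {'FGA_BLIP2Score:overall': [3.4, 3.1],
#     'FGA_BLIP2Score:animal': [0.8],
#     'FGA_BLIP2Score:color': [0.6, 0.9]}
```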