evalscope 0.17.0__tar.gz → 1.0.0__tar.gz

This diff shows the content changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (601)
  1. {evalscope-0.17.0 → evalscope-1.0.0}/PKG-INFO +120 -70
  2. evalscope-0.17.0/evalscope.egg-info/PKG-INFO → evalscope-1.0.0/README.md +114 -93
  3. evalscope-1.0.0/evalscope/__init__.py +8 -0
  4. evalscope-1.0.0/evalscope/api/benchmark/__init__.py +3 -0
  5. evalscope-1.0.0/evalscope/api/benchmark/adapters/__init__.py +3 -0
  6. evalscope-1.0.0/evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  7. evalscope-1.0.0/evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  8. evalscope-1.0.0/evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  9. evalscope-1.0.0/evalscope/api/benchmark/benchmark.py +321 -0
  10. evalscope-1.0.0/evalscope/api/benchmark/meta.py +115 -0
  11. evalscope-1.0.0/evalscope/api/dataset/__init__.py +2 -0
  12. evalscope-1.0.0/evalscope/api/dataset/dataset.py +349 -0
  13. evalscope-1.0.0/evalscope/api/dataset/loader.py +261 -0
  14. evalscope-1.0.0/evalscope/api/dataset/utils.py +143 -0
  15. evalscope-1.0.0/evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope-1.0.0/evalscope/api/evaluator/cache.py +355 -0
  17. evalscope-1.0.0/evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope-1.0.0/evalscope/api/evaluator/state.py +264 -0
  19. evalscope-1.0.0/evalscope/api/filter/__init__.py +1 -0
  20. evalscope-1.0.0/evalscope/api/filter/filter.py +72 -0
  21. evalscope-1.0.0/evalscope/api/messages/__init__.py +11 -0
  22. evalscope-1.0.0/evalscope/api/messages/chat_message.py +198 -0
  23. evalscope-1.0.0/evalscope/api/messages/content.py +102 -0
  24. evalscope-1.0.0/evalscope/api/messages/utils.py +35 -0
  25. evalscope-1.0.0/evalscope/api/metric/__init__.py +2 -0
  26. evalscope-1.0.0/evalscope/api/metric/metric.py +55 -0
  27. evalscope-1.0.0/evalscope/api/metric/scorer.py +105 -0
  28. evalscope-1.0.0/evalscope/api/mixin/__init__.py +2 -0
  29. evalscope-1.0.0/evalscope/api/mixin/dataset_mixin.py +105 -0
  30. evalscope-1.0.0/evalscope/api/mixin/llm_judge_mixin.py +168 -0
  31. evalscope-1.0.0/evalscope/api/model/__init__.py +12 -0
  32. evalscope-1.0.0/evalscope/api/model/generate_config.py +157 -0
  33. evalscope-1.0.0/evalscope/api/model/model.py +383 -0
  34. evalscope-1.0.0/evalscope/api/model/model_output.py +285 -0
  35. evalscope-1.0.0/evalscope/api/registry.py +182 -0
  36. evalscope-1.0.0/evalscope/api/tool/__init__.py +3 -0
  37. evalscope-1.0.0/evalscope/api/tool/tool_call.py +101 -0
  38. evalscope-1.0.0/evalscope/api/tool/tool_info.py +173 -0
  39. evalscope-1.0.0/evalscope/api/tool/utils.py +64 -0
  40. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/app_ui.py +2 -1
  41. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/multi_model.py +50 -25
  42. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/single_model.py +23 -11
  43. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/data_utils.py +42 -26
  44. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/text_utils.py +0 -2
  45. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/visualization.py +9 -4
  46. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/arguments.py +6 -7
  47. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/api_meta_template.py +2 -1
  48. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/backend_manager.py +6 -3
  49. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  50. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  51. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  52. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  53. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  54. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  55. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  56. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/embedding.py +2 -1
  57. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/llm.py +13 -12
  58. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/__init__.py +0 -2
  59. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py → evalscope-1.0.0/evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +1 -15
  60. evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +76 -0
  61. evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +53 -0
  62. evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +42 -0
  63. evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +47 -0
  64. evalscope-1.0.0/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +26 -0
  65. evalscope-1.0.0/evalscope/benchmarks/aime/aime24_adapter.py +50 -0
  66. evalscope-1.0.0/evalscope/benchmarks/aime/aime25_adapter.py +46 -0
  67. evalscope-1.0.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +133 -0
  68. evalscope-1.0.0/evalscope/benchmarks/arc/arc_adapter.py +46 -0
  69. evalscope-1.0.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +148 -0
  70. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope-1.0.0/evalscope/benchmarks/bbh/bbh_adapter.py +175 -0
  72. evalscope-1.0.0/evalscope/benchmarks/bfcl/bfcl_adapter.py +258 -0
  73. evalscope-1.0.0/evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope-1.0.0/evalscope/benchmarks/ceval/ceval_adapter.py +170 -0
  75. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope-1.0.0/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +122 -0
  77. evalscope-1.0.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +73 -0
  78. evalscope-1.0.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +210 -0
  79. evalscope-1.0.0/evalscope/benchmarks/docmath/docmath_adapter.py +143 -0
  80. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/docmath/utils.py +4 -5
  81. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope-1.0.0/evalscope/benchmarks/frames/frames_adapter.py +174 -0
  83. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope-1.0.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +58 -0
  86. evalscope-1.0.0/evalscope/benchmarks/general_qa/general_qa_adapter.py +94 -0
  87. evalscope-1.0.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +90 -0
  88. evalscope-0.17.0/evalscope/benchmarks/gpqa/chain_of_thought.txt → evalscope-1.0.0/evalscope/benchmarks/gpqa/prompt.py +12 -5
  89. evalscope-1.0.0/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +76 -0
  90. evalscope-1.0.0/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +62 -0
  91. evalscope-1.0.0/evalscope/benchmarks/hle/hle_adapter.py +152 -0
  92. evalscope-1.0.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +124 -0
  93. evalscope-1.0.0/evalscope/benchmarks/ifeval/ifeval_adapter.py +83 -0
  94. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope-1.0.0/evalscope/benchmarks/iquiz/iquiz_adapter.py +35 -0
  98. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope-1.0.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +138 -0
  100. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope-1.0.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +56 -0
  103. evalscope-1.0.0/evalscope/benchmarks/math_500/math_500_adapter.py +51 -0
  104. evalscope-1.0.0/evalscope/benchmarks/mmlu/mmlu_adapter.py +107 -0
  105. evalscope-1.0.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +94 -0
  106. evalscope-1.0.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +139 -0
  107. evalscope-1.0.0/evalscope/benchmarks/musr/musr_adapter.py +43 -0
  108. evalscope-1.0.0/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +388 -0
  109. evalscope-1.0.0/evalscope/benchmarks/process_bench/process_bench_adapter.py +170 -0
  110. evalscope-1.0.0/evalscope/benchmarks/race/race_adapter.py +49 -0
  111. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope-0.17.0/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt → evalscope-1.0.0/evalscope/benchmarks/super_gpqa/prompt.py +14 -16
  113. evalscope-1.0.0/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +165 -0
  114. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope-1.0.0/evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope-1.0.0/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +168 -0
  117. evalscope-1.0.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +102 -0
  118. evalscope-1.0.0/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +74 -0
  119. evalscope-1.0.0/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +92 -0
  120. evalscope-1.0.0/evalscope/benchmarks/winogrande/winogrande_adapter.py +34 -0
  121. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/cli.py +2 -0
  122. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_server.py +6 -3
  123. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/__init__.py +2 -10
  124. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/sampler.py +10 -10
  125. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/collections/schema.py +13 -11
  126. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/config.py +95 -54
  127. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/constants.py +34 -82
  128. evalscope-1.0.0/evalscope/evaluator/__init__.py +3 -0
  129. evalscope-1.0.0/evalscope/evaluator/evaluator.py +337 -0
  130. evalscope-1.0.0/evalscope/filters/__init__.py +2 -0
  131. evalscope-1.0.0/evalscope/filters/extraction.py +126 -0
  132. evalscope-1.0.0/evalscope/filters/selection.py +57 -0
  133. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/__init__.py +16 -14
  134. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/llm_judge.py +37 -34
  135. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/math_parser.py +27 -22
  136. evalscope-1.0.0/evalscope/metrics/metric.py +307 -0
  137. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/metrics.py +41 -25
  138. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-1.0.0/evalscope/metrics/t2v_metrics}/__init__.py +0 -0
  139. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  164. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  165. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  166. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  167. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  168. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  169. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  170. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  171. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  172. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  173. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  174. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  175. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  176. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  177. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  178. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  179. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  180. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  181. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  182. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  183. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  184. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  185. evalscope-1.0.0/evalscope/models/__init__.py +26 -0
  186. evalscope-1.0.0/evalscope/models/mockllm.py +65 -0
  187. evalscope-1.0.0/evalscope/models/model_apis.py +47 -0
  188. evalscope-1.0.0/evalscope/models/modelscope.py +455 -0
  189. evalscope-1.0.0/evalscope/models/openai_compatible.py +123 -0
  190. evalscope-1.0.0/evalscope/models/text2image_model.py +124 -0
  191. evalscope-1.0.0/evalscope/models/utils/openai.py +698 -0
  192. evalscope-1.0.0/evalscope/perf/__init__.py +0 -0
  193. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/arguments.py +13 -0
  194. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/benchmark.py +39 -39
  195. evalscope-1.0.0/evalscope/perf/http_client.py +122 -0
  196. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/main.py +2 -2
  197. evalscope-1.0.0/evalscope/perf/plugin/__init__.py +3 -0
  198. evalscope-1.0.0/evalscope/perf/plugin/api/__init__.py +4 -0
  199. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/base.py +22 -4
  200. evalscope-1.0.0/evalscope/perf/plugin/api/custom_api.py +250 -0
  201. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
  202. evalscope-1.0.0/evalscope/perf/plugin/api/default_api.py +105 -0
  203. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/api/openai_api.py +28 -28
  204. evalscope-1.0.0/evalscope/perf/plugin/datasets/__init__.py +10 -0
  205. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/base.py +22 -1
  206. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/custom.py +4 -2
  207. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  208. evalscope-1.0.0/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  209. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/line_by_line.py +4 -2
  210. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/longalpaca.py +4 -2
  211. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/openqa.py +6 -3
  212. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  213. evalscope-1.0.0/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  214. evalscope-1.0.0/evalscope/perf/plugin/registry.py +74 -0
  215. evalscope-1.0.0/evalscope/perf/utils/__init__.py +0 -0
  216. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/benchmark_util.py +18 -22
  217. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/db_util.py +81 -60
  218. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/local_server.py +8 -3
  219. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/rich_display.py +16 -10
  220. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/__init__.py +2 -2
  221. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/combinator.py +18 -12
  222. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/report/generator.py +101 -6
  223. evalscope-0.17.0/evalscope/report/utils.py → evalscope-1.0.0/evalscope/report/report.py +8 -6
  224. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/run.py +26 -44
  225. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/summarizer.py +1 -1
  226. evalscope-1.0.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  227. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/__init__.py +21 -2
  228. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/chat_service.py +2 -1
  229. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope-1.0.0/evalscope/utils/function_utils.py +29 -0
  231. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/io_utils.py +110 -5
  232. evalscope-1.0.0/evalscope/utils/json_schema.py +208 -0
  233. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/logger.py +51 -12
  234. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/model_utils.py +10 -7
  235. evalscope-1.0.0/evalscope/utils/multi_choices.py +271 -0
  236. evalscope-1.0.0/evalscope/utils/url_utils.py +65 -0
  237. evalscope-1.0.0/evalscope/version.py +4 -0
  238. evalscope-0.17.0/README.md → evalscope-1.0.0/evalscope.egg-info/PKG-INFO +143 -66
  239. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/SOURCES.txt +67 -42
  240. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/requires.txt +30 -9
  241. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/aigc.txt +1 -0
  242. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/app.txt +1 -1
  243. evalscope-1.0.0/requirements/dev.txt +5 -0
  244. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/framework.txt +7 -4
  245. {evalscope-0.17.0 → evalscope-1.0.0}/setup.cfg +15 -6
  246. {evalscope-0.17.0 → evalscope-1.0.0}/setup.py +33 -15
  247. {evalscope-0.17.0 → evalscope-1.0.0}/tests/aigc/test_t2i.py +22 -4
  248. evalscope-1.0.0/tests/benchmark/test_eval.py +386 -0
  249. {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_all.py +21 -7
  250. {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_collection.py +13 -4
  251. {evalscope-0.17.0 → evalscope-1.0.0}/tests/cli/test_custom.py +22 -15
  252. {evalscope-0.17.0 → evalscope-1.0.0}/tests/perf/test_perf.py +29 -2
  253. evalscope-1.0.0/tests/rag/__init__.py +0 -0
  254. {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_clip_benchmark.py +1 -0
  255. evalscope-1.0.0/tests/vlm/__init__.py +1 -0
  256. evalscope-0.17.0/evalscope/__init__.py +0 -5
  257. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/base.py +0 -56
  258. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  259. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  260. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  261. evalscope-0.17.0/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  262. evalscope-0.17.0/evalscope/benchmarks/aime/aime24_adapter.py +0 -52
  263. evalscope-0.17.0/evalscope/benchmarks/aime/aime25_adapter.py +0 -52
  264. evalscope-0.17.0/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -107
  265. evalscope-0.17.0/evalscope/benchmarks/arc/ai2_arc.py +0 -151
  266. evalscope-0.17.0/evalscope/benchmarks/arc/arc_adapter.py +0 -159
  267. evalscope-0.17.0/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -122
  268. evalscope-0.17.0/evalscope/benchmarks/bbh/bbh_adapter.py +0 -247
  269. evalscope-0.17.0/evalscope/benchmarks/benchmark.py +0 -81
  270. evalscope-0.17.0/evalscope/benchmarks/bfcl/bfcl_adapter.py +0 -237
  271. evalscope-0.17.0/evalscope/benchmarks/ceval/ceval_adapter.py +0 -238
  272. evalscope-0.17.0/evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  273. evalscope-0.17.0/evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  274. evalscope-0.17.0/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -213
  275. evalscope-0.17.0/evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  276. evalscope-0.17.0/evalscope/benchmarks/competition_math/competition_math.py +0 -79
  277. evalscope-0.17.0/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -125
  278. evalscope-0.17.0/evalscope/benchmarks/data_adapter.py +0 -523
  279. evalscope-0.17.0/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -72
  280. evalscope-0.17.0/evalscope/benchmarks/docmath/docmath_adapter.py +0 -85
  281. evalscope-0.17.0/evalscope/benchmarks/filters.py +0 -59
  282. evalscope-0.17.0/evalscope/benchmarks/frames/frames_adapter.py +0 -91
  283. evalscope-0.17.0/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -118
  284. evalscope-0.17.0/evalscope/benchmarks/general_qa/general_qa_adapter.py +0 -154
  285. evalscope-0.17.0/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -129
  286. evalscope-0.17.0/evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  287. evalscope-0.17.0/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -156
  288. evalscope-0.17.0/evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  289. evalscope-0.17.0/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -151
  290. evalscope-0.17.0/evalscope/benchmarks/humaneval/humaneval.py +0 -79
  291. evalscope-0.17.0/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -109
  292. evalscope-0.17.0/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -54
  293. evalscope-0.17.0/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -70
  294. evalscope-0.17.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -88
  295. evalscope-0.17.0/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -82
  296. evalscope-0.17.0/evalscope/benchmarks/math_500/math_500_adapter.py +0 -58
  297. evalscope-0.17.0/evalscope/benchmarks/mmlu/mmlu.py +0 -160
  298. evalscope-0.17.0/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -280
  299. evalscope-0.17.0/evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  300. evalscope-0.17.0/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -113
  301. evalscope-0.17.0/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -185
  302. evalscope-0.17.0/evalscope/benchmarks/musr/musr_adapter.py +0 -74
  303. evalscope-0.17.0/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -348
  304. evalscope-0.17.0/evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  305. evalscope-0.17.0/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -102
  306. evalscope-0.17.0/evalscope/benchmarks/race/race.py +0 -104
  307. evalscope-0.17.0/evalscope/benchmarks/race/race_adapter.py +0 -135
  308. evalscope-0.17.0/evalscope/benchmarks/race/samples.jsonl +0 -5
  309. evalscope-0.17.0/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -209
  310. evalscope-0.17.0/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  311. evalscope-0.17.0/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +0 -75
  312. evalscope-0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  313. evalscope-0.17.0/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -142
  314. evalscope-0.17.0/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  315. evalscope-0.17.0/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -287
  316. evalscope-0.17.0/evalscope/benchmarks/utils.py +0 -59
  317. evalscope-0.17.0/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -60
  318. evalscope-0.17.0/evalscope/collections/evaluator.py +0 -375
  319. evalscope-0.17.0/evalscope/evaluator/__init__.py +0 -3
  320. evalscope-0.17.0/evalscope/evaluator/evaluator.py +0 -481
  321. evalscope-0.17.0/evalscope/metrics/completion_parsers.py +0 -220
  322. evalscope-0.17.0/evalscope/metrics/named_metrics.py +0 -55
  323. evalscope-0.17.0/evalscope/metrics/t2v_metrics/__init__.py +0 -52
  324. evalscope-0.17.0/evalscope/models/__init__.py +0 -53
  325. evalscope-0.17.0/evalscope/models/adapters/__init__.py +0 -19
  326. evalscope-0.17.0/evalscope/models/adapters/base_adapter.py +0 -80
  327. evalscope-0.17.0/evalscope/models/adapters/bfcl_adapter.py +0 -244
  328. evalscope-0.17.0/evalscope/models/adapters/chat_adapter.py +0 -204
  329. evalscope-0.17.0/evalscope/models/adapters/choice_adapter.py +0 -218
  330. evalscope-0.17.0/evalscope/models/adapters/custom_adapter.py +0 -67
  331. evalscope-0.17.0/evalscope/models/adapters/server_adapter.py +0 -234
  332. evalscope-0.17.0/evalscope/models/adapters/t2i_adapter.py +0 -76
  333. evalscope-0.17.0/evalscope/models/custom/__init__.py +0 -4
  334. evalscope-0.17.0/evalscope/models/custom/custom_model.py +0 -50
  335. evalscope-0.17.0/evalscope/models/custom/dummy_model.py +0 -99
  336. evalscope-0.17.0/evalscope/models/local_model.py +0 -128
  337. evalscope-0.17.0/evalscope/models/model.py +0 -189
  338. evalscope-0.17.0/evalscope/models/register.py +0 -55
  339. evalscope-0.17.0/evalscope/perf/http_client.py +0 -176
  340. evalscope-0.17.0/evalscope/perf/plugin/__init__.py +0 -2
  341. evalscope-0.17.0/evalscope/perf/plugin/api/__init__.py +0 -3
  342. evalscope-0.17.0/evalscope/perf/plugin/api/custom_api.py +0 -92
  343. evalscope-0.17.0/evalscope/perf/plugin/datasets/__init__.py +0 -7
  344. evalscope-0.17.0/evalscope/perf/plugin/registry.py +0 -54
  345. evalscope-0.17.0/evalscope/version.py +0 -4
  346. evalscope-0.17.0/requirements/dev.txt +0 -5
  347. evalscope-0.17.0/tests/cli/test_run.py +0 -501
  348. {evalscope-0.17.0 → evalscope-1.0.0}/LICENSE +0 -0
  349. {evalscope-0.17.0 → evalscope-1.0.0}/MANIFEST.in +0 -0
  350. {evalscope-0.17.0/evalscope/backend → evalscope-1.0.0/evalscope/api}/__init__.py +0 -0
  351. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/__init__.py +0 -0
  352. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/app.py +0 -0
  353. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/arguments.py +0 -0
  354. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/constants.py +0 -0
  355. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/__init__.py +0 -0
  356. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/sidebar.py +0 -0
  357. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/ui/visualization.py +0 -0
  358. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/app/utils/localization.py +0 -0
  359. {evalscope-0.17.0/evalscope/backend/rag_eval/clip_benchmark/tasks → evalscope-1.0.0/evalscope/backend}/__init__.py +0 -0
  360. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/base.py +0 -0
  361. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/__init__.py +0 -0
  362. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  363. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  364. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  365. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  366. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  367. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  368. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  369. {evalscope-0.17.0/evalscope/backend/rag_eval/utils → evalscope-1.0.0/evalscope/backend/rag_eval/clip_benchmark/tasks}/__init__.py +0 -0
  370. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  371. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  372. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  373. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  374. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  375. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  376. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  377. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  378. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  379. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  380. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  381. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  382. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  383. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  384. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  385. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  386. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  387. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  388. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  389. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  390. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  391. {evalscope-0.17.0/evalscope/benchmarks/aigc → evalscope-1.0.0/evalscope/backend/rag_eval/utils}/__init__.py +0 -0
  392. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  393. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  394. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  395. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  396. {evalscope-0.17.0/evalscope/benchmarks/aigc/t2i → evalscope-1.0.0/evalscope/benchmarks/aigc}/__init__.py +0 -0
  397. {evalscope-0.17.0/evalscope/benchmarks/aime → evalscope-1.0.0/evalscope/benchmarks/aigc/i2i}/__init__.py +0 -0
  398. {evalscope-0.17.0/evalscope/benchmarks/alpaca_eval → evalscope-1.0.0/evalscope/benchmarks/aigc/t2i}/__init__.py +0 -0
  399. {evalscope-0.17.0/evalscope/benchmarks/arena_hard → evalscope-1.0.0/evalscope/benchmarks/aime}/__init__.py +0 -0
  400. {evalscope-0.17.0/evalscope/benchmarks/bfcl → evalscope-1.0.0/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
  401. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  402. {evalscope-0.17.0/evalscope/benchmarks/chinese_simple_qa → evalscope-1.0.0/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
  403. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  404. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  405. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  406. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  407. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  408. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  409. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  410. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  411. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  412. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  413. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  414. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  415. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  416. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  417. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  418. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  419. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  420. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  421. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  422. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  423. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  424. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  425. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  426. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  427. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  428. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  429. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  430. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  431. {evalscope-0.17.0/evalscope/benchmarks/data_collection → evalscope-1.0.0/evalscope/benchmarks/bfcl}/__init__.py +0 -0
  432. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  433. {evalscope-0.17.0/evalscope/benchmarks/docmath → evalscope-1.0.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  434. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  435. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  436. {evalscope-0.17.0/evalscope/benchmarks/drop → evalscope-1.0.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  437. {evalscope-0.17.0/evalscope/benchmarks/frames → evalscope-1.0.0/evalscope/benchmarks/docmath}/__init__.py +0 -0
  438. {evalscope-0.17.0/evalscope/benchmarks/general_arena → evalscope-1.0.0/evalscope/benchmarks/drop}/__init__.py +0 -0
  439. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/drop/utils.py +0 -0
  440. {evalscope-0.17.0/evalscope/benchmarks/general_mcq → evalscope-1.0.0/evalscope/benchmarks/frames}/__init__.py +0 -0
  441. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/frames/utils.py +0 -0
  442. {evalscope-0.17.0/evalscope/benchmarks/gpqa → evalscope-1.0.0/evalscope/benchmarks/general_arena}/__init__.py +0 -0
  443. {evalscope-0.17.0/evalscope/benchmarks/ifeval → evalscope-1.0.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  444. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  445. {evalscope-0.17.0/evalscope/benchmarks/iquiz → evalscope-1.0.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  446. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  447. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  448. {evalscope-0.17.0/evalscope/benchmarks/live_code_bench → evalscope-1.0.0/evalscope/benchmarks/hle}/__init__.py +0 -0
  449. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  450. {evalscope-0.17.0/evalscope/benchmarks/maritime_bench → evalscope-1.0.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  451. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  452. {evalscope-0.17.0/evalscope/benchmarks/math_500 → evalscope-1.0.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  453. {evalscope-0.17.0/evalscope/benchmarks/mmlu_pro → evalscope-1.0.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  454. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  455. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  456. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  457. {evalscope-0.17.0/evalscope/benchmarks/mmlu_redux → evalscope-1.0.0/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  458. {evalscope-0.17.0/evalscope/benchmarks/musr → evalscope-1.0.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  459. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  460. {evalscope-0.17.0/evalscope/benchmarks/needle_haystack → evalscope-1.0.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  461. {evalscope-0.17.0/evalscope/benchmarks/process_bench → evalscope-1.0.0/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  462. {evalscope-0.17.0/evalscope/benchmarks/simple_qa → evalscope-1.0.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  463. {evalscope-0.17.0/evalscope/benchmarks/super_gpqa → evalscope-1.0.0/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  464. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
  465. {evalscope-0.17.0/evalscope/benchmarks/tool_bench → evalscope-1.0.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  466. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/race/__init__.py +0 -0
  467. {evalscope-0.17.0/evalscope/benchmarks/winogrande → evalscope-1.0.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  468. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models → evalscope-1.0.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  469. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-1.0.0/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
  470. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-1.0.0/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
  471. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/tool_bench/utils.py +0 -0
  472. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  473. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  474. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  475. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-1.0.0/evalscope/benchmarks/winogrande}/__init__.py +0 -0
  476. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/__init__.py +0 -0
  477. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/base.py +0 -0
  478. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_app.py +0 -0
  479. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_eval.py +0 -0
  480. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/cli/start_perf.py +0 -0
  481. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  482. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  483. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/rouge_metric.py +0 -0
  484. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  485. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  486. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  487. {evalscope-0.17.0/evalscope/perf → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  488. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  489. {evalscope-0.17.0/evalscope/perf/utils → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
  490. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  491. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  492. {evalscope-0.17.0/evalscope/third_party/thinkbench/tools → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
  493. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  494. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  495. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  496. {evalscope-0.17.0/tests/rag → evalscope-1.0.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
  497. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  498. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  499. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  500. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  501. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  502. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  503. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  504. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  505. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  506. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  507. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  508. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  509. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  510. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  511. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  512. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  513. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  514. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  515. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  516. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  517. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  518. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  519. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  520. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  521. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  522. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  523. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  524. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  525. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  526. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  527. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  528. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  529. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  530. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  531. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  532. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  533. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  534. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  535. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  536. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/score.py +0 -0
  537. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  538. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  539. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/analysis_result.py +0 -0
  540. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/handler.py +0 -0
  541. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/perf/utils/log_utils.py +0 -0
  542. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/__init__.py +0 -0
  543. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/README.md +0 -0
  544. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  545. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  546. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  547. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  548. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/infer.py +0 -0
  549. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  550. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  551. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  552. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  553. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  554. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  555. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  556. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  557. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  558. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  559. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  560. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/eval.py +0 -0
  561. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/infer.py +0 -0
  562. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  563. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  564. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  565. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  566. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  567. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  568. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  569. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  570. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  571. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  572. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  573. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  574. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  575. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  576. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/argument_utils.py +0 -0
  577. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope/utils/import_utils.py +0 -0
  578. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/dependency_links.txt +0 -0
  579. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/entry_points.txt +0 -0
  580. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/not-zip-safe +0 -0
  581. {evalscope-0.17.0 → evalscope-1.0.0}/evalscope.egg-info/top_level.txt +0 -0
  582. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/docs.txt +0 -0
  583. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/opencompass.txt +0 -0
  584. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/perf.txt +0 -0
  585. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/rag.txt +0 -0
  586. {evalscope-0.17.0 → evalscope-1.0.0}/requirements/vlmeval.txt +0 -0
  587. {evalscope-0.17.0 → evalscope-1.0.0}/requirements.txt +0 -0
  588. {evalscope-0.17.0 → evalscope-1.0.0}/tests/__init__.py +0 -0
  589. {evalscope-0.17.0 → evalscope-1.0.0}/tests/aigc/__init__.py +0 -0
  590. {evalscope-0.17.0/tests/cli → evalscope-1.0.0/tests/benchmark}/__init__.py +0 -0
  591. {evalscope-0.17.0/tests/perf → evalscope-1.0.0/tests/cli}/__init__.py +0 -0
  592. {evalscope-0.17.0/tests/swift → evalscope-1.0.0/tests/perf}/__init__.py +0 -0
  593. {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_mteb.py +0 -0
  594. {evalscope-0.17.0 → evalscope-1.0.0}/tests/rag/test_ragas.py +0 -0
  595. {evalscope-0.17.0/tests/vlm → evalscope-1.0.0/tests/swift}/__init__.py +0 -0
  596. {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_eval.py +0 -0
  597. {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  598. {evalscope-0.17.0 → evalscope-1.0.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  599. {evalscope-0.17.0 → evalscope-1.0.0}/tests/test_run_all.py +0 -0
  600. {evalscope-0.17.0 → evalscope-1.0.0}/tests/utils.py +0 -0
  601. {evalscope-0.17.0 → evalscope-1.0.0}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,19 +1,20 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.17.0
+ Version: 1.0.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
  Author-email: contact@modelscope.cn
+ License: Apache License 2.0
  Keywords: python,llm,evaluation
  Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
- Requires-Python: >=3.8
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  Provides-Extra: opencompass
  Provides-Extra: vlmeval
@@ -22,6 +23,7 @@ Provides-Extra: perf
  Provides-Extra: app
  Provides-Extra: aigc
  Provides-Extra: dev
+ Provides-Extra: docs
  Provides-Extra: all
  License-File: LICENSE

@@ -55,25 +57,26 @@ License-File: LICENSE
  - [📝 Introduction](#-introduction)
  - [☎ User Groups](#-user-groups)
  - [🎉 News](#-news)
- - [🛠️ Installation](#️-installation)
- - [Method 1: Install Using pip](#method-1-install-using-pip)
- - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🛠️ Environment Setup](#️-environment-setup)
+ - [Method 1. Install via pip](#method-1-install-via-pip)
+ - [Method 2. Install from source](#method-2-install-from-source)
  - [🚀 Quick Start](#-quick-start)
  - [Method 1. Using Command Line](#method-1-using-command-line)
  - [Method 2. Using Python Code](#method-2-using-python-code)
  - [Basic Parameter](#basic-parameter)
  - [Output Results](#output-results)
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
- - [Parameter](#parameter)
- - [Evaluation Backend](#evaluation-backend)
+ - [Parameter Description](#parameter-description)
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [🏟️ Arena Mode](#️-arena-mode)
+ - [⚔️ Arena Mode](#️-arena-mode)
  - [👷‍♂️ Contribution](#️-contribution)
+ - [📚 Citation](#-citation)
  - [🔜 Roadmap](#-roadmap)
- - [Star History](#star-history)
+ - [Star History](#-star-history)


  ## 📝 Introduction
@@ -138,6 +141,15 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ > [!IMPORTANT]
+ > **Version 1.0 Refactoring**
+ >
+ > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+ - 🔥 **[2025.08.22]** Version 1.0 Refactoring.
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
  - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -145,6 +157,8 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+ <details><summary>More</summary>
+
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -158,8 +172,6 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
  - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
- <details><summary>More</summary>
-
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -183,58 +195,87 @@ Please scan the QR code below to join our community groups:

  </details>

- ## 🛠️ Installation
- ### Method 1: Install Using pip
- We recommend using conda to manage your environment and installing dependencies with pip:
+ ## 🛠️ Environment Setup
+
+ ### Method 1. Install via pip
+
+ We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.

  1. Create a conda environment (optional)
+ ```shell
+ # Python 3.10 is recommended
+ conda create -n evalscope python=3.10
+
+ # Activate the conda environment
+ conda activate evalscope
+ ```
+ 2. Install dependencies via pip
+ ```shell
+ pip install evalscope
+ ```
+ 3. Install additional dependencies (optional)
+ - To use model service inference benchmarking features, install the perf dependency:
  ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
+ pip install 'evalscope[perf]'
  ```
-
- 2. Install dependencies using pip
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install 'evalscope[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
  ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install 'evalscope[opencompass]' # Install OpenCompass backend
- pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
- pip install 'evalscope[rag]' # Install RAGEval backend
- pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
- pip install 'evalscope[app]' # Install dependencies for visualization
- pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ pip install 'evalscope[opencompass]'
+ pip install 'evalscope[vlmeval]'
+ pip install 'evalscope[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install 'evalscope[all]'
  ```

- > [!WARNING]
- > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
+ > [!NOTE]
+ > The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
  > ```shell
- > pip install llmuses<=0.4.3
+ > pip install llmuses<=0.4.3
  > ```
- > To import relevant dependencies using `llmuses`:
- > ``` python
+ > Then, import related dependencies using `llmuses`:
+ > ```python
  > from llmuses import ...
  > ```

- ### Method 2: Install from Source
- 1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ### Method 2. Install from source

+ Installing from source allows you to use the latest code and makes it easier for further development and debugging.
+
+ 1. Clone the source code
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[app]' # Install visualization dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+
+ pip install -e .
+ ```
+ 3. Install additional dependencies
+ - To use model service inference benchmarking features, install the perf dependency:
+ ```shell
+ pip install '.[perf]'
+ ```
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install '.[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install '.[opencompass]'
+ pip install '.[vlmeval]'
+ pip install '.[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install '.[all]'
+ ```


  ## 🚀 Quick Start
@@ -255,33 +296,31 @@ evalscope eval \

  When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

- **Using Python Dictionary**
+ **Using `TaskConfig`**

  ```python
- from evalscope.run import run_task
+ from evalscope import run_task, TaskConfig

- task_cfg = {
- 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
- 'datasets': ['gsm8k', 'arc'],
- 'limit': 5
- }
+ task_cfg = TaskConfig(
+ model='Qwen/Qwen2.5-0.5B-Instruct',
+ datasets=['gsm8k', 'arc'],
+ limit=5
+ )

  run_task(task_cfg=task_cfg)
  ```
-
  <details><summary>More Startup Methods</summary>

- **Using `TaskConfig`**
+ **Using Python Dictionary**

  ```python
  from evalscope.run import run_task
- from evalscope.config import TaskConfig

- task_cfg = TaskConfig(
- model='Qwen/Qwen2.5-0.5B-Instruct',
- datasets=['gsm8k', 'arc'],
- limit=5
- )
+ task_cfg = {
+ 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+ 'datasets': ['gsm8k', 'arc'],
+ 'limit': 5
+ }

  run_task(task_cfg=task_cfg)
  ```
@@ -384,7 +423,7 @@ To create a public link, set `share=True` in `launch()`.

  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

- ## 🌐 Evaluation of Specified Model API
+ ## 🌐 Evaluation of Model API

  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -435,7 +474,7 @@ evalscope eval \
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


- ## Evaluation Backend
+ ## 🧪 Other Evaluation Backends
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -508,6 +547,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </table>
  </a>

+ ## 📚 Citation
+
+ ```bibtex
+ @misc{evalscope_2024,
+ title={{EvalScope}: Evaluation Framework for Large Models},
+ author={ModelScope Team},
+ year={2024},
+ url={https://github.com/modelscope/evalscope}
+ }
+ ```
+
  ## 🔜 Roadmap
  - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
@@ -523,6 +573,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [x] MBPP


- ## Star History
+ ## Star History

  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -1,30 +1,3 @@
- Metadata-Version: 2.1
- Name: evalscope
- Version: 0.17.0
- Summary: EvalScope: Lightweight LLMs Evaluation Framework
- Home-page: https://github.com/modelscope/evalscope
- Author: ModelScope team
- Author-email: contact@modelscope.cn
- Keywords: python,llm,evaluation
- Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Requires-Python: >=3.8
- Description-Content-Type: text/markdown
- Provides-Extra: opencompass
- Provides-Extra: vlmeval
- Provides-Extra: rag
- Provides-Extra: perf
- Provides-Extra: app
- Provides-Extra: aigc
- Provides-Extra: dev
- Provides-Extra: all
- License-File: LICENSE
-
  <p align="center">
  <br>
  <img src="docs/en/_static/images/evalscope_logo.png"/>
@@ -55,25 +28,26 @@ License-File: LICENSE
  - [📝 Introduction](#-introduction)
  - [☎ User Groups](#-user-groups)
  - [🎉 News](#-news)
- - [🛠️ Installation](#️-installation)
- - [Method 1: Install Using pip](#method-1-install-using-pip)
- - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🛠️ Environment Setup](#️-environment-setup)
+ - [Method 1. Install via pip](#method-1-install-via-pip)
+ - [Method 2. Install from source](#method-2-install-from-source)
  - [🚀 Quick Start](#-quick-start)
  - [Method 1. Using Command Line](#method-1-using-command-line)
  - [Method 2. Using Python Code](#method-2-using-python-code)
  - [Basic Parameter](#basic-parameter)
  - [Output Results](#output-results)
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
- - [Parameter](#parameter)
- - [Evaluation Backend](#evaluation-backend)
+ - [Parameter Description](#parameter-description)
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [🏟️ Arena Mode](#️-arena-mode)
+ - [⚔️ Arena Mode](#️-arena-mode)
  - [👷‍♂️ Contribution](#️-contribution)
+ - [📚 Citation](#-citation)
  - [🔜 Roadmap](#-roadmap)
- - [Star History](#star-history)
+ - [Star History](#-star-history)


  ## 📝 Introduction
@@ -138,6 +112,15 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ > [!IMPORTANT]
+ > **Version 1.0 Refactoring**
+ >
+ > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+ - 🔥 **[2025.08.22]** Version 1.0 Refactoring.
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
  - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -145,6 +128,8 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+ <details><summary>More</summary>
+
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -158,8 +143,6 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
  - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
- <details><summary>More</summary>
-
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -183,58 +166,87 @@ Please scan the QR code below to join our community groups:

  </details>

- ## 🛠️ Installation
- ### Method 1: Install Using pip
- We recommend using conda to manage your environment and installing dependencies with pip:
+ ## 🛠️ Environment Setup
+
+ ### Method 1. Install via pip
+
+ We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.

  1. Create a conda environment (optional)
+ ```shell
+ # Python 3.10 is recommended
+ conda create -n evalscope python=3.10
+
+ # Activate the conda environment
+ conda activate evalscope
+ ```
+ 2. Install dependencies via pip
+ ```shell
+ pip install evalscope
+ ```
+ 3. Install additional dependencies (optional)
+ - To use model service inference benchmarking features, install the perf dependency:
  ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
+ pip install 'evalscope[perf]'
  ```
-
- 2. Install dependencies using pip
+ - To use visualization features, install the app dependency:
  ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install 'evalscope[opencompass]' # Install OpenCompass backend
- pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
- pip install 'evalscope[rag]' # Install RAGEval backend
- pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
- pip install 'evalscope[app]' # Install dependencies for visualization
- pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ pip install 'evalscope[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install 'evalscope[opencompass]'
+ pip install 'evalscope[vlmeval]'
+ pip install 'evalscope[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install 'evalscope[all]'
  ```

- > [!WARNING]
- > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
+ > [!NOTE]
+ > The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
  > ```shell
- > pip install llmuses<=0.4.3
+ > pip install llmuses<=0.4.3
  > ```
- > To import relevant dependencies using `llmuses`:
- > ``` python
+ > Then, import related dependencies using `llmuses`:
+ > ```python
  > from llmuses import ...
  > ```

- ### Method 2: Install from Source
- 1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ### Method 2. Install from source
+
+ Installing from source allows you to use the latest code and makes it easier for further development and debugging.

+ 1. Clone the source code
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[app]' # Install visualization dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+
+ pip install -e .
+ ```
+ 3. Install additional dependencies
+ - To use model service inference benchmarking features, install the perf dependency:
+ ```shell
+ pip install '.[perf]'
+ ```
+ - To use visualization features, install the app dependency:
+ ```shell
+ pip install '.[app]'
+ ```
+ - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+ ```shell
+ pip install '.[opencompass]'
+ pip install '.[vlmeval]'
+ pip install '.[rag]'
+ ```
+ - To install all dependencies:
+ ```shell
+ pip install '.[all]'
+ ```


  ## 🚀 Quick Start
@@ -255,33 +267,31 @@ evalscope eval \

  When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

- **Using Python Dictionary**
+ **Using `TaskConfig`**

  ```python
- from evalscope.run import run_task
+ from evalscope import run_task, TaskConfig

- task_cfg = {
- 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
- 'datasets': ['gsm8k', 'arc'],
- 'limit': 5
- }
+ task_cfg = TaskConfig(
+ model='Qwen/Qwen2.5-0.5B-Instruct',
+ datasets=['gsm8k', 'arc'],
+ limit=5
+ )

  run_task(task_cfg=task_cfg)
  ```
-
  <details><summary>More Startup Methods</summary>

- **Using `TaskConfig`**
+ **Using Python Dictionary**

  ```python
  from evalscope.run import run_task
- from evalscope.config import TaskConfig

- task_cfg = TaskConfig(
- model='Qwen/Qwen2.5-0.5B-Instruct',
- datasets=['gsm8k', 'arc'],
- limit=5
- )
+ task_cfg = {
+ 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+ 'datasets': ['gsm8k', 'arc'],
+ 'limit': 5
+ }

  run_task(task_cfg=task_cfg)
  ```
@@ -384,7 +394,7 @@ To create a public link, set `share=True` in `launch()`.

  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

- ## 🌐 Evaluation of Specified Model API
+ ## 🌐 Evaluation of Model API

  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -435,7 +445,7 @@ evalscope eval \
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


- ## Evaluation Backend
+ ## 🧪 Other Evaluation Backends
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -508,6 +518,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </table>
  </a>

+ ## 📚 Citation
+
+ ```bibtex
+ @misc{evalscope_2024,
+ title={{EvalScope}: Evaluation Framework for Large Models},
+ author={ModelScope Team},
+ year={2024},
+ url={https://github.com/modelscope/evalscope}
+ }
+ ```
+
  ## 🔜 Roadmap
  - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
@@ -523,6 +544,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [x] MBPP


- ## Star History
+ ## Star History

  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -0,0 +1,8 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from evalscope.benchmarks import * # registered benchmarks
+ from evalscope.config import TaskConfig
+ from evalscope.filters import extraction, selection # registered filters
+ from evalscope.metrics import metric # registered metrics
+ from evalscope.models import model_apis # need for register model apis
+ from evalscope.run import run_task
+ from .version import __release_datetime__, __version__
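
The new top-level `__init__.py` above re-exports `run_task` and `TaskConfig` and, as its comments note, pulls in the benchmark, filter, metric, and model-API registrations at import time. A minimal sketch of how this surface might be used follows; the model id, dataset name, and `limit` value are taken from the Quick Start example earlier in this diff, and actual availability depends on your installed 1.0.0 environment:

```python
# Minimal sketch, assuming `pip install evalscope>=1.0.0`.
# Importing the package runs the registration imports listed above
# (benchmarks, filters, metrics, model APIs), so dataset names can resolve.
import evalscope

print(evalscope.__version__)  # exposed via `from .version import ... __version__`

cfg = evalscope.TaskConfig(            # re-exported from evalscope.config
    model='Qwen/Qwen2.5-0.5B-Instruct',
    datasets=['gsm8k'],
    limit=1,
)
evalscope.run_task(task_cfg=cfg)       # re-exported from evalscope.run
```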
@@ -0,0 +1,3 @@
+ from .adapters import DefaultDataAdapter, MultiChoiceAdapter, Text2ImageAdapter
+ from .benchmark import DataAdapter
+ from .meta import BenchmarkMeta
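
This `__init__.py` defines the public import surface of the new `evalscope.api.benchmark` package: the adapter classes and `BenchmarkMeta`. The constructor arguments and required overrides are not shown in this diff, so the sketch below only illustrates the import path and a placeholder subclass; treat the class body as hypothetical:

```python
# Illustrative only: the imports follow the __init__.py above; the subclass body is a
# placeholder, since DefaultDataAdapter's required overrides are not part of this diff.
from evalscope.api.benchmark import BenchmarkMeta, DataAdapter, DefaultDataAdapter


class MyBenchmarkAdapter(DefaultDataAdapter):
    """Hypothetical skeleton for a custom benchmark adapter."""
    pass
```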