evalscope 0.16.3.tar.gz → 0.17.1.tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (532)
  1. {evalscope-0.16.3/evalscope.egg-info → evalscope-0.17.1}/PKG-INFO +81 -150
  2. {evalscope-0.16.3 → evalscope-0.17.1}/README.md +73 -43
  3. evalscope-0.17.1/evalscope/app/app.py +35 -0
  4. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/constants.py +1 -0
  5. evalscope-0.17.1/evalscope/app/ui/__init__.py +20 -0
  6. evalscope-0.17.1/evalscope/app/ui/app_ui.py +52 -0
  7. evalscope-0.17.1/evalscope/app/ui/multi_model.py +323 -0
  8. evalscope-0.17.1/evalscope/app/ui/sidebar.py +42 -0
  9. evalscope-0.17.1/evalscope/app/ui/single_model.py +202 -0
  10. evalscope-0.17.1/evalscope/app/ui/visualization.py +36 -0
  11. evalscope-0.17.1/evalscope/app/utils/data_utils.py +178 -0
  12. evalscope-0.17.1/evalscope/app/utils/localization.py +221 -0
  13. evalscope-0.17.1/evalscope/app/utils/text_utils.py +119 -0
  14. evalscope-0.17.1/evalscope/app/utils/visualization.py +91 -0
  15. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/backend_manager.py +2 -1
  16. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/backend_manager.py +2 -1
  17. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/embedding.py +1 -1
  18. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  19. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/__init__.py +15 -1
  20. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  21. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  22. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  23. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/arc_adapter.py +1 -1
  24. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  25. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/utils.py +0 -12
  26. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  27. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  28. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  29. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  30. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_adapter.py +29 -9
  31. evalscope-0.17.1/evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  32. evalscope-0.17.1/evalscope/benchmarks/general_arena/utils.py +226 -0
  33. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  34. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  35. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  36. evalscope-0.17.1/evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope-0.17.1/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  49. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  50. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  51. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/utils.py +2 -2
  52. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  53. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/config.py +8 -123
  54. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/constants.py +5 -21
  55. evalscope-0.17.1/evalscope/evaluator/__init__.py +3 -0
  56. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/evaluator/evaluator.py +20 -15
  57. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/__init__.py +9 -1
  58. evalscope-0.16.3/evalscope/utils/utils.py → evalscope-0.17.1/evalscope/metrics/completion_parsers.py +71 -176
  59. evalscope-0.17.1/evalscope/metrics/llm_judge.py +197 -0
  60. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/metrics.py +20 -8
  61. {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
  62. {evalscope-0.16.3/evalscope/third_party/thinkbench/tools → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
  63. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/__init__.py +4 -8
  64. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/__init__.py +4 -9
  65. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/base_adapter.py +4 -0
  66. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/bfcl_adapter.py +2 -0
  67. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/chat_adapter.py +3 -0
  68. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/choice_adapter.py +4 -0
  69. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/custom_adapter.py +7 -3
  70. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/server_adapter.py +4 -2
  71. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/adapters/t2i_adapter.py +3 -0
  72. evalscope-0.17.1/evalscope/models/adapters/tau_bench_adapter.py +189 -0
  73. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/dummy_model.py +3 -3
  74. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/register.py +0 -14
  75. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/arguments.py +15 -16
  76. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/benchmark.py +38 -39
  77. evalscope-0.17.1/evalscope/perf/http_client.py +120 -0
  78. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/main.py +3 -3
  79. evalscope-0.17.1/evalscope/perf/plugin/__init__.py +3 -0
  80. evalscope-0.17.1/evalscope/perf/plugin/api/__init__.py +4 -0
  81. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/base.py +22 -4
  82. evalscope-0.17.1/evalscope/perf/plugin/api/custom_api.py +249 -0
  83. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
  84. evalscope-0.17.1/evalscope/perf/plugin/api/default_api.py +105 -0
  85. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/api/openai_api.py +17 -19
  86. evalscope-0.17.1/evalscope/perf/plugin/datasets/__init__.py +10 -0
  87. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/base.py +22 -1
  88. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/custom.py +2 -1
  89. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  90. evalscope-0.17.1/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  91. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  92. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  93. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/openqa.py +2 -1
  94. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  95. evalscope-0.17.1/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  96. evalscope-0.17.1/evalscope/perf/plugin/registry.py +74 -0
  97. evalscope-0.17.1/evalscope/perf/utils/__init__.py +0 -0
  98. evalscope-0.17.1/evalscope/perf/utils/analysis_result.py +30 -0
  99. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/benchmark_util.py +14 -20
  100. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/db_util.py +79 -61
  101. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/__init__.py +1 -1
  102. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/utils.py +34 -15
  103. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/run.py +1 -1
  104. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/summarizer.py +1 -2
  105. evalscope-0.17.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  106. evalscope-0.17.1/evalscope/utils/__init__.py +65 -0
  107. evalscope-0.17.1/evalscope/utils/argument_utils.py +64 -0
  108. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/import_utils.py +16 -0
  109. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/io_utils.py +55 -4
  110. evalscope-0.17.1/evalscope/utils/model_utils.py +76 -0
  111. evalscope-0.17.1/evalscope/version.py +4 -0
  112. {evalscope-0.16.3 → evalscope-0.17.1/evalscope.egg-info}/PKG-INFO +81 -150
  113. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/SOURCES.txt +27 -30
  114. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/requires.txt +24 -4
  115. evalscope-0.17.1/requirements/dev.txt +5 -0
  116. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/framework.txt +2 -2
  117. {evalscope-0.16.3 → evalscope-0.17.1}/setup.cfg +1 -1
  118. {evalscope-0.16.3 → evalscope-0.17.1}/setup.py +35 -15
  119. {evalscope-0.16.3 → evalscope-0.17.1}/tests/aigc/test_t2i.py +1 -1
  120. {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_all.py +68 -4
  121. {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_collection.py +1 -1
  122. evalscope-0.17.1/tests/cli/test_custom.py +261 -0
  123. {evalscope-0.16.3 → evalscope-0.17.1}/tests/cli/test_run.py +34 -70
  124. {evalscope-0.16.3 → evalscope-0.17.1}/tests/perf/test_perf.py +31 -4
  125. evalscope-0.17.1/tests/rag/__init__.py +0 -0
  126. {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_clip_benchmark.py +2 -1
  127. {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_mteb.py +3 -1
  128. {evalscope-0.16.3 → evalscope-0.17.1}/tests/rag/test_ragas.py +3 -1
  129. {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_eval.py +2 -1
  130. {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_eval.py +2 -1
  131. {evalscope-0.16.3 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  132. evalscope-0.17.1/tests/utils.py +13 -0
  133. {evalscope-0.16.3 → evalscope-0.17.1}/tests/vlm/test_vlmeval.py +8 -2
  134. evalscope-0.16.3/evalscope/app/app.py +0 -788
  135. evalscope-0.16.3/evalscope/evaluator/__init__.py +0 -3
  136. evalscope-0.16.3/evalscope/evaluator/rating_eval.py +0 -157
  137. evalscope-0.16.3/evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  138. evalscope-0.16.3/evalscope/metrics/llm_judge.py +0 -111
  139. evalscope-0.16.3/evalscope/models/model.py +0 -189
  140. evalscope-0.16.3/evalscope/perf/http_client.py +0 -176
  141. evalscope-0.16.3/evalscope/perf/plugin/__init__.py +0 -2
  142. evalscope-0.16.3/evalscope/perf/plugin/api/__init__.py +0 -3
  143. evalscope-0.16.3/evalscope/perf/plugin/api/custom_api.py +0 -92
  144. evalscope-0.16.3/evalscope/perf/plugin/datasets/__init__.py +0 -7
  145. evalscope-0.16.3/evalscope/perf/plugin/registry.py +0 -54
  146. evalscope-0.16.3/evalscope/perf/utils/analysis_result.py +0 -29
  147. evalscope-0.16.3/evalscope/registry/config/cfg_arena.yaml +0 -77
  148. evalscope-0.16.3/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  149. evalscope-0.16.3/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  150. evalscope-0.16.3/evalscope/registry/config/cfg_single.yaml +0 -78
  151. evalscope-0.16.3/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  152. evalscope-0.16.3/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  153. evalscope-0.16.3/evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  154. evalscope-0.16.3/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  155. evalscope-0.16.3/evalscope/registry/data/question.jsonl +0 -80
  156. evalscope-0.16.3/evalscope/registry/tasks/arc.yaml +0 -28
  157. evalscope-0.16.3/evalscope/registry/tasks/bbh.yaml +0 -26
  158. evalscope-0.16.3/evalscope/registry/tasks/bbh_mini.yaml +0 -26
  159. evalscope-0.16.3/evalscope/registry/tasks/ceval.yaml +0 -27
  160. evalscope-0.16.3/evalscope/registry/tasks/ceval_mini.yaml +0 -26
  161. evalscope-0.16.3/evalscope/registry/tasks/cmmlu.yaml +0 -27
  162. evalscope-0.16.3/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  163. evalscope-0.16.3/evalscope/registry/tasks/general_qa.yaml +0 -27
  164. evalscope-0.16.3/evalscope/registry/tasks/gsm8k.yaml +0 -29
  165. evalscope-0.16.3/evalscope/registry/tasks/mmlu.yaml +0 -29
  166. evalscope-0.16.3/evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  167. evalscope-0.16.3/evalscope/run_arena.py +0 -202
  168. evalscope-0.16.3/evalscope/utils/__init__.py +0 -4
  169. evalscope-0.16.3/evalscope/utils/arena_utils.py +0 -217
  170. evalscope-0.16.3/evalscope/utils/completion_parsers.py +0 -82
  171. evalscope-0.16.3/evalscope/utils/model_utils.py +0 -40
  172. evalscope-0.16.3/evalscope/version.py +0 -4
  173. evalscope-0.16.3/tests/swift/__init__.py +0 -1
  174. evalscope-0.16.3/tests/vlm/__init__.py +0 -1
  175. {evalscope-0.16.3 → evalscope-0.17.1}/LICENSE +0 -0
  176. {evalscope-0.16.3 → evalscope-0.17.1}/MANIFEST.in +0 -0
  177. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/__init__.py +0 -0
  178. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/__init__.py +0 -0
  179. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/app/arguments.py +0 -0
  180. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/arguments.py +0 -0
  181. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/__init__.py +0 -0
  182. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/base.py +0 -0
  183. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/__init__.py +0 -0
  184. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  185. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  186. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  187. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  188. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  189. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  190. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  191. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  192. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  193. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  194. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  195. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  196. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  197. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  198. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  199. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  200. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  201. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  202. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  203. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  204. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  205. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  206. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  207. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  208. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  209. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  210. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  211. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  212. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  213. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  214. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  215. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  216. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  217. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  218. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  219. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  220. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  221. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  222. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  223. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  224. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  225. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
  226. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  227. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
  228. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
  229. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
  230. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
  231. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
  232. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
  233. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/aime/__init__.py +0 -0
  234. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  235. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  236. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  237. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
  238. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  239. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  240. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  241. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  242. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  243. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  244. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  245. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  246. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  247. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  248. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  249. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  250. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  251. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  252. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  253. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  254. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  255. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  256. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  257. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  258. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  259. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  260. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  261. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  262. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  263. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  264. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  265. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  266. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  267. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/benchmark.py +0 -0
  268. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/__init__.py +0 -0
  269. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  270. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  271. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  272. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
  273. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  274. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  275. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  276. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  277. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  278. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
  279. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  280. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/__init__.py +0 -0
  281. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
  282. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/docmath/utils.py +0 -0
  283. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/__init__.py +0 -0
  284. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
  285. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/drop/utils.py +0 -0
  286. {evalscope-0.16.3/evalscope/utils → evalscope-0.17.1/evalscope/benchmarks}/filters.py +0 -0
  287. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/__init__.py +0 -0
  288. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
  289. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/frames/utils.py +0 -0
  290. {evalscope-0.16.3/evalscope/benchmarks/general_mcq → evalscope-0.17.1/evalscope/benchmarks/general_arena}/__init__.py +0 -0
  291. {evalscope-0.16.3/evalscope/benchmarks/gpqa → evalscope-0.17.1/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  292. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  293. {evalscope-0.16.3/evalscope/benchmarks/ifeval → evalscope-0.17.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  294. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  295. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  296. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  297. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  298. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  299. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  300. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  301. {evalscope-0.16.3/evalscope/benchmarks/iquiz → evalscope-0.17.1/evalscope/benchmarks/hle}/__init__.py +0 -0
  302. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  303. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  304. {evalscope-0.16.3/evalscope/benchmarks/live_code_bench → evalscope-0.17.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  305. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  306. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  307. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  308. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
  309. {evalscope-0.16.3/evalscope/benchmarks/maritime_bench → evalscope-0.17.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  310. {evalscope-0.16.3/evalscope/benchmarks/math_500 → evalscope-0.17.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  311. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  312. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  313. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  314. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  315. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  316. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  317. {evalscope-0.16.3/evalscope/benchmarks/mmlu_pro → evalscope-0.17.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  318. {evalscope-0.16.3/evalscope/benchmarks/mmlu_redux → evalscope-0.17.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
  319. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  320. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  321. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  322. {evalscope-0.16.3/evalscope/benchmarks/musr → evalscope-0.17.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  323. {evalscope-0.16.3/evalscope/benchmarks/needle_haystack → evalscope-0.17.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  324. {evalscope-0.16.3/evalscope/benchmarks/process_bench → evalscope-0.17.1/evalscope/benchmarks/musr}/__init__.py +0 -0
  325. {evalscope-0.16.3/evalscope/benchmarks/simple_qa → evalscope-0.17.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  326. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -0
  327. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
  328. {evalscope-0.16.3/evalscope/benchmarks/super_gpqa → evalscope-0.17.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  329. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  330. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  331. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/__init__.py +0 -0
  332. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/race.py +0 -0
  333. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
  334. {evalscope-0.16.3/evalscope/benchmarks/tool_bench → evalscope-0.17.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  335. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
  336. {evalscope-0.16.3/evalscope/benchmarks/winogrande → evalscope-0.17.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  337. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  338. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  339. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  340. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  341. {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models → evalscope-0.17.1/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
  342. {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.17.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
  343. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/utils.py +0 -0
  344. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  345. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  346. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  347. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  348. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  349. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  350. {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.17.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
  351. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/__init__.py +0 -0
  352. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/base.py +0 -0
  353. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/cli.py +0 -0
  354. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_app.py +0 -0
  355. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_eval.py +0 -0
  356. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_perf.py +0 -0
  357. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/cli/start_server.py +0 -0
  358. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/__init__.py +0 -0
  359. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/evaluator.py +0 -0
  360. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/sampler.py +0 -0
  361. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/collections/schema.py +0 -0
  362. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  363. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  364. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/math_parser.py +0 -0
  365. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/named_metrics.py +0 -0
  366. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/rouge_metric.py +0 -0
  367. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
  368. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  369. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  370. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  371. {evalscope-0.16.3/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  372. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  373. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  374. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
  375. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
  376. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
  377. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
  378. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
  379. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
  380. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  381. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
  382. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
  383. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
  384. {evalscope-0.16.3/evalscope/perf → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
  385. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
  386. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
  387. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  388. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  389. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  390. {evalscope-0.16.3/evalscope/perf/utils → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
  391. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  392. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
  393. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  394. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
  395. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  396. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
  397. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
  398. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  399. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  400. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  401. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  402. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  403. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  404. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  405. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  406. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  407. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  408. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  409. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  410. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  411. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  412. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  413. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  414. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  415. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  416. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  417. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  418. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  419. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  420. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  421. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  422. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  423. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  424. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  425. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  426. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  427. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  428. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  429. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  430. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  431. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  432. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
  433. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  434. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
  435. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  436. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  437. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
  438. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
  439. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
  440. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
  441. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
  442. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
  443. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
  444. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
  445. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  446. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  447. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  448. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
  449. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
  450. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
  451. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  452. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
  453. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
  454. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
  455. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
  456. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
  457. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
  458. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
  459. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  460. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  461. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
  462. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
  463. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
  464. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
  465. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  466. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/__init__.py +0 -0
  467. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/custom/custom_model.py +0 -0
  468. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/models/local_model.py +0 -0
  469. {evalscope-0.16.3/tests/rag → evalscope-0.17.1/evalscope/perf}/__init__.py +0 -0
  470. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  471. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/handler.py +0 -0
  472. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/local_server.py +0 -0
  473. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/log_utils.py +0 -0
  474. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/perf/utils/rich_display.py +0 -0
  475. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/combinator.py +0 -0
  476. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/report/generator.py +0 -0
  477. {evalscope-0.16.3/evalscope/evaluator/reviewer → evalscope-0.17.1/evalscope/third_party}/__init__.py +0 -0
  478. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/README.md +0 -0
  479. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  480. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
  481. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  482. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  483. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  484. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  485. {evalscope-0.16.3/evalscope/registry → evalscope-0.17.1/evalscope/third_party/longbench_write/resources}/__init__.py +0 -0
  486. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  487. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  488. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  489. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  490. {evalscope-0.16.3/evalscope/third_party → evalscope-0.17.1/evalscope/third_party/longbench_write/tools}/__init__.py +0 -0
  491. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  492. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  493. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  494. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
  495. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/eval.py +0 -0
  496. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/infer.py +0 -0
  497. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  498. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  499. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  500. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  501. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/README.md +0 -0
  502. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  503. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  504. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  505. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  506. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  507. {evalscope-0.16.3/evalscope/third_party/longbench_write/resources → evalscope-0.17.1/evalscope/third_party/toolbench_static/llm}/__init__.py +0 -0
  508. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  509. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  510. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  511. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/chat_service.py +0 -0
  512. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/deprecation_utils.py +0 -0
  513. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope/utils/logger.py +0 -0
  514. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/dependency_links.txt +0 -0
  515. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/entry_points.txt +0 -0
  516. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/not-zip-safe +0 -0
  517. {evalscope-0.16.3 → evalscope-0.17.1}/evalscope.egg-info/top_level.txt +0 -0
  518. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/aigc.txt +0 -0
  519. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/app.txt +0 -0
  520. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/docs.txt +0 -0
  521. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/opencompass.txt +0 -0
  522. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/perf.txt +0 -0
  523. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/rag.txt +0 -0
  524. {evalscope-0.16.3 → evalscope-0.17.1}/requirements/vlmeval.txt +0 -0
  525. {evalscope-0.16.3 → evalscope-0.17.1}/requirements.txt +0 -0
  526. {evalscope-0.16.3/evalscope/third_party/longbench_write/tools → evalscope-0.17.1/tests}/__init__.py +0 -0
  527. {evalscope-0.16.3/evalscope/third_party/toolbench_static/llm → evalscope-0.17.1/tests/aigc}/__init__.py +0 -0
  528. {evalscope-0.16.3/tests → evalscope-0.17.1/tests/cli}/__init__.py +0 -0
  529. {evalscope-0.16.3/tests/aigc → evalscope-0.17.1/tests/perf}/__init__.py +0 -0
  530. {evalscope-0.16.3/tests/cli → evalscope-0.17.1/tests/swift}/__init__.py +0 -0
  531. {evalscope-0.16.3 → evalscope-0.17.1}/tests/test_run_all.py +0 -0
  532. {evalscope-0.16.3/tests/perf → evalscope-0.17.1/tests/vlm}/__init__.py +0 -0
@@ -1,130 +1,31 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.16.3
+ Version: 0.17.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
  Author-email: contact@modelscope.cn
+ License: Apache License 2.0
  Keywords: python,llm,evaluation
  Classifier: Development Status :: 4 - Beta
- Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
- Requires-Python: >=3.8
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: accelerate
- Requires-Dist: datasets>=3.0
- Requires-Dist: immutabledict
- Requires-Dist: jieba
- Requires-Dist: jsonlines
- Requires-Dist: langdetect
- Requires-Dist: latex2sympy2_extended
- Requires-Dist: matplotlib
- Requires-Dist: modelscope[framework]
- Requires-Dist: nltk>=3.9
- Requires-Dist: openai
- Requires-Dist: pandas
- Requires-Dist: pillow
- Requires-Dist: pyarrow
- Requires-Dist: pyyaml>=5.1
- Requires-Dist: requests
- Requires-Dist: rouge-chinese
- Requires-Dist: rouge-score>=0.1.0
- Requires-Dist: sacrebleu
- Requires-Dist: scikit-learn
- Requires-Dist: seaborn
- Requires-Dist: sympy
- Requires-Dist: tabulate
- Requires-Dist: torch
- Requires-Dist: tqdm
- Requires-Dist: transformers>=4.33
- Requires-Dist: word2number
  Provides-Extra: opencompass
- Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
  Provides-Extra: vlmeval
- Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
  Provides-Extra: rag
- Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
- Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
- Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
- Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
- Requires-Dist: mteb==1.38.20; extra == "rag"
- Requires-Dist: ragas==0.2.14; extra == "rag"
- Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: perf
- Requires-Dist: aiohttp; extra == "perf"
- Requires-Dist: fastapi; extra == "perf"
- Requires-Dist: numpy; extra == "perf"
- Requires-Dist: rich; extra == "perf"
- Requires-Dist: sse_starlette; extra == "perf"
- Requires-Dist: transformers; extra == "perf"
- Requires-Dist: uvicorn; extra == "perf"
  Provides-Extra: app
- Requires-Dist: gradio==5.4.0; extra == "app"
- Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
  Provides-Extra: aigc
- Requires-Dist: diffusers; extra == "aigc"
- Requires-Dist: iopath; extra == "aigc"
- Requires-Dist: omegaconf; extra == "aigc"
- Requires-Dist: open_clip_torch; extra == "aigc"
- Requires-Dist: opencv-python; extra == "aigc"
- Requires-Dist: torchvision; extra == "aigc"
+ Provides-Extra: dev
+ Provides-Extra: docs
  Provides-Extra: all
- Requires-Dist: accelerate; extra == "all"
- Requires-Dist: datasets>=3.0; extra == "all"
- Requires-Dist: immutabledict; extra == "all"
- Requires-Dist: jieba; extra == "all"
- Requires-Dist: jsonlines; extra == "all"
- Requires-Dist: langdetect; extra == "all"
- Requires-Dist: latex2sympy2_extended; extra == "all"
- Requires-Dist: matplotlib; extra == "all"
- Requires-Dist: modelscope[framework]; extra == "all"
- Requires-Dist: nltk>=3.9; extra == "all"
- Requires-Dist: openai; extra == "all"
- Requires-Dist: pandas; extra == "all"
- Requires-Dist: pillow; extra == "all"
- Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pyyaml>=5.1; extra == "all"
- Requires-Dist: requests; extra == "all"
- Requires-Dist: rouge-chinese; extra == "all"
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
- Requires-Dist: sacrebleu; extra == "all"
- Requires-Dist: scikit-learn; extra == "all"
- Requires-Dist: seaborn; extra == "all"
- Requires-Dist: sympy; extra == "all"
- Requires-Dist: tabulate; extra == "all"
- Requires-Dist: torch; extra == "all"
- Requires-Dist: tqdm; extra == "all"
- Requires-Dist: transformers>=4.33; extra == "all"
- Requires-Dist: word2number; extra == "all"
- Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
- Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
- Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
- Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
- Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
- Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
- Requires-Dist: mteb==1.38.20; extra == "all"
- Requires-Dist: ragas==0.2.14; extra == "all"
- Requires-Dist: webdataset>0.2.0; extra == "all"
- Requires-Dist: aiohttp; extra == "all"
- Requires-Dist: fastapi; extra == "all"
- Requires-Dist: numpy; extra == "all"
- Requires-Dist: rich; extra == "all"
- Requires-Dist: sse_starlette; extra == "all"
- Requires-Dist: transformers; extra == "all"
- Requires-Dist: uvicorn; extra == "all"
- Requires-Dist: gradio==5.4.0; extra == "all"
- Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
- Requires-Dist: diffusers; extra == "all"
- Requires-Dist: iopath; extra == "all"
- Requires-Dist: omegaconf; extra == "all"
- Requires-Dist: open_clip_torch; extra == "all"
- Requires-Dist: opencv-python; extra == "all"
- Requires-Dist: torchvision; extra == "all"
+ License-File: LICENSE
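The metadata above drops the per-package requirement pins from the core metadata and narrows the supported Python range to >=3.9, while keeping the optional extras (opencompass, vlmeval, rag, perf, app, aigc, dev, docs, all). A minimal sketch using only the standard library to confirm what a locally installed wheel actually declares; the printed values are examples, not guarantees:

```python
from importlib.metadata import metadata, version

# Inspect the installed evalscope distribution's metadata at runtime.
print(version("evalscope"))             # e.g. "0.17.1" if this release is installed
meta = metadata("evalscope")
print(meta["Requires-Python"])          # ">=3.9" per the 0.17.1 metadata shown above
print(meta.get_all("Provides-Extra"))   # ["opencompass", "vlmeval", "rag", "perf", ...]
```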
 
  <p align="center">
  <br>
@@ -165,16 +66,17 @@ Requires-Dist: torchvision; extra == "all"
  - [Basic Parameter](#basic-parameter)
  - [Output Results](#output-results)
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
- - [Parameter](#parameter)
- - [Evaluation Backend](#evaluation-backend)
+ - [Parameter Description](#parameter-description)
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [🏟️ Arena Mode](#️-arena-mode)
+ - [⚔️ Arena Mode](#️-arena-mode)
  - [👷‍♂️ Contribution](#️-contribution)
+ - [📚 Citation](#-citation)
  - [🔜 Roadmap](#-roadmap)
- - [Star History](#star-history)
+ - [Star History](#-star-history)


  ## 📝 Introduction
@@ -198,24 +100,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
  Below is the overall architecture diagram of EvalScope:

  <p align="center">
- <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
+ <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
  <br>EvalScope Framework.
  </p>

  <details><summary>Framework Description</summary>

  The architecture includes the following modules:
- 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
- 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
- 3. **Evaluation Backend**:
-     - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
-     - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
-     - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-     - **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-     - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
- 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
- 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
- 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+ 1. Input Layer
+    - **Model Sources**: API models (OpenAI API), local models (ModelScope)
+    - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+ 2. Core Functions
+    - **Multi-backend Evaluation**
+      - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+      - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+    - **Performance Monitoring**
+      - Model plugins: Supports various model service APIs
+      - Data plugins: Supports multiple data formats
+      - Metric tracking: TTFT/TPOP/Stability and other metrics
+
+    - **Tool Extensions**
+      - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+ 3. Output Layer
+    - **Structured Reports**: Supports JSON/Tables/Logs
+    - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab

  </details>

@@ -229,8 +140,12 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
-
- - 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
+ - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+ - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+ - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
  - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -239,6 +154,8 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+ <details><summary>More</summary>
+
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -252,8 +169,6 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
- <details><summary>More</summary>
-
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -345,33 +260,31 @@ evalscope eval \

  When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

- **Using Python Dictionary**
+ **Using `TaskConfig`**

  ```python
- from evalscope.run import run_task
+ from evalscope import run_task, TaskConfig

- task_cfg = {
-     'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-     'datasets': ['gsm8k', 'arc'],
-     'limit': 5
- }
+ task_cfg = TaskConfig(
+     model='Qwen/Qwen2.5-0.5B-Instruct',
+     datasets=['gsm8k', 'arc'],
+     limit=5
+ )

  run_task(task_cfg=task_cfg)
  ```
-
  <details><summary>More Startup Methods</summary>

- **Using `TaskConfig`**
+ **Using Python Dictionary**

  ```python
  from evalscope.run import run_task
- from evalscope.config import TaskConfig

- task_cfg = TaskConfig(
-     model='Qwen/Qwen2.5-0.5B-Instruct',
-     datasets=['gsm8k', 'arc'],
-     limit=5
- )
+ task_cfg = {
+     'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+     'datasets': ['gsm8k', 'arc'],
+     'limit': 5
+ }

  run_task(task_cfg=task_cfg)
  ```
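As the prose in this hunk notes, `run_task` also accepts a yaml or json file path. A minimal sketch under that reading, assuming a local `eval_config.yaml` (the file name is hypothetical) whose keys mirror the dictionary above:

```python
from evalscope import run_task

# eval_config.yaml is assumed to contain:
#   model: Qwen/Qwen2.5-0.5B-Instruct
#   datasets: [gsm8k, arc]
#   limit: 5
run_task(task_cfg='eval_config.yaml')
```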
@@ -474,7 +387,7 @@ To create a public link, set `share=True` in `launch()`.

  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

- ## 🌐 Evaluation of Specified Model API
+ ## 🌐 Evaluation of Model API

  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
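The CLI example that follows this sentence in the README lies outside the hunk. A hedged Python sketch of the same setup, with parameter names taken from the prose above (`api_url`, `api_key`, service evaluation type) and placeholder URL, key, and model name:

```python
from evalscope import run_task, TaskConfig

# Placeholder endpoint and key; point these at your own deployed service.
task_cfg = TaskConfig(
    model='qwen2.5',
    api_url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    eval_type='service',   # the prose requires the service evaluation type here
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)
```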
 
@@ -525,7 +438,7 @@ evalscope eval \
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


- ## Evaluation Backend
+ ## 🧪 Other Evaluation Backends
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -572,10 +485,17 @@ Speed Benchmark Results:
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## 🏟️ Arena Mode
- The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+ ## ⚔️ Arena Mode

- Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+ Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+ ```text
+ Model         WinRate (%)    CI (%)
+ ------------  -------------  ---------------
+ qwen2.5-72b   69.3           (-13.3 / +12.2)
+ qwen2.5-7b    50             (+0.0 / +0.0)
+ qwen2.5-0.5b  4.7            (-2.5 / +4.4)
+ ```
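For orientation only, a heavily hedged sketch of what launching such a battle might look like in Python. It assumes the refactored arena mode is exposed as a `general_arena` benchmark and that candidates and baseline are supplied through `dataset_args`; both the benchmark name and the keys below are illustrative assumptions, so follow the Arena Mode user guide above for the authoritative configuration:

```python
from evalscope import run_task, TaskConfig

# Illustrative only: 'general_arena' and the dataset_args keys are assumptions,
# not the documented API; consult the Arena Mode user guide for real settings.
task_cfg = TaskConfig(
    model='qwen2.5-7b',
    datasets=['general_arena'],
    dataset_args={
        'general_arena': {
            'models': ['qwen2.5-72b', 'qwen2.5-7b', 'qwen2.5-0.5b'],  # hypothetical key
            'baseline': 'qwen2.5-7b',                                 # hypothetical key
        }
    },
)
run_task(task_cfg=task_cfg)
```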
 
  ## 👷‍♂️ Contribution

@@ -591,6 +511,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </table>
  </a>

+ ## 📚 Citation
+
+ ```bibtex
+ @misc{evalscope_2024,
+     title={{EvalScope}: Evaluation Framework for Large Models},
+     author={ModelScope Team},
+     year={2024},
+     url={https://github.com/modelscope/evalscope}
+ }
+ ```
+
  ## 🔜 Roadmap
  - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
@@ -601,11 +532,11 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [ ] Distributed evaluating
  - [x] Multi-modal evaluation
  - [ ] Benchmarks
- - [ ] GAIA
+ - [x] BFCL-v3
  - [x] GPQA
  - [x] MBPP


- ## Star History
+ ## Star History

  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -37,16 +37,17 @@
  - [Basic Parameter](#basic-parameter)
  - [Output Results](#output-results)
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
- - [Parameter](#parameter)
- - [Evaluation Backend](#evaluation-backend)
+ - [Parameter Description](#parameter-description)
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [🏟️ Arena Mode](#️-arena-mode)
+ - [⚔️ Arena Mode](#️-arena-mode)
  - [👷‍♂️ Contribution](#️-contribution)
+ - [📚 Citation](#-citation)
  - [🔜 Roadmap](#-roadmap)
- - [Star History](#star-history)
+ - [Star History](#-star-history)


  ## 📝 Introduction
@@ -70,24 +71,33 @@ EvalScope is not merely an evaluation tool; it is a valuable ally in your model
  Below is the overall architecture diagram of EvalScope:

  <p align="center">
- <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
+ <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
  <br>EvalScope Framework.
  </p>

  <details><summary>Framework Description</summary>

  The architecture includes the following modules:
- 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
- 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
- 3. **Evaluation Backend**:
-     - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
-     - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
-     - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-     - **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-     - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
- 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
- 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
- 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+ 1. Input Layer
+    - **Model Sources**: API models (OpenAI API), local models (ModelScope)
+    - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
+
+ 2. Core Functions
+    - **Multi-backend Evaluation**
+      - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
+      - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
+
+    - **Performance Monitoring**
+      - Model plugins: Supports various model service APIs
+      - Data plugins: Supports multiple data formats
+      - Metric tracking: TTFT/TPOP/Stability and other metrics
+
+    - **Tool Extensions**
+      - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
+
+ 3. Output Layer
+    - **Structured Reports**: Supports JSON/Tables/Logs
+    - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab

  </details>

@@ -101,8 +111,12 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
-
- - 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
+ - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
+ - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
+ - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
  - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
@@ -111,6 +125,8 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+ <details><summary>More</summary>
+
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -124,8 +140,6 @@ Please scan the QR code below to join our community groups:
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
- <details><summary>More</summary>
-
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -217,33 +231,31 @@ evalscope eval \

  When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

- **Using Python Dictionary**
+ **Using `TaskConfig`**

  ```python
- from evalscope.run import run_task
+ from evalscope import run_task, TaskConfig

- task_cfg = {
-     'model': 'Qwen/Qwen2.5-0.5B-Instruct',
-     'datasets': ['gsm8k', 'arc'],
-     'limit': 5
- }
+ task_cfg = TaskConfig(
+     model='Qwen/Qwen2.5-0.5B-Instruct',
+     datasets=['gsm8k', 'arc'],
+     limit=5
+ )

  run_task(task_cfg=task_cfg)
  ```
-
  <details><summary>More Startup Methods</summary>

- **Using `TaskConfig`**
+ **Using Python Dictionary**

  ```python
  from evalscope.run import run_task
- from evalscope.config import TaskConfig

- task_cfg = TaskConfig(
-     model='Qwen/Qwen2.5-0.5B-Instruct',
-     datasets=['gsm8k', 'arc'],
-     limit=5
- )
+ task_cfg = {
+     'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+     'datasets': ['gsm8k', 'arc'],
+     'limit': 5
+ }

  run_task(task_cfg=task_cfg)
  ```
@@ -346,7 +358,7 @@ To create a public link, set `share=True` in `launch()`.

  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)

- ## 🌐 Evaluation of Specified Model API
+ ## 🌐 Evaluation of Model API

  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:

@@ -397,7 +409,7 @@ evalscope eval \
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


- ## Evaluation Backend
+ ## 🧪 Other Evaluation Backends
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -444,10 +456,17 @@ Speed Benchmark Results:
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## 🏟️ Arena Mode
- The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+ ## ⚔️ Arena Mode

- Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+ Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
+
+ ```text
+ Model         WinRate (%)    CI (%)
+ ------------  -------------  ---------------
+ qwen2.5-72b   69.3           (-13.3 / +12.2)
+ qwen2.5-7b    50             (+0.0 / +0.0)
+ qwen2.5-0.5b  4.7            (-2.5 / +4.4)
+ ```

  ## 👷‍♂️ Contribution

@@ -463,6 +482,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  </table>
  </a>

+ ## 📚 Citation
+
+ ```bibtex
+ @misc{evalscope_2024,
+     title={{EvalScope}: Evaluation Framework for Large Models},
+     author={ModelScope Team},
+     year={2024},
+     url={https://github.com/modelscope/evalscope}
+ }
+ ```
+
  ## 🔜 Roadmap
  - [x] Support for better evaluation report visualization
  - [x] Support for mixed evaluations across multiple datasets
@@ -473,11 +503,11 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
  - [ ] Distributed evaluating
  - [x] Multi-modal evaluation
  - [ ] Benchmarks
- - [ ] GAIA
+ - [x] BFCL-v3
  - [x] GPQA
  - [x] MBPP


- ## Star History
+ ## Star History

  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -0,0 +1,35 @@
+ """
+ Main application module for the Evalscope dashboard.
+ """
+ import argparse
+
+ from evalscope.utils.logger import configure_logging
+ from .arguments import add_argument
+ from .ui import create_app_ui
+
+
+ def create_app(args: argparse.Namespace):
+     """
+     Create and launch the Evalscope dashboard application.
+
+     Args:
+         args: Command line arguments.
+     """
+     configure_logging(debug=args.debug)
+
+     demo = create_app_ui(args)
+
+     demo.launch(
+         share=args.share,
+         server_name=args.server_name,
+         server_port=args.server_port,
+         debug=args.debug,
+         allowed_paths=args.allowed_paths,
+     )
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     add_argument(parser)
+     args = parser.parse_args()
+     create_app(args)
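The new `app.py` above only reads a handful of attributes from the parsed arguments. A minimal sketch of launching the dashboard programmatically; the attribute names mirror what `create_app` consumes, while the concrete values (port, allowed path) are placeholders rather than the package's real defaults, which come from `evalscope.app.arguments.add_argument`:

```python
import argparse

from evalscope.app.app import create_app

# Placeholder values; create_app reads exactly these attributes from args.
args = argparse.Namespace(
    share=False,
    server_name='0.0.0.0',
    server_port=7860,
    debug=False,
    allowed_paths=['./outputs'],
)
create_app(args)
```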
@@ -2,6 +2,7 @@ PLOTLY_THEME = 'plotly_dark'
  REPORT_TOKEN = '@@'
  MODEL_TOKEN = '::'
  DATASET_TOKEN = ', '
+ DEFAULT_BAR_WIDTH = 0.2
  LATEX_DELIMITERS = [{
      'left': '$$',
      'right': '$$',