evalscope 0.17.0.tar.gz → 0.17.1.tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published in that registry.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (500)
  1. {evalscope-0.17.0/evalscope.egg-info → evalscope-0.17.1}/PKG-INFO +44 -30
  2. {evalscope-0.17.0 → evalscope-0.17.1}/README.md +38 -26
  3. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  4. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_adapter.py +9 -4
  5. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  6. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  7. evalscope-0.17.1/evalscope/benchmarks/hle/hle_adapter.py +118 -0
  8. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  9. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  10. evalscope-0.17.1/evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/utils.py +1 -0
  13. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/constants.py +5 -21
  14. evalscope-0.17.1/evalscope/evaluator/__init__.py +3 -0
  15. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/evaluator/evaluator.py +5 -3
  16. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/__init__.py +3 -1
  17. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/completion_parsers.py +7 -0
  18. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/llm_judge.py +6 -5
  19. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/metrics.py +19 -7
  20. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
  21. {evalscope-0.17.0/evalscope/perf/utils → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
  22. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/__init__.py +4 -8
  23. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/__init__.py +4 -9
  24. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/base_adapter.py +4 -0
  25. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/bfcl_adapter.py +2 -0
  26. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/chat_adapter.py +3 -0
  27. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/choice_adapter.py +4 -0
  28. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/custom_adapter.py +7 -3
  29. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/server_adapter.py +2 -0
  30. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/adapters/t2i_adapter.py +3 -0
  31. evalscope-0.17.1/evalscope/models/adapters/tau_bench_adapter.py +189 -0
  32. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/register.py +0 -14
  33. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/arguments.py +13 -0
  34. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/benchmark.py +38 -39
  35. evalscope-0.17.1/evalscope/perf/http_client.py +120 -0
  36. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/main.py +2 -2
  37. evalscope-0.17.1/evalscope/perf/plugin/__init__.py +3 -0
  38. evalscope-0.17.1/evalscope/perf/plugin/api/__init__.py +4 -0
  39. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/base.py +22 -4
  40. evalscope-0.17.1/evalscope/perf/plugin/api/custom_api.py +249 -0
  41. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/dashscope_api.py +4 -10
  42. evalscope-0.17.1/evalscope/perf/plugin/api/default_api.py +105 -0
  43. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/api/openai_api.py +17 -19
  44. evalscope-0.17.1/evalscope/perf/plugin/datasets/__init__.py +10 -0
  45. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/base.py +22 -1
  46. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/custom.py +2 -1
  47. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  48. evalscope-0.17.1/evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  49. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  50. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  51. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/openqa.py +2 -1
  52. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  53. evalscope-0.17.1/evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  54. evalscope-0.17.1/evalscope/perf/plugin/registry.py +74 -0
  55. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/benchmark_util.py +14 -20
  56. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/db_util.py +79 -61
  57. evalscope-0.17.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  58. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/io_utils.py +10 -0
  59. evalscope-0.17.1/evalscope/version.py +4 -0
  60. {evalscope-0.17.0 → evalscope-0.17.1/evalscope.egg-info}/PKG-INFO +44 -30
  61. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/SOURCES.txt +8 -1
  62. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/requires.txt +12 -4
  63. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/app.txt +1 -1
  64. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/dev.txt +1 -1
  65. {evalscope-0.17.0 → evalscope-0.17.1}/setup.cfg +1 -1
  66. {evalscope-0.17.0 → evalscope-0.17.1}/setup.py +33 -15
  67. {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_all.py +18 -2
  68. {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_run.py +25 -37
  69. {evalscope-0.17.0 → evalscope-0.17.1}/tests/perf/test_perf.py +29 -2
  70. evalscope-0.17.1/tests/rag/__init__.py +0 -0
  71. evalscope-0.17.0/evalscope/evaluator/__init__.py +0 -3
  72. evalscope-0.17.0/evalscope/models/model.py +0 -189
  73. evalscope-0.17.0/evalscope/perf/http_client.py +0 -176
  74. evalscope-0.17.0/evalscope/perf/plugin/__init__.py +0 -2
  75. evalscope-0.17.0/evalscope/perf/plugin/api/__init__.py +0 -3
  76. evalscope-0.17.0/evalscope/perf/plugin/api/custom_api.py +0 -92
  77. evalscope-0.17.0/evalscope/perf/plugin/datasets/__init__.py +0 -7
  78. evalscope-0.17.0/evalscope/perf/plugin/registry.py +0 -54
  79. evalscope-0.17.0/evalscope/version.py +0 -4
  80. {evalscope-0.17.0 → evalscope-0.17.1}/LICENSE +0 -0
  81. {evalscope-0.17.0 → evalscope-0.17.1}/MANIFEST.in +0 -0
  82. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/__init__.py +0 -0
  83. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/__init__.py +0 -0
  84. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/app.py +0 -0
  85. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/arguments.py +0 -0
  86. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/constants.py +0 -0
  87. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/__init__.py +0 -0
  88. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/app_ui.py +0 -0
  89. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/multi_model.py +0 -0
  90. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/sidebar.py +0 -0
  91. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/single_model.py +0 -0
  92. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/ui/visualization.py +0 -0
  93. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/data_utils.py +0 -0
  94. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/localization.py +0 -0
  95. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/text_utils.py +0 -0
  96. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/app/utils/visualization.py +0 -0
  97. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/arguments.py +0 -0
  98. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/__init__.py +0 -0
  99. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/base.py +0 -0
  100. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/__init__.py +0 -0
  101. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  102. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  103. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  104. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  105. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  106. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  107. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  108. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  109. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  110. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  111. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  112. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  113. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  114. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  115. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  116. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  117. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  118. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  119. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  120. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  121. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  122. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  123. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  124. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  125. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  126. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  127. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  128. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  129. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  130. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  131. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  132. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  133. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  134. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  135. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  136. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  137. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  138. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  139. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  140. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  141. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  142. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  143. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  144. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  145. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  146. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/__init__.py +0 -0
  147. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
  148. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  149. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
  150. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
  151. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
  152. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
  153. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
  154. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
  155. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/__init__.py +0 -0
  156. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  157. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  158. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  159. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -0
  160. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  161. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  162. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  163. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
  164. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
  165. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
  166. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  167. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  168. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  169. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  170. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  171. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  172. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  173. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  174. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  175. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  176. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  177. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  178. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  179. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  180. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  181. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  182. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  183. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  184. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  185. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  186. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  187. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  188. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  189. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  190. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  191. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  192. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  193. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  194. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  195. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/benchmark.py +0 -0
  196. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/bfcl/__init__.py +0 -0
  197. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  198. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  199. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  200. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  201. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +0 -0
  202. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  203. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  204. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  205. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  206. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  207. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  208. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  209. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
  210. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  211. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/__init__.py +0 -0
  212. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/docmath_adapter.py +0 -0
  213. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/docmath/utils.py +0 -0
  214. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/__init__.py +0 -0
  215. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/drop_adapter.py +0 -0
  216. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/drop/utils.py +0 -0
  217. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/filters.py +0 -0
  218. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/__init__.py +0 -0
  219. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/frames_adapter.py +0 -0
  220. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/frames/utils.py +0 -0
  221. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/__init__.py +0 -0
  222. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/general_arena_adapter.py +0 -0
  223. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_arena/utils.py +0 -0
  224. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_mcq/__init__.py +0 -0
  225. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  226. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/__init__.py +0 -0
  227. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  228. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  229. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  230. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  231. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  232. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  233. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  234. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  235. {evalscope-0.17.0/evalscope/benchmarks/ifeval → evalscope-0.17.1/evalscope/benchmarks/hle}/__init__.py +0 -0
  236. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  237. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  238. {evalscope-0.17.0/evalscope/benchmarks/iquiz → evalscope-0.17.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  239. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  240. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  241. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  242. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  243. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
  244. {evalscope-0.17.0/evalscope/benchmarks/live_code_bench → evalscope-0.17.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  245. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  246. {evalscope-0.17.0/evalscope/benchmarks/maritime_bench → evalscope-0.17.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  247. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  248. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  249. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
  250. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  251. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  252. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  253. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  254. {evalscope-0.17.0/evalscope/benchmarks/math_500 → evalscope-0.17.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  255. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
  256. {evalscope-0.17.0/evalscope/benchmarks/mmlu_pro → evalscope-0.17.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
  257. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  258. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  259. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  260. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  261. {evalscope-0.17.0/evalscope/benchmarks/mmlu_redux → evalscope-0.17.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  262. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  263. {evalscope-0.17.0/evalscope/benchmarks/musr → evalscope-0.17.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  264. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
  265. {evalscope-0.17.0/evalscope/benchmarks/needle_haystack → evalscope-0.17.1/evalscope/benchmarks/musr}/__init__.py +0 -0
  266. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  267. {evalscope-0.17.0/evalscope/benchmarks/process_bench → evalscope-0.17.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  268. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +0 -0
  269. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/needle_haystack/utils.py +0 -0
  270. {evalscope-0.17.0/evalscope/benchmarks/simple_qa → evalscope-0.17.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  271. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  272. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  273. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/__init__.py +0 -0
  274. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/race.py +0 -0
  275. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  276. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
  277. {evalscope-0.17.0/evalscope/benchmarks/super_gpqa → evalscope-0.17.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  278. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -0
  279. {evalscope-0.17.0/evalscope/benchmarks/tool_bench → evalscope-0.17.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  280. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  281. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  282. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  283. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  284. {evalscope-0.17.0/evalscope/benchmarks/winogrande → evalscope-0.17.1/evalscope/benchmarks/tau_bench}/__init__.py +0 -0
  285. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models → evalscope-0.17.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
  286. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/tool_bench/utils.py +0 -0
  287. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  288. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  289. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  290. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  291. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  292. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  293. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  294. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.17.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
  295. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/benchmarks/winogrande/winogrande_adapter.py +0 -0
  296. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/__init__.py +0 -0
  297. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/base.py +0 -0
  298. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/cli.py +0 -0
  299. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_app.py +0 -0
  300. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_eval.py +0 -0
  301. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_perf.py +0 -0
  302. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/cli/start_server.py +0 -0
  303. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/__init__.py +0 -0
  304. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/evaluator.py +0 -0
  305. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/sampler.py +0 -0
  306. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/collections/schema.py +0 -0
  307. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/config.py +0 -0
  308. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  309. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  310. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/math_parser.py +0 -0
  311. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/named_metrics.py +0 -0
  312. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/rouge_metric.py +0 -0
  313. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
  314. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  315. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  316. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  317. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  318. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  319. {evalscope-0.17.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
  320. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  321. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
  322. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
  323. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
  324. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
  325. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
  326. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
  327. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  328. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
  329. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
  330. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
  331. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
  332. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
  333. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  334. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  335. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  336. {evalscope-0.17.0/evalscope/perf → evalscope-0.17.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
  337. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  338. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
  339. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  340. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
  341. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  342. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
  343. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
  344. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  345. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  346. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  347. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  348. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  349. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  350. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  351. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  352. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  353. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  354. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  355. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  356. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  357. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  358. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  359. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  360. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  361. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  362. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  363. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  364. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  365. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  366. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  367. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  368. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  369. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  370. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  371. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  372. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  373. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  374. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  375. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  376. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  377. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  378. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
  379. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  380. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
  381. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  382. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  383. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
  384. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
  385. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
  386. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
  387. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
  388. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
  389. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
  390. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
  391. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  392. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  393. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  394. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
  395. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
  396. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
  397. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  398. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
  399. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
  400. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
  401. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
  402. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
  403. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
  404. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
  405. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  406. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  407. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
  408. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
  409. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
  410. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
  411. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  412. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/__init__.py +0 -0
  413. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/custom_model.py +0 -0
  414. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/custom/dummy_model.py +0 -0
  415. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/models/local_model.py +0 -0
  416. {evalscope-0.17.0/evalscope/third_party/thinkbench/tools → evalscope-0.17.1/evalscope/perf}/__init__.py +0 -0
  417. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  418. {evalscope-0.17.0/tests/rag → evalscope-0.17.1/evalscope/perf/utils}/__init__.py +0 -0
  419. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/analysis_result.py +0 -0
  420. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/handler.py +0 -0
  421. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/local_server.py +0 -0
  422. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/log_utils.py +0 -0
  423. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/perf/utils/rich_display.py +0 -0
  424. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/__init__.py +0 -0
  425. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/combinator.py +0 -0
  426. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/generator.py +0 -0
  427. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/report/utils.py +0 -0
  428. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/run.py +0 -0
  429. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/summarizer.py +0 -0
  430. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/__init__.py +0 -0
  431. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/README.md +0 -0
  432. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  433. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
  434. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  435. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  436. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  437. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  438. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  439. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  440. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  441. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  442. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  443. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  444. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  445. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  446. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  447. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
  448. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/eval.py +0 -0
  449. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/infer.py +0 -0
  450. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  451. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  452. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  453. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  454. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/README.md +0 -0
  455. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  456. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  457. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  458. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  459. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  460. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  461. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  462. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  463. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  464. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/__init__.py +0 -0
  465. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/argument_utils.py +0 -0
  466. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/chat_service.py +0 -0
  467. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/deprecation_utils.py +0 -0
  468. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/import_utils.py +0 -0
  469. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/logger.py +0 -0
  470. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope/utils/model_utils.py +0 -0
  471. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/dependency_links.txt +0 -0
  472. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/entry_points.txt +0 -0
  473. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/not-zip-safe +0 -0
  474. {evalscope-0.17.0 → evalscope-0.17.1}/evalscope.egg-info/top_level.txt +0 -0
  475. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/aigc.txt +0 -0
  476. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/docs.txt +0 -0
  477. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/framework.txt +0 -0
  478. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/opencompass.txt +0 -0
  479. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/perf.txt +0 -0
  480. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/rag.txt +0 -0
  481. {evalscope-0.17.0 → evalscope-0.17.1}/requirements/vlmeval.txt +0 -0
  482. {evalscope-0.17.0 → evalscope-0.17.1}/requirements.txt +0 -0
  483. {evalscope-0.17.0 → evalscope-0.17.1}/tests/__init__.py +0 -0
  484. {evalscope-0.17.0 → evalscope-0.17.1}/tests/aigc/__init__.py +0 -0
  485. {evalscope-0.17.0 → evalscope-0.17.1}/tests/aigc/test_t2i.py +0 -0
  486. {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/__init__.py +0 -0
  487. {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_collection.py +0 -0
  488. {evalscope-0.17.0 → evalscope-0.17.1}/tests/cli/test_custom.py +0 -0
  489. {evalscope-0.17.0 → evalscope-0.17.1}/tests/perf/__init__.py +0 -0
  490. {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_clip_benchmark.py +0 -0
  491. {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_mteb.py +0 -0
  492. {evalscope-0.17.0 → evalscope-0.17.1}/tests/rag/test_ragas.py +0 -0
  493. {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/__init__.py +0 -0
  494. {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_eval.py +0 -0
  495. {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  496. {evalscope-0.17.0 → evalscope-0.17.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  497. {evalscope-0.17.0 → evalscope-0.17.1}/tests/test_run_all.py +0 -0
  498. {evalscope-0.17.0 → evalscope-0.17.1}/tests/utils.py +0 -0
  499. {evalscope-0.17.0 → evalscope-0.17.1}/tests/vlm/__init__.py +0 -0
  500. {evalscope-0.17.0 → evalscope-0.17.1}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,19 +1,20 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.17.0
3
+ Version: 0.17.1
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
7
7
  Author-email: contact@modelscope.cn
8
+ License: Apache License 2.0
8
9
  Keywords: python,llm,evaluation
9
10
  Classifier: Development Status :: 4 - Beta
10
- Classifier: License :: OSI Approved :: Apache Software License
11
11
  Classifier: Operating System :: OS Independent
12
12
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.8
14
13
  Classifier: Programming Language :: Python :: 3.9
15
14
  Classifier: Programming Language :: Python :: 3.10
16
- Requires-Python: >=3.8
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.9
17
18
  Description-Content-Type: text/markdown
18
19
  Provides-Extra: opencompass
19
20
  Provides-Extra: vlmeval
@@ -22,6 +23,7 @@ Provides-Extra: perf
22
23
  Provides-Extra: app
23
24
  Provides-Extra: aigc
24
25
  Provides-Extra: dev
26
+ Provides-Extra: docs
25
27
  Provides-Extra: all
26
28
  License-File: LICENSE
27
29
 
@@ -64,16 +66,17 @@ License-File: LICENSE
64
66
  - [Basic Parameter](#basic-parameter)
65
67
  - [Output Results](#output-results)
66
68
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
67
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
69
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
68
70
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
69
- - [Parameter](#parameter)
70
- - [Evaluation Backend](#evaluation-backend)
71
+ - [Parameter Description](#parameter-description)
72
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
71
73
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
72
74
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
73
- - [🏟️ Arena Mode](#️-arena-mode)
75
+ - [⚔️ Arena Mode](#️-arena-mode)
74
76
  - [👷‍♂️ Contribution](#️-contribution)
77
+ - [📚 Citation](#-citation)
75
78
  - [🔜 Roadmap](#-roadmap)
76
- - [Star History](#star-history)
79
+ - [Star History](#-star-history)
77
80
 
78
81
 
79
82
  ## 📝 Introduction
@@ -137,7 +140,9 @@ Please scan the QR code below to join our community groups:
137
140
 
138
141
 
139
142
  ## 🎉 News
140
-
143
+ - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
144
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
145
+ - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
141
146
  - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
142
147
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
143
148
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -149,6 +154,8 @@ Please scan the QR code below to join our community groups:
149
154
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
150
155
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
151
156
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
157
+ <details><summary>More</summary>
158
+
152
159
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
153
160
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
154
161
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -158,8 +165,6 @@ Please scan the QR code below to join our community groups:
158
165
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
159
166
  - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
160
167
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
161
- <details><summary>More</summary>
162
-
163
168
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
164
169
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including the AIME24, MATH-500, and GPQA-Diamond datasets; refer to the [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html). Also added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
165
170
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -255,33 +260,31 @@ evalscope eval \
255
260
 
256
261
  When using Python code for evaluation, submit the evaluation task with the `run_task` function, passing a `TaskConfig` as the parameter. The configuration can also be a Python dictionary, a YAML file path, or a JSON file path, for example:
257
262
 
258
- **Using Python Dictionary**
263
+ **Using `TaskConfig`**
259
264
 
260
265
  ```python
261
- from evalscope.run import run_task
266
+ from evalscope import run_task, TaskConfig
262
267
 
263
- task_cfg = {
264
- 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
265
- 'datasets': ['gsm8k', 'arc'],
266
- 'limit': 5
267
- }
268
+ task_cfg = TaskConfig(
269
+ model='Qwen/Qwen2.5-0.5B-Instruct',
270
+ datasets=['gsm8k', 'arc'],
271
+ limit=5
272
+ )
268
273
 
269
274
  run_task(task_cfg=task_cfg)
270
275
  ```
271
-
272
276
  <details><summary>More Startup Methods</summary>
273
277
 
274
- **Using `TaskConfig`**
278
+ **Using Python Dictionary**
275
279
 
276
280
  ```python
277
281
  from evalscope.run import run_task
278
- from evalscope.config import TaskConfig
279
282
 
280
- task_cfg = TaskConfig(
281
- model='Qwen/Qwen2.5-0.5B-Instruct',
282
- datasets=['gsm8k', 'arc'],
283
- limit=5
284
- )
283
+ task_cfg = {
284
+ 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
285
+ 'datasets': ['gsm8k', 'arc'],
286
+ 'limit': 5
287
+ }
285
288
 
286
289
  run_task(task_cfg=task_cfg)
287
290
  ```
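Since `run_task` also accepts a YAML or JSON file path, as noted above, the same task can be kept in a configuration file. A minimal sketch is shown below; the file name `eval_config.yaml` is hypothetical, and the fields mirror the dictionary example:

```python
from evalscope import run_task

# eval_config.yaml (hypothetical file) would contain the same fields, e.g.:
#   model: Qwen/Qwen2.5-0.5B-Instruct
#   datasets:
#     - gsm8k
#     - arc
#   limit: 5
run_task(task_cfg='eval_config.yaml')
```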
@@ -384,7 +387,7 @@ To create a public link, set `share=True` in `launch()`.
384
387
 
385
388
  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
386
389
 
387
- ## 🌐 Evaluation of Specified Model API
390
+ ## 🌐 Evaluation of Model API
388
391
 
389
392
  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
390
393
 
@@ -435,7 +438,7 @@ evalscope eval \
435
438
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
436
439
 
437
440
 
438
- ## Evaluation Backend
441
+ ## 🧪 Other Evaluation Backends
439
442
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call an Evaluation Backend. The currently supported Evaluation Backends include:
440
443
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
441
444
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -508,6 +511,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
508
511
  </table>
509
512
  </a>
510
513
 
514
+ ## 📚 Citation
515
+
516
+ ```bibtex
517
+ @misc{evalscope_2024,
518
+ title={{EvalScope}: Evaluation Framework for Large Models},
519
+ author={ModelScope Team},
520
+ year={2024},
521
+ url={https://github.com/modelscope/evalscope}
522
+ }
523
+ ```
524
+
511
525
  ## 🔜 Roadmap
512
526
  - [x] Support for better evaluation report visualization
513
527
  - [x] Support for mixed evaluations across multiple datasets
@@ -523,6 +537,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
523
537
  - [x] MBPP
524
538
 
525
539
 
526
- ## Star History
540
+ ## ⭐ Star History
527
541
 
528
542
  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -37,16 +37,17 @@
37
37
  - [Basic Parameter](#basic-parameter)
38
38
  - [Output Results](#output-results)
39
39
  - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
40
- - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
40
+ - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
41
41
  - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
42
- - [Parameter](#parameter)
43
- - [Evaluation Backend](#evaluation-backend)
42
+ - [Parameter Description](#parameter-description)
43
+ - [🧪 Other Evaluation Backends](#-other-evaluation-backends)
44
44
  - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
45
45
  - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
46
- - [🏟️ Arena Mode](#️-arena-mode)
46
+ - [⚔️ Arena Mode](#️-arena-mode)
47
47
  - [👷‍♂️ Contribution](#️-contribution)
48
+ - [📚 Citation](#-citation)
48
49
  - [🔜 Roadmap](#-roadmap)
49
- - [Star History](#star-history)
50
+ - [Star History](#-star-history)
50
51
 
51
52
 
52
53
  ## 📝 Introduction
@@ -110,7 +111,9 @@ Please scan the QR code below to join our community groups:
110
111
 
111
112
 
112
113
  ## 🎉 News
113
-
114
+ - 🔥 **[2025.07.18]** The model stress-testing tool now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
115
+ - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
116
+ - 🔥 **[2025.07.14]** Added support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
114
117
  - 🔥 **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
115
118
  - 🔥 **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
116
119
  - 🔥 **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
@@ -122,6 +125,8 @@ Please scan the QR code below to join our community groups:
122
125
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
123
126
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
124
127
  - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
128
+ <details><summary>More</summary>
129
+
125
130
  - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
126
131
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
127
132
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -131,8 +136,6 @@ Please scan the QR code below to join our community groups:
131
136
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
132
137
  - 🔥 **[2025.02.27]** Added support for evaluating the reasoning efficiency of models. Refer to [📖 Best Practices for Evaluating Thinking Efficiency](https://evalscope.readthedocs.io/en/latest/best_practice/think_eval.html). This implementation is inspired by the works [Overthinking](https://doi.org/10.48550/arXiv.2412.21187) and [Underthinking](https://doi.org/10.48550/arXiv.2501.18585).
133
138
  - 🔥 **[2025.02.25]** Added support for two model inference-related evaluation benchmarks: [MuSR](https://modelscope.cn/datasets/AI-ModelScope/MuSR) and [ProcessBench](https://www.modelscope.cn/datasets/Qwen/ProcessBench/summary). To use them, simply specify `musr` and `process_bench` respectively in the datasets parameter.
134
- <details><summary>More</summary>
135
-
136
139
  - 🔥 **[2025.02.18]** Supports the AIME25 dataset, which contains 15 questions (Grok3 scored 93 on this dataset).
137
140
  - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including the AIME24, MATH-500, and GPQA-Diamond datasets; refer to the [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html). Also added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
138
141
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
@@ -228,33 +231,31 @@ evalscope eval \
228
231
 
229
232
  When using Python code for evaluation, submit the evaluation task with the `run_task` function, passing a `TaskConfig` as the parameter. The configuration can also be a Python dictionary, a YAML file path, or a JSON file path, for example:
230
233
 
231
- **Using Python Dictionary**
234
+ **Using `TaskConfig`**
232
235
 
233
236
  ```python
234
- from evalscope.run import run_task
237
+ from evalscope import run_task, TaskConfig
235
238
 
236
- task_cfg = {
237
- 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
238
- 'datasets': ['gsm8k', 'arc'],
239
- 'limit': 5
240
- }
239
+ task_cfg = TaskConfig(
240
+ model='Qwen/Qwen2.5-0.5B-Instruct',
241
+ datasets=['gsm8k', 'arc'],
242
+ limit=5
243
+ )
241
244
 
242
245
  run_task(task_cfg=task_cfg)
243
246
  ```
244
-
245
247
  <details><summary>More Startup Methods</summary>
246
248
 
247
- **Using `TaskConfig`**
249
+ **Using Python Dictionary**
248
250
 
249
251
  ```python
250
252
  from evalscope.run import run_task
251
- from evalscope.config import TaskConfig
252
253
 
253
- task_cfg = TaskConfig(
254
- model='Qwen/Qwen2.5-0.5B-Instruct',
255
- datasets=['gsm8k', 'arc'],
256
- limit=5
257
- )
254
+ task_cfg = {
255
+ 'model': 'Qwen/Qwen2.5-0.5B-Instruct',
256
+ 'datasets': ['gsm8k', 'arc'],
257
+ 'limit': 5
258
+ }
258
259
 
259
260
  run_task(task_cfg=task_cfg)
260
261
  ```
@@ -357,7 +358,7 @@ To create a public link, set `share=True` in `launch()`.
357
358
 
358
359
  For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
359
360
 
360
- ## 🌐 Evaluation of Specified Model API
361
+ ## 🌐 Evaluation of Model API
361
362
 
362
363
  Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
363
364
 
@@ -408,7 +409,7 @@ evalscope eval \
408
409
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
409
410
 
410
411
 
411
- ## Evaluation Backend
412
+ ## 🧪 Other Evaluation Backends
412
413
  EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call an Evaluation Backend. The currently supported Evaluation Backends include:
413
414
  - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
414
415
  - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
@@ -481,6 +482,17 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
481
482
  </table>
482
483
  </a>
483
484
 
485
+ ## 📚 Citation
486
+
487
+ ```bibtex
488
+ @misc{evalscope_2024,
489
+ title={{EvalScope}: Evaluation Framework for Large Models},
490
+ author={ModelScope Team},
491
+ year={2024},
492
+ url={https://github.com/modelscope/evalscope}
493
+ }
494
+ ```
495
+
484
496
  ## 🔜 Roadmap
485
497
  - [x] Support for better evaluation report visualization
486
498
  - [x] Support for mixed evaluations across multiple datasets
@@ -496,6 +508,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
496
508
  - [x] MBPP
497
509
 
498
510
 
499
- ## Star History
511
+ ## ⭐ Star History
500
512
 
501
513
  [![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
35
35
  @Benchmark.register(
36
36
  name='bfcl_v3',
37
37
  pretty_name='BFCL-v3',
38
- tags=['Agent'],
38
+ tags=['Agent', 'Function Calling'],
39
39
  description=
40
40
  'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
41
41
  'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
@@ -168,6 +168,11 @@ class DataAdapter(ABC):
168
168
  If you want to support local dataset, please rewrite this method in xxx_data_adapter.
169
169
  Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
170
170
  """
171
+ # remove the dataset_infos.json file if it exists, since MsDataset will raise an error when it is present.
172
+ dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
173
+ if os.path.exists(dataset_infos_path):
174
+ logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
175
+ os.remove(dataset_infos_path)
171
176
  return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)
172
177
 
173
178
  def load_with_snapshot(self,
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
382
387
  pass
383
388
 
384
389
  def gen_prompt_data(self,
385
- prompt: str,
390
+ prompt: str = '',
386
391
  system_prompt: Optional[str] = None,
387
392
  choices: Optional[List[str]] = None,
388
393
  index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
413
418
  system_prompt=system_prompt or self.system_prompt,
414
419
  index=index or 0,
415
420
  id=id,
416
- messages=messages)
421
+ messages=messages,
422
+ extra_data=kwargs.get('extra_data', None))
417
423
  return prompt_data.to_dict()
418
424
 
419
425
  def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -477,7 +483,6 @@ class DataAdapter(ABC):
477
483
  """
478
484
  return result
479
485
 
480
- @abstractmethod
481
486
  def match(self, gold: Any, pred: Any) -> Any:
482
487
  """
483
488
  Match the gold answer and the predicted answer.
@@ -491,7 +496,7 @@ class DataAdapter(ABC):
491
496
  Returns:
492
497
  The match result. Usually a score (float) for chat/multiple-choice-questions.
493
498
  """
494
- raise NotImplementedError
499
+ return 1.0 if gold == pred else 0.0
495
500
 
496
501
  def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
497
502
  """
@@ -17,7 +17,8 @@ logger = get_logger()
17
17
  @Benchmark.register(
18
18
  name='general_mcq',
19
19
  pretty_name='General-MCQ',
20
- description='A general multiple-choice question answering dataset.',
20
+ description='A general multiple-choice question answering dataset for custom evaluation. '
21
+ 'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#mcq).',
21
22
  tags=['MCQ', 'Custom'],
22
23
  dataset_id='general_mcq',
23
24
  model_adapter=OutputType.GENERATION,
@@ -14,7 +14,8 @@ logger = get_logger()
14
14
  @Benchmark.register(
15
15
  name='general_qa',
16
16
  pretty_name='General-QA',
17
- description='General Question Answering dataset',
17
+ description='A general question answering dataset for custom evaluation. '
18
+ 'For detailed instructions on how to use this benchmark, please refer to the [User Guide](https://evalscope.readthedocs.io/zh-cn/latest/advanced_guides/custom_dataset/llm.html#qa).', # noqa: E501
18
19
  tags=['QA', 'Custom'],
19
20
  dataset_id='general_qa',
20
21
  subset_list=['default'],
@@ -0,0 +1,118 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from typing import Any, List
4
+
5
+ from evalscope.benchmarks import Benchmark, DataAdapter
6
+ from evalscope.metrics import DEFAULT_PROMPT_TEMPLATE, LLMJudge, exact_match, mean
7
+ from evalscope.utils.logger import get_logger
8
+
9
+ # flake8: noqa
10
+
11
+ logger = get_logger()
12
+
13
+ SUBSET_LIST = [
14
+ 'Biology/Medicine',
15
+ 'Chemistry',
16
+ 'Computer Science/AI',
17
+ 'Engineering',
18
+ 'Humanities/Social Science',
19
+ 'Math',
20
+ 'Physics',
21
+ 'Other',
22
+ ]
23
+
24
+
25
+ @Benchmark.register(
26
+ name='hle',
27
+ pretty_name="Humanity's-Last-Exam",
28
+ tags=['Knowledge', 'QA'],
29
+ description=
30
+ 'Humanity\'s Last Exam (HLE) is a language model benchmark consisting of 2,500 questions across a broad range of subjects. It was created jointly by the Center for AI Safety and Scale AI. The benchmark classifies the questions into the following broad subjects: mathematics (41%), physics (9%), biology/medicine (11%), humanities/social science (9%), computer science/artificial intelligence (10%), engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions require the ability to understand both text and images, i.e., multi-modality. 24% of the questions are multiple-choice; the rest are short-answer, exact-match questions.', # noqa: E501
31
+ dataset_id='cais/hle',
32
+ subset_list=SUBSET_LIST,
33
+ metric_list=['AverageAccuracy'],
34
+ few_shot_num=0,
35
+ train_split=None,
36
+ eval_split='test',
37
+ prompt_template='{query}\n\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
38
+ )
39
+ class HLEAdapter(DataAdapter):
40
+
41
+ def __init__(self, *args, **kwargs):
42
+ super().__init__(*args, **kwargs)
43
+
44
+ self.llm_as_a_judge = True
45
+
46
+ def load(self, **kwargs):
47
+ kwargs['subset_list'] = ['default']
48
+ data_dict = super().load(**kwargs)
49
+ return self.reformat_subset(data_dict, subset_key='category', format='{}')
50
+
51
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
52
+ # remove image preview
53
+ input_d.pop('image_preview', None)
54
+ input_d.pop('rationale_image', None)
55
+ # generate prompt
56
+ question = input_d['question']
57
+ prompt = self.prompt_template.format(query=question)
58
+ image = input_d.get('image', None)
59
+ # build messages for multi-modal input
60
+ messages = []
61
+ if self.system_prompt:
62
+ messages.append({'role': 'system', 'content': self.system_prompt})
63
+ if image:
64
+ messages.append({
65
+ 'role':
66
+ 'user',
67
+ 'content': [{
68
+ 'type': 'text',
69
+ 'text': prompt
70
+ }, {
71
+ 'type': 'image_url',
72
+ 'image_url': {
73
+ 'url': image
74
+ }
75
+ }]
76
+ })
77
+ else:
78
+ messages.append({'role': 'user', 'content': prompt})
79
+ return self.gen_prompt_data(prompt='', messages=messages)
80
+
81
+ def get_gold_answer(self, input_d: dict) -> str:
82
+ return input_d['answer']
83
+
84
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
85
+ # Extract the answer from the model output \boxed{answer}
86
+ match = re.search(r'\\boxed{([^}]*)}', result)
87
+ if match:
88
+ return match.group(1).strip()
89
+ else:
90
+ logger.warning(f'No answer found in the model output: {result}')
91
+ return ''
92
+
93
+ def llm_parse_pred_result(self, result, raw_input_d=None, **kwargs) -> str:
94
+ return result.strip()
95
+
96
+ def match(self, gold: str, pred: str) -> dict:
97
+ # simple match
98
+ return {
99
+ 'AverageAccuracy': 1.0 if exact_match(gold, pred) else 0.0,
100
+ }
101
+
102
+ def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
103
+ raw_input = kwargs.get('raw_input', None)
104
+ question = raw_input['question']
105
+ # get grading response
106
+ prompt = judge.build_prompt(pred, gold, question)
107
+ judge_response = judge(prompt)
108
+ score = judge.get_score(judge_response)
109
+ return {
110
+ 'AverageAccuracy': score,
111
+ 'response': judge_response,
112
+ }
113
+
114
+ def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
115
+ # zip dict answers
116
+ res_dict = super().compute_dict_metric(review_res_list, **kwargs)
117
+
118
+ return super().compute_metric(res_dict, **kwargs)
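For context, the adapter above registers the benchmark under the name `hle` and enables LLM-as-a-judge scoring. A minimal usage sketch might look like the following; the served model name, endpoint, and the `judge_model_args` keys are assumptions based on EvalScope's LLM-judge configuration, not part of this diff, so adjust them to your own deployment:

```python
from evalscope import TaskConfig, run_task

# A sketch, not the package's documented example: evaluate an OpenAI-compatible
# model service on the newly registered 'hle' benchmark with an LLM judge.
task_cfg = TaskConfig(
    model='my-model',                      # hypothetical served model name
    api_url='http://127.0.0.1:8801/v1',    # hypothetical endpoint
    api_key='EMPTY',
    eval_type='service',
    datasets=['hle'],
    limit=10,
    judge_model_args={                     # assumed keys; see the LLM-judge docs
        'model_id': 'qwen2.5-72b-instruct',
        'api_url': 'http://127.0.0.1:8801/v1',
        'api_key': 'EMPTY',
    },
)
run_task(task_cfg=task_cfg)
```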
@@ -22,7 +22,8 @@ logger = get_logger()
22
22
  few_shot_num=0,
23
23
  train_split=None,
24
24
  eval_split='test',
25
- prompt_template='Complete the following python code:\n{query}',
25
+ prompt_template=
26
+ 'Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n{query}', # noqa: E501
26
27
  extra_params={
27
28
  'num_workers': 4,
28
29
  'timeout': 4
@@ -76,26 +77,9 @@ class HumanevalAdapter(DataAdapter):
76
77
 
77
78
  @classmethod
78
79
  def _postprocess(cls, text: str) -> str:
79
- if '```' in text:
80
- blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
81
- if len(blocks) == 0:
82
- text = text.split('```')[1] # fall back to default strategy
83
- else:
84
- text = blocks[0] # fetch the first code block
85
- if not text.startswith('\n'): # in case starting with ```python
86
- text = text[max(text.find('\n') + 1, 0):]
87
- if text.strip().startswith('from') or text.strip().startswith('import'):
88
- def_idx = text.find('def')
89
- if def_idx != -1:
90
- text = text[max(text.find('\n', def_idx) + 1, 0):]
91
- text = text.split('\n\n')[0]
92
- if text.strip().startswith('def'):
93
- text = '\n'.join(text.split('\n')[1:])
94
- if not text.startswith(' '):
95
- if text.startswith(' '):
96
- text = ' ' + text.lstrip()
97
- else:
98
- text = '\n'.join([' ' + line for line in text.split('\n')])
80
+ blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
81
+ if len(blocks) >= 1:
82
+ text = blocks[0]
99
83
  return text
100
84
 
101
85
  def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
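The simplified `_postprocess` above keeps only the first fenced code block found in the model output. A small, self-contained illustration of what the regex captures (the sample completion string is purely illustrative):

```python
import re

completion = "Here is my solution:\n```python\ndef add(a, b):\n    return a + b\n```\nHope this helps!"

# Same pattern as the adapter: capture the body of the first fenced block,
# skipping an optional language tag such as ```python.
blocks = re.findall(r'```\w*\n(.*?)```', completion, re.DOTALL)

print(blocks[0])
# def add(a, b):
#     return a + b
```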
@@ -144,7 +144,7 @@ SUBJECT_MAPPING = {
144
144
  output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
145
145
  subset_list=SUBSET_LIST,
146
146
  metric_list=['AverageAccuracy'],
147
- few_shot_num=5,
147
+ few_shot_num=0,
148
148
  train_split='train',
149
149
  eval_split='test',
150
150
  prompt_template=