evalscope 0.12.1__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (352)
  1. {evalscope-0.12.1/evalscope.egg-info → evalscope-0.13.0}/PKG-INFO +25 -10
  2. {evalscope-0.12.1 → evalscope-0.13.0}/README.md +24 -9
  3. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/arguments.py +6 -1
  4. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/arc/arc_adapter.py +3 -3
  5. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/benchmark.py +3 -2
  6. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  7. evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  8. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  9. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/data_adapter.py +32 -4
  10. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  11. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  12. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  13. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope-0.13.0/evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope-0.13.0/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  23. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/collections/evaluator.py +3 -3
  25. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/config.py +6 -1
  26. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/constants.py +7 -0
  27. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/evaluator/evaluator.py +51 -13
  28. evalscope-0.13.0/evalscope/metrics/llm_judge.py +104 -0
  29. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/benchmark.py +5 -0
  30. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/http_client.py +9 -1
  31. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/main.py +1 -0
  32. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/run.py +1 -1
  33. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/infer.py +1 -1
  34. evalscope-0.13.0/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  35. evalscope-0.13.0/evalscope/version.py +4 -0
  36. {evalscope-0.12.1 → evalscope-0.13.0/evalscope.egg-info}/PKG-INFO +25 -10
  37. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/SOURCES.txt +13 -0
  38. evalscope-0.13.0/tests/cli/test_all.py +144 -0
  39. {evalscope-0.12.1 → evalscope-0.13.0}/tests/cli/test_collection.py +27 -1
  40. {evalscope-0.12.1 → evalscope-0.13.0}/tests/cli/test_run.py +72 -10
  41. evalscope-0.13.0/tests/rag/__init__.py +0 -0
  42. evalscope-0.12.1/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +0 -20
  43. evalscope-0.12.1/evalscope/version.py +0 -4
  44. {evalscope-0.12.1 → evalscope-0.13.0}/LICENSE +0 -0
  45. {evalscope-0.12.1 → evalscope-0.13.0}/MANIFEST.in +0 -0
  46. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/__init__.py +0 -0
  47. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/__init__.py +0 -0
  48. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/base.py +0 -0
  49. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/__init__.py +0 -0
  50. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  51. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/backend_manager.py +0 -0
  52. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  53. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  54. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  55. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/__init__.py +0 -0
  56. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  57. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  58. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  59. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  60. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  61. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  62. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  63. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  64. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  65. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  66. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  67. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  68. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  69. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  70. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  71. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  72. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  73. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  74. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  75. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  76. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  77. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  78. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  79. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  80. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  81. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  82. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  83. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  84. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  85. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  86. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  87. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  88. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  89. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  90. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  91. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  92. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  93. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  94. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  95. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  96. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/__init__.py +0 -0
  97. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/aime/__init__.py +0 -0
  98. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  99. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  100. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/arc/__init__.py +0 -0
  101. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  102. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/__init__.py +0 -0
  103. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  104. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  105. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  106. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  107. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  108. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  109. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  110. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  111. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  112. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  113. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  114. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  115. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  116. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  117. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  118. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  119. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  120. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  121. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  122. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  123. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  124. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  125. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  126. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  127. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  128. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  129. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  130. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  131. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ceval/__init__.py +0 -0
  132. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  133. {evalscope-0.12.1/evalscope/benchmarks/data_collection → evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  134. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  135. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  136. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  137. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  138. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  139. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  140. {evalscope-0.12.1/evalscope/benchmarks/general_mcq → evalscope-0.13.0/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  141. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  142. {evalscope-0.12.1/evalscope/benchmarks/gpqa → evalscope-0.13.0/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  143. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  144. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  145. {evalscope-0.12.1/evalscope/benchmarks/ifeval → evalscope-0.13.0/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  146. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  147. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  148. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  149. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  150. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  151. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  152. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  153. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  154. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  155. {evalscope-0.12.1/evalscope/benchmarks/iquiz → evalscope-0.13.0/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  156. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  157. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  158. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  159. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  160. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ifeval/utils.py +0 -0
  161. {evalscope-0.12.1/evalscope/benchmarks/math_500 → evalscope-0.13.0/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  162. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  163. {evalscope-0.12.1/evalscope/benchmarks/mmlu_pro → evalscope-0.13.0/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  164. {evalscope-0.12.1/evalscope/benchmarks/musr → evalscope-0.13.0/evalscope/benchmarks/math_500}/__init__.py +0 -0
  165. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  166. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  167. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  168. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  169. {evalscope-0.12.1/evalscope/benchmarks/process_bench → evalscope-0.13.0/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  170. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  171. {evalscope-0.12.1/evalscope/benchmarks/simple_qa → evalscope-0.13.0/evalscope/benchmarks/musr}/__init__.py +0 -0
  172. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  173. {evalscope-0.12.1/evalscope/benchmarks/super_gpqa → evalscope-0.13.0/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  174. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  175. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  176. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/race/__init__.py +0 -0
  177. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/race/race.py +0 -0
  178. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/race/race_adapter.py +0 -0
  179. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/race/samples.jsonl +0 -0
  180. {evalscope-0.12.1/evalscope/perf → evalscope-0.13.0/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  181. {evalscope-0.12.1/evalscope/perf/utils → evalscope-0.13.0/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  182. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  183. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  184. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  185. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  186. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  187. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  188. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  189. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  190. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  191. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  192. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/utils.py +0 -0
  193. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/__init__.py +0 -0
  194. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/base.py +0 -0
  195. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/cli.py +0 -0
  196. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/start_app.py +0 -0
  197. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/start_eval.py +0 -0
  198. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/start_perf.py +0 -0
  199. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/cli/start_server.py +0 -0
  200. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/collections/__init__.py +0 -0
  201. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/collections/sampler.py +0 -0
  202. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/collections/schema.py +0 -0
  203. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/evaluator/__init__.py +0 -0
  204. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/evaluator/rating_eval.py +0 -0
  205. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/evaluator/reviewer/__init__.py +0 -0
  206. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  207. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/__init__.py +0 -0
  208. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  209. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  210. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/code_metric.py +0 -0
  211. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/math_parser.py +0 -0
  212. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/metrics.py +0 -0
  213. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/named_metrics.py +0 -0
  214. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  215. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  216. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/metrics/rouge_metric.py +0 -0
  217. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/__init__.py +0 -0
  218. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/base_adapter.py +0 -0
  219. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/chat_adapter.py +0 -0
  220. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/choice_adapter.py +0 -0
  221. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/custom/__init__.py +0 -0
  222. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/custom/custom_model.py +0 -0
  223. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/custom/dummy_model.py +0 -0
  224. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/custom_adapter.py +0 -0
  225. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/local_model.py +0 -0
  226. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/model.py +0 -0
  227. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/register.py +0 -0
  228. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/models/server_adapter.py +0 -0
  229. {evalscope-0.12.1/evalscope/third_party/thinkbench/tools → evalscope-0.13.0/evalscope/perf}/__init__.py +0 -0
  230. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/arguments.py +0 -0
  231. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/__init__.py +0 -0
  232. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/api/__init__.py +0 -0
  233. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/api/base.py +0 -0
  234. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/api/custom_api.py +0 -0
  235. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  236. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/api/openai_api.py +0 -0
  237. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  238. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/base.py +0 -0
  239. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/custom.py +0 -0
  240. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  241. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  242. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  243. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  244. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  245. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/plugin/registry.py +0 -0
  246. {evalscope-0.12.1/tests/rag → evalscope-0.13.0/evalscope/perf/utils}/__init__.py +0 -0
  247. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/utils/analysis_result.py +0 -0
  248. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/utils/benchmark_util.py +0 -0
  249. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/utils/db_util.py +0 -0
  250. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/utils/handler.py +0 -0
  251. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/perf/utils/local_server.py +0 -0
  252. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/__init__.py +0 -0
  253. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/config/cfg_arena.yaml +0 -0
  254. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  255. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  256. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/config/cfg_single.yaml +0 -0
  257. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  258. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  259. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  260. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  261. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/data/question.jsonl +0 -0
  262. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/arc.yaml +0 -0
  263. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/bbh.yaml +0 -0
  264. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  265. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/ceval.yaml +0 -0
  266. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  267. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  268. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  269. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/general_qa.yaml +0 -0
  270. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  271. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/mmlu.yaml +0 -0
  272. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  273. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/report/__init__.py +0 -0
  274. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/report/app.py +0 -0
  275. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/report/combinator.py +0 -0
  276. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/report/generator.py +0 -0
  277. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/report/utils.py +0 -0
  278. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/run_arena.py +0 -0
  279. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/summarizer.py +0 -0
  280. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/__init__.py +0 -0
  281. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/README.md +0 -0
  282. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/__init__.py +0 -0
  283. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/default_task.json +0 -0
  284. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  285. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/eval.py +0 -0
  286. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  287. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  288. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  289. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  290. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  291. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  292. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  293. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  294. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  295. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/longbench_write/utils.py +0 -0
  296. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/__init__.py +0 -0
  297. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/eval.py +0 -0
  298. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/infer.py +0 -0
  299. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  300. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  301. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  302. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  303. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/README.md +0 -0
  304. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  305. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  306. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  307. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/eval.py +0 -0
  308. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/infer.py +0 -0
  309. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  310. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  311. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  312. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  313. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/__init__.py +0 -0
  314. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/arena_utils.py +0 -0
  315. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/chat_service.py +0 -0
  316. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/completion_parsers.py +0 -0
  317. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/filters.py +0 -0
  318. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/io_utils.py +0 -0
  319. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/logger.py +0 -0
  320. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/model_utils.py +0 -0
  321. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope/utils/utils.py +0 -0
  322. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/dependency_links.txt +0 -0
  323. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/entry_points.txt +0 -0
  324. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/not-zip-safe +0 -0
  325. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/requires.txt +0 -0
  326. {evalscope-0.12.1 → evalscope-0.13.0}/evalscope.egg-info/top_level.txt +0 -0
  327. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/app.txt +0 -0
  328. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/docs.txt +0 -0
  329. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/framework.txt +0 -0
  330. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/inner.txt +0 -0
  331. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/opencompass.txt +0 -0
  332. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/perf.txt +0 -0
  333. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/rag.txt +0 -0
  334. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/tests.txt +0 -0
  335. {evalscope-0.12.1 → evalscope-0.13.0}/requirements/vlmeval.txt +0 -0
  336. {evalscope-0.12.1 → evalscope-0.13.0}/requirements.txt +0 -0
  337. {evalscope-0.12.1 → evalscope-0.13.0}/setup.cfg +0 -0
  338. {evalscope-0.12.1 → evalscope-0.13.0}/setup.py +0 -0
  339. {evalscope-0.12.1 → evalscope-0.13.0}/tests/__init__.py +0 -0
  340. {evalscope-0.12.1 → evalscope-0.13.0}/tests/cli/__init__.py +0 -0
  341. {evalscope-0.12.1 → evalscope-0.13.0}/tests/perf/__init__.py +0 -0
  342. {evalscope-0.12.1 → evalscope-0.13.0}/tests/perf/test_perf.py +0 -0
  343. {evalscope-0.12.1 → evalscope-0.13.0}/tests/rag/test_clip_benchmark.py +0 -0
  344. {evalscope-0.12.1 → evalscope-0.13.0}/tests/rag/test_mteb.py +0 -0
  345. {evalscope-0.12.1 → evalscope-0.13.0}/tests/rag/test_ragas.py +0 -0
  346. {evalscope-0.12.1 → evalscope-0.13.0}/tests/swift/__init__.py +0 -0
  347. {evalscope-0.12.1 → evalscope-0.13.0}/tests/swift/test_run_swift_eval.py +0 -0
  348. {evalscope-0.12.1 → evalscope-0.13.0}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  349. {evalscope-0.12.1 → evalscope-0.13.0}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  350. {evalscope-0.12.1 → evalscope-0.13.0}/tests/test_run_all.py +0 -0
  351. {evalscope-0.12.1 → evalscope-0.13.0}/tests/vlm/__init__.py +0 -0
  352. {evalscope-0.12.1 → evalscope-0.13.0}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.12.1/evalscope.egg-info → evalscope-0.13.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.12.1
+ Version: 0.13.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -175,16 +175,29 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#-introduction)
- - [News](#-news)
- - [Installation](#️-installation)
- - [Quick Start](#-quick-start)
+ - [📋 Contents](#-contents)
+ - [📝 Introduction](#-introduction)
+ - [☎ User Groups](#-user-groups)
+ - [🎉 News](#-news)
+ - [🛠️ Installation](#️-installation)
+ - [Method 1: Install Using pip](#method-1-install-using-pip)
+ - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🚀 Quick Start](#-quick-start)
+ - [Method 1. Using Command Line](#method-1-using-command-line)
+ - [Method 2. Using Python Code](#method-2-using-python-code)
+ - [Basic Parameter](#basic-parameter)
+ - [Output Results](#output-results)
+ - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+ - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+ - [Parameter](#parameter)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
- - [Arena Mode](#-arena-mode)
- - [Contribution](#️-contribution)
- - [Roadmap](#-roadmap)
+ - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [🏟️ Arena Mode](#️-arena-mode)
+ - [👷‍♂️ Contribution](#️-contribution)
+ - [🔜 Roadmap](#-roadmap)
+ - [Star History](#star-history)


  ## 📝 Introduction
@@ -226,6 +239,8 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
{evalscope-0.12.1 → evalscope-0.13.0}/README.md

@@ -24,16 +24,29 @@
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Contents
- - [Introduction](#-introduction)
- - [News](#-news)
- - [Installation](#️-installation)
- - [Quick Start](#-quick-start)
+ - [📋 Contents](#-contents)
+ - [📝 Introduction](#-introduction)
+ - [☎ User Groups](#-user-groups)
+ - [🎉 News](#-news)
+ - [🛠️ Installation](#️-installation)
+ - [Method 1: Install Using pip](#method-1-install-using-pip)
+ - [Method 2: Install from Source](#method-2-install-from-source)
+ - [🚀 Quick Start](#-quick-start)
+ - [Method 1. Using Command Line](#method-1-using-command-line)
+ - [Method 2. Using Python Code](#method-2-using-python-code)
+ - [Basic Parameter](#basic-parameter)
+ - [Output Results](#output-results)
+ - [📈 Visualization of Evaluation Results](#-visualization-of-evaluation-results)
+ - [🌐 Evaluation of Specified Model API](#-evaluation-of-specified-model-api)
+ - [⚙️ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
+ - [Parameter](#parameter)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
- - [Arena Mode](#-arena-mode)
- - [Contribution](#️-contribution)
- - [Roadmap](#-roadmap)
+ - [📈 Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [🖊️ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [🏟️ Arena Mode](#️-arena-mode)
+ - [👷‍♂️ Contribution](#️-contribution)
+ - [🔜 Roadmap](#-roadmap)
+ - [Star History](#star-history)


  ## 📝 Introduction
@@ -75,6 +88,8 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
  - 🔥 **[2025.03.03]** Added support for evaluating the IQ and EQ of models. Refer to [📖 Best Practices for IQ and EQ Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/iquiz.html) to find out how smart your AI is!
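
The two news entries above only name the new datasets. As a quick orientation, here is a minimal sketch of running them through the Python entry point described in the Quick Start section (`run_task`/`TaskConfig` from `evalscope.run`/`evalscope.config`); the `judge_model_args` field is an assumption mirroring the new `--judge-model-args` CLI flag, and the model id is illustrative:

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',          # illustrative model id
        datasets=['simple_qa', 'chinese_simpleqa', 'live_code_bench'],
        limit=5,                                     # small smoke-test run
        judge_model_args={'model_id': 'qwen-plus'},  # assumed field name, mirrors --judge-model-args
    )
    run_task(task_cfg=task_cfg)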
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/arguments.py

@@ -1,7 +1,7 @@
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType, OutputType
+ from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType


  class ParseStrArgsAction(argparse.Action):
@@ -73,6 +73,11 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
      parser.add_argument('--timeout', type=float, default=None, help='The timeout for the remote API model.')
      parser.add_argument('--stream', action='store_true', default=False, help='Stream mode.') # noqa: E501
+
+     # LLMJudge arguments
+     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
+     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
+     parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
      # yapf: enable

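Note on the new flags: because `--judge-model-args` is declared with `type=json.loads`, the CLI accepts a JSON string and hands a parsed dict downstream. A standalone illustration of just that parsing behaviour (flag names are the ones added above; the payload keys are illustrative):

    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('--judge-model-args', type=json.loads, default='{}')
    parser.add_argument('--judge-worker-num', type=int, default=8)

    args = parser.parse_args([
        '--judge-model-args', '{"model_id": "qwen-plus", "api_key": "sk-xxx"}',
        '--judge-worker-num', '4',
    ])
    # The JSON string arrives as a plain dict; the worker count as an int.
    print(type(args.judge_model_args).__name__, args.judge_model_args['model_id'], args.judge_worker_num)
    # dict qwen-plus 4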
 
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/arc/arc_adapter.py

@@ -26,7 +26,7 @@ logger = get_logger()
      train_split='train',
      eval_split='test',
      prompt_template=
-     'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n{query}',
+     'Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n{query}\nYour response should end with "The best answer is [the_answer_letter]" where the [the_answer_letter] is one of A, B, C or D.', # noqa
  )
  class ARCAdapter(DataAdapter):

@@ -134,7 +134,7 @@ class ARCAdapter(DataAdapter):
          if self.model_adapter == OutputType.MULTIPLE_CHOICE:
              return result
          else:
-             return ResponseParser.parse_first_capital(text=result, options=self.choices)
+             return ResponseParser.parse_first_option(text=result)

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)
@@ -149,8 +149,8 @@ class ARCAdapter(DataAdapter):
          choices_prompts: str = '\n'.join([label + '. ' + text for text, label in zip(choices_texts, choices_labels)])
          example += '\n' + choices_prompts

-         example += '\nAnswer:'
          if include_answer:
+             example += '\nAnswer:'
              example += ' {}\n\n'.format(input_d['answerKey'])

          return example
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/benchmark.py

@@ -1,6 +1,6 @@
  import copy
  from collections import OrderedDict
- from dataclasses import dataclass, field
+ from dataclasses import dataclass, field, fields
  from typing import TYPE_CHECKING, Dict, List, Optional

  from evalscope.constants import OutputType
@@ -29,6 +29,7 @@ class BenchmarkMeta:
      query_template: Optional[str] = None
      pretty_name: Optional[str] = None
      filters: Optional[OrderedDict] = None
+     extra_params: Optional[Dict] = field(default_factory=dict)

      def _update(self, args: dict):
          if args.get('local_path'):
@@ -40,7 +41,7 @@ class BenchmarkMeta:
          return self.__dict__

      def to_string_dict(self) -> dict:
-         cur_dict = copy.deepcopy(self.__dict__)
+         cur_dict = copy.deepcopy(self.to_dict())
          # cur_dict['data_adapter'] = self.data_adapter.__name__
          del cur_dict['data_adapter']
          return cur_dict
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/ceval/ceval_adapter.py

@@ -134,7 +134,8 @@ SUBJECT_MAPPING = {
      few_shot_num=0,
      train_split='dev',
      eval_split='val',
-     prompt_template='以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。\n{query}',
+     prompt_template=
+     '以下是中国关于{subset_name}考试的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:“答案是:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
  )
  class CEVALAdapter(DataAdapter):

evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py (new file)

@@ -0,0 +1,168 @@
+ import re
+ from collections import defaultdict
+ from typing import Any, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ GRADER_TEMPLATE = """
+ 请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为:【正确】、【错误】或【未尝试】。
+
+ 首先,我们将列出每个评定类别的示例,然后请您对新问题的预测答案进行评定。
+ 以下是【正确】的答复示例:
+ ```
+ 问题:贝拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:Malia Obama and Sasha Obama
+ 模型预测2:玛丽亚和萨沙
+ 模型预测3:大多数人会说是玛丽亚和萨莎,但我不确定,需要再确认
+ 模型预测4:巴拉克·奥巴马有两个女儿,她们分别是玛丽亚·安和娜塔莎·玛丽安,但通常称作玛丽亚·奥巴马和萨莎·奥巴马。玛丽亚出生于1998年7月4日,萨莎出生于2001年6月10日。
+ ```
+ 这些答复均为【正确】,因为:
+ - 完整地包含了标准答案中的重要信息。
+ - 不包含任何与标准答案矛盾的信息。
+ - 只关注语义内容,中英文,大小写、标点、语法和顺序不重要。
+ - 答复中出现模糊语句或猜测是可以接受的,前提是包含了标准答案且不含有不正确信息或矛盾。
+
+ 以下是【错误】的答复示例:
+ ```
+ 问题:巴拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:玛丽亚
+ 模型预测2:玛丽亚、萨莎和苏珊
+ 模型预测3:巴拉克·奥巴马没有孩子
+ 模型预测4:我认为是玛丽亚和萨莎。或者是玛丽亚和杰基。或者是乔伊和玛丽亚。
+ 模型预测5:虽然我不知道他们的确切名字,但能说出巴拉克·奥巴马有三个孩子。
+ 模型预测6:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
+ ```
+ 这些答复均为【错误】,因为:
+ - 答复中包含与标准答案矛盾的事实陈述。即使在陈述中略带保留(例如:“可能是”,“虽然我不确定,但我认为”),也视为错误。
+
+ 以下是【未尝试】的答复示例:
+ ```
+ 问题:巴拉克·奥巴马的孩子叫什么名字?
+ 标准答案:玛丽亚·奥巴马和萨莎·奥巴马
+ 模型预测1:我不知道。
+ 模型预测2:我需要更多关于您所指奥巴马的上下文。
+ 模型预测3:不查阅网络我无法回答这个问题,不过我知道巴拉克·奥巴马有两个孩子。
+ 模型预测4:巴拉克·奥巴马有两个孩子。我知道其中一个叫玛丽亚,但我不确定另一个的名字。
+ ```
+ 这些答复均为【未尝试】,因为:
+ - 没有包含标准答案中的重要信息。
+ - 回复中没有与标准答案矛盾的陈述。
+
+ 另外注意以下几点:
+ - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题“金山铁路黄浦江特大桥的全长是多少米?”,标准答案为“3518.17”:
+ - 预测答案“3518”、“3518.1”、“3518.17”均为【正确】。
+ - 预测答案“3520”和“3600”均为【错误】。
+ - 预测答案“大约3500米”和“超过3000米”被视为【未尝试】,因为它们既不确认也不与标准答案矛盾。
+ - 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
+ - 例如,考虑问题“菱镁矿的主要化学成分是什么?”标准答案为“碳酸镁(MgCO3)”。“碳酸镁”或“MgCO3”均视为【正确】答案。
+ - 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
+ - 例如,问题“巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?”标准答案为“意大利撒丁岛”,预测答案“撒丁岛”被视为【正确】。
+ - 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
+ - 例如,如果标准答案是“Robinson”,那么回答鲁滨逊或者鲁滨孙均正确。
+
+ 下面是一个新的问题示例。请只回复A、B、C之一,不要道歉或纠正自己的错误,只需要评估该回答。
+ ```
+ 问题: {question}
+ 正确答案: {target}
+ 预测答案: {predicted_answer}
+ ```
+
+ 将此新问题的预测答案评定为以下之一:
+ A:【正确】
+ B:【错误】
+ C:【未尝试】
+
+ 只返回字母"A"、"B"或"C",无须添加其他文本。
+ """.strip() # noqa E501
+
+ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应用科学', '生活、艺术与文化', '社会', '自然与自然科学']
+
+
+ @Benchmark.register(
+     name='chinese_simpleqa',
+     pretty_name='Chinese SimpleQA',
+     subset_list=SUBSET_LIST,
+     dataset_id='AI-ModelScope/Chinese-SimpleQA',
+     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train')
+ class ChineseSimpleQAAdapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # register metrics
+         metric_registry.register(Metric(name='is_correct', object=mean))
+         metric_registry.register(Metric(name='is_incorrect', object=mean))
+         metric_registry.register(Metric(name='is_not_attempted', object=mean))
+
+         # whether to use LLM as a judge
+         self.llm_as_a_judge = True
+
+     def load(self, **kwargs):
+         kwargs['subset_list'] = ['default']
+         data_dict = super().load(**kwargs)
+         return self.reformat_subset(data_dict, subset_key='primary_category', format='{}')
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         question = input_d['question']
+         return self.gen_prompt_data(question)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+         return result.strip()
+
+     def match(self, gold: str, pred: str) -> float:
+         # simple match
+         logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
+         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
+         is_incorrect = not is_correct
+         is_not_attempted = 0
+         return {
+             'is_correct': is_correct,
+             'is_incorrect': is_incorrect,
+             'is_not_attempted': is_not_attempted,
+         }
+
+     def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+         raw_input = kwargs.get('raw_input', None)
+         question = raw_input['question']
+         # get grading response
+         prompt = GRADER_TEMPLATE.format(question=question, target=gold, predicted_answer=pred)
+         system_prompt = '你是一个智能助手,请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。'
+         grading_response = judge(prompt, system_prompt)
+         # parse grading response
+         match = re.search(r'(A|B|C)', grading_response)
+         res = match.group(0) if match else 'C'
+         return {
+             'is_correct': 1 if res == 'A' else 0,
+             'is_incorrect': 1 if res == 'B' else 0,
+             'is_not_attempted': 1 if res == 'C' else 0,
+         }
+
+     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+         """
+         compute weighted mean of the bleu score of all samples
+
+         Args:
+             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
+         """
+         # zip dict answers
+         res_dict = defaultdict(list)
+         for res in review_res_list:
+             for key, value in res.items():
+                 res_dict[key].append(value)
+
+         return super().compute_metric(res_dict, **kwargs)
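
Each sample is scored as a 0/1 triple by `match` or `llm_match`, and `compute_metric` reports the mean of each key. A self-contained illustration of that aggregation step (sample values are made up):

    from collections import defaultdict

    review_res_list = [
        {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
        {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
        {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
        {'is_correct': 0, 'is_incorrect': 0, 'is_not_attempted': 1},
    ]

    # Collect per-metric lists, then average each one.
    res_dict = defaultdict(list)
    for res in review_res_list:
        for key, value in res.items():
            res_dict[key].append(value)

    print({k: sum(v) / len(v) for k, v in res_dict.items()})
    # {'is_correct': 0.5, 'is_incorrect': 0.25, 'is_not_attempted': 0.25}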
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py

@@ -111,7 +111,8 @@ SUBJECT_MAPPING = {
      few_shot_num=5,
      train_split='dev',
      eval_split='test',
-     prompt_template='以下是关于{subset_name}的单项选择题,请直接给出正确答案的选项。\n{query}',
+     prompt_template=
+     '以下是关于{subset_name}的单项选择题,请给出正确答案的选项。你的回答的最后一行应该是这样的格式:“答案:LETTER”(不带引号),其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
  )
  class CMMLUAdapter(DataAdapter):

{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/data_adapter.py

@@ -7,6 +7,7 @@ from typing import Any, List, Optional, Union

  from evalscope.benchmarks.utils import PromptData, preprocess_decorator
  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+ from evalscope.metrics.llm_judge import LLMJudge
  from evalscope.metrics.named_metrics import metric_registry
  from evalscope.report import Report, ReportGenerator
  from evalscope.utils.logger import get_logger
@@ -22,6 +23,7 @@ class DataAdapter(ABC):
                   model_adapter: str,
                   subset_list: list,
                   metric_list: List[str],
+                  llm_as_a_judge: bool = False,
                   few_shot_num: Optional[int] = 0,
                   train_split: Optional[str] = None,
                   eval_split: Optional[str] = None,
@@ -61,6 +63,7 @@ class DataAdapter(ABC):
          self.query_template = query_template
          self.pretty_name = pretty_name
          self.config_kwargs = kwargs
+         self.llm_as_a_judge = llm_as_a_judge
          self.category_map = kwargs.get('category_map', {})
          self.choices = kwargs.get('choices', None)

@@ -91,12 +94,14 @@ class DataAdapter(ABC):
          # Try to load dataset from local disk
          if os.path.exists(dataset_name_or_path):
              logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+             trust_remote_code = kwargs.pop('trust_remote_code', False)
              data_dict = self.load_from_disk(
-                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=False, **kwargs)
+                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
          else:
              logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+             trust_remote_code = kwargs.pop('trust_remote_code', True)
              data_dict = self.load_from_hub(
-                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=True, **kwargs)
+                 dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
          if len(data_dict) == 0:
              raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
          return data_dict
@@ -356,5 +361,28 @@ class DataAdapter(ABC):
          """
          raise NotImplementedError

-     def llm_match(self, *args, **kwargs):
-         pass
+     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
+         """
+         Use LLM as a judge to evaluate the predicted answer against the gold answer.
+
+         Args:
+             gold (Any): The golden answer.
+             pred (Any): The predicted answer.
+
+         Returns:
+             The match result as a float score between 0 and 1.
+         """
+         # Default judge handling
+         if judge is None:
+             logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
+             return 0
+
+         # Extract question from raw_input if available
+         raw_input = kwargs.get('raw_input', {})
+         question_keys = ['question', 'prompt', 'query', 'problem']
+         question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
+
+         # Request judge and obtain score
+         prompt = judge.build_prompt(pred, gold, question)
+         score = judge(prompt)
+         return judge.get_score(score)
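
The default `llm_match` above relies on exactly three calls on the judge object: `build_prompt(pred, gold, question)`, `judge(prompt)` and `get_score(response)`. The real `LLMJudge` lives in the new `evalscope/metrics/llm_judge.py`, whose body is not shown in this diff, so the sketch below is a hypothetical stand-in exposing only those calls, e.g. for exercising an adapter without a judge model:

    class FakeJudge:
        """Hypothetical stand-in mirroring only the interface used by DataAdapter.llm_match."""

        def build_prompt(self, pred, gold, question=None):
            return f'Question: {question}\nGold: {gold}\nPrediction: {pred}\nReply with a score between 0 and 1.'

        def __call__(self, prompt, system_prompt=None):
            # A real judge would send the prompt to an LLM here.
            return '1.0'

        def get_score(self, response):
            return float(response.strip())

    judge = FakeJudge()
    prompt = judge.build_prompt(pred='Paris', gold='Paris', question='What is the capital of France?')
    print(judge.get_score(judge(prompt)))  # 1.0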
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,12 +1,10 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import glob
  import os.path
  from collections import defaultdict
  from typing import List

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

@@ -133,7 +131,10 @@ class GeneralQAAdapter(DataAdapter):
          """
          items = defaultdict(list)
          for scores in review_res_list:
-             for k, v in scores.items():
-                 items[k].append(v)
+             if isinstance(scores, dict):
+                 for k, v in scores.items():
+                     items[k].append(v)
+             else:
+                 items['AverageAccuracy'].append(scores)
          # items = [(score, 1.0) for score in review_res_list]
          return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
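
With the new branch, a review result may be either a per-metric dict or a bare float, and both shapes fold into per-metric means. A standalone illustration of the same logic (values are made up):

    from collections import defaultdict

    review_res_list = [{'Rouge-L': 0.42, 'bleu-4': 0.31}, 1.0, 0.5]

    items = defaultdict(list)
    for scores in review_res_list:
        if isinstance(scores, dict):
            for k, v in scores.items():
                items[k].append(v)
        else:
            # bare floats are pooled under a single accuracy metric
            items['AverageAccuracy'].append(scores)

    print([{'metric_name': k, 'score': sum(v) / len(v), 'num': len(v)} for k, v in items.items()])
    # AverageAccuracy averages the two bare floats to 0.75; Rouge-L and bleu-4 keep their single values.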
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -19,15 +19,14 @@ logger = get_logger()
      name='hellaswag',
      pretty_name='HellaSwag',
      dataset_id='modelscope/hellaswag',
-     model_adapter=OutputType.CONTINUOUS,
-     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
+     model_adapter=OutputType.MULTIPLE_CHOICE,
+     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=['default'],
      metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split='train',
      eval_split='validation',
-     prompt_template=
-     'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.', # noqa: E501
+     prompt_template='{query}', # noqa: E501
  )
  class HellaSwagAdapter(DataAdapter):

@@ -39,7 +38,7 @@ class HellaSwagAdapter(DataAdapter):
              kwargs['few_shot_num'] = 0

          super().__init__(**kwargs)
-         self.choices = ['0', '1', '2', '3']
+         self.choices = ['A', 'B', 'C', 'D']

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -85,15 +84,14 @@ class HellaSwagAdapter(DataAdapter):
              self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
          ]
          context: str = '\n'.join(few_shot_prompts) + '\n'
-         context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
+         query = context.strip() + self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)

-         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
-
-         return self.gen_prompt_data(ctx_continuation_pair_list)
+         full_prompt = self.prompt_template.format(query=query)
+         return self.gen_prompt_data(full_prompt)

      def get_gold_answer(self, input_d: dict) -> str:
-         # Get the gold choice
-         return input_d['label']
+         # Get the gold choice from the label
+         return self.choices[int(input_d['label'])]

      def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
@@ -107,30 +105,22 @@ class HellaSwagAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if self.model_adapter == OutputType.CONTINUOUS:
-             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
-             result = np.array(result)
-             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
-             completion_len = np.array([float(len(i)) for i in endings])
-             best_choice_idx = np.argmax(result / completion_len)
-
-             return str(best_choice_idx)
+         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+             return result
          else:
              return ResponseParser.parse_first_option(result)

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=str(gold), pred=str(pred))

-     @classmethod
-     def _preprocess(cls, text):
+     def _preprocess(self, text):
          text = text.strip()
          text = text.replace(' [title]', '. ')
          text = re.sub('\\[.*?\\]', '', text)
          text = text.replace('  ', ' ')
          return text

-     @classmethod
-     def _generate_prompt(cls, input_d: dict, endings: list, include_answer=True) -> str:
+     def _generate_prompt(self, input_d: dict, endings: list, include_answer=True) -> str:
          """
          Generate prompt for HellaSwag dataset.

@@ -144,7 +134,13 @@
          """

          ctx = input_d['ctx_a'] + ' ' + input_d['ctx_b'].capitalize()
-         example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+         # example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+         example: str = self._preprocess(ctx)
+
+         example += '\nQuestion: Which ending makes the most sense?'
+         for i, ending in enumerate(endings):
+             example += f'\n{self.choices[i]}. {ending}'
+         example += '\nYou may choose from A, B, C, D. Derive your final answer as `The answer is ...`.'

          if include_answer:
              example += '{}\n\n'.format(endings[int(input_d['label'])])
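
For reference, this is what the new generation-mode prompt looks like: the snippet below copies the formatting logic from `_generate_prompt` above (minus the `_preprocess` cleanup), using an illustrative context and endings:

    choices = ['A', 'B', 'C', 'D']
    ctx = 'A man is sitting on a roof. He'
    endings = [
        'is using wrap to wrap a pair of skis.',
        'is ripping level tiles off.',
        "is holding a rubik's cube.",
        'starts pulling up roofing on a roof.',
    ]

    example = ctx + '\nQuestion: Which ending makes the most sense?'
    for i, ending in enumerate(endings):
        example += f'\n{choices[i]}. {ending}'
    example += '\nYou may choose from A, B, C, D. Derive your final answer as `The answer is ...`.'
    print(example)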
{evalscope-0.12.1 → evalscope-0.13.0}/evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -2,7 +2,6 @@
  import re

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -21,6 +20,10 @@ logger = get_logger()
      train_split=None,
      eval_split='test',
      prompt_template='Complete the following python code:\n{query}',
+     extra_params={
+         'num_workers': 4,
+         'timeout': 4
+     },
  )
  class HumanevalAdapter(DataAdapter):
      """
@@ -35,17 +38,17 @@ class HumanevalAdapter(DataAdapter):
              raise ImportError('Please install human_eval:'
                                'https://github.com/openai/human-eval/tree/master#installation , '
                                'Note that you need to enable the execution code in the human_eval/execution.py first.')
+         super().__init__(**kwargs)

+         extra_params = kwargs.get('extra_params', {})
          self.k = [1]
-         self.num_workers = 4
-         self.timeout = 4.0
+         self.num_workers = extra_params.get('num_workers', 4)
+         self.timeout = extra_params.get('timeout', 4)

          self.read_problems_func = stream_jsonl
          self.write_jsonl_func = write_jsonl
          self.eval_func = check_correctness

-         super().__init__(**kwargs)
-
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
          for subset_name in subset_list:
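
A hedged sketch of overriding the new HumanEval execution knobs from a task config: `dataset_args` is the per-dataset override hook in `TaskConfig`, and routing `extra_params` through it is an assumption based on the new `BenchmarkMeta.extra_params` field and the `kwargs.get('extra_params', {})` lookup above; the model id is illustrative.

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-Coder-7B-Instruct',   # illustrative model id
        datasets=['humaneval'],
        dataset_args={
            'humaneval': {
                'extra_params': {
                    'num_workers': 8,   # parallel sandboxed executions (default 4)
                    'timeout': 10,      # seconds allowed per generated program (default 4)
                },
            },
        },
    )
    run_task(task_cfg=task_cfg)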