evalscope 0.13.0__tar.gz → 0.13.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (360)
  1. {evalscope-0.13.0/evalscope.egg-info → evalscope-0.13.2}/PKG-INFO +42 -78
  2. {evalscope-0.13.0 → evalscope-0.13.2}/README.md +33 -29
  3. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/arguments.py +1 -1
  4. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/llm.py +4 -5
  5. evalscope-0.13.2/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  6. evalscope-0.13.2/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope-0.13.2/evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/data_adapter.py +26 -2
  11. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope-0.13.2/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  16. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  17. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/collections/evaluator.py +1 -1
  18. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/config.py +6 -3
  19. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/constants.py +1 -0
  20. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/evaluator/evaluator.py +5 -4
  21. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/llm_judge.py +1 -1
  22. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/chat_adapter.py +32 -11
  23. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/custom_adapter.py +1 -1
  24. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/arguments.py +19 -46
  25. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/benchmark.py +64 -90
  26. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/main.py +1 -1
  27. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/api/openai_api.py +4 -2
  28. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/__init__.py +1 -0
  29. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/openqa.py +6 -11
  30. evalscope-0.13.2/evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  31. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  32. evalscope-0.13.2/evalscope/perf/utils/__init__.py +0 -0
  33. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/utils/db_util.py +5 -2
  34. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/run.py +14 -2
  35. evalscope-0.13.2/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  36. evalscope-0.13.2/evalscope/version.py +4 -0
  37. {evalscope-0.13.0 → evalscope-0.13.2/evalscope.egg-info}/PKG-INFO +42 -78
  38. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/SOURCES.txt +8 -2
  39. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/requires.txt +8 -49
  40. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/framework.txt +0 -12
  41. evalscope-0.13.2/requirements/rag.txt +7 -0
  42. {evalscope-0.13.0 → evalscope-0.13.2}/setup.py +0 -1
  43. {evalscope-0.13.0 → evalscope-0.13.2}/tests/cli/test_all.py +33 -24
  44. {evalscope-0.13.0 → evalscope-0.13.2}/tests/cli/test_run.py +69 -22
  45. {evalscope-0.13.0 → evalscope-0.13.2}/tests/perf/test_perf.py +23 -0
  46. evalscope-0.13.2/tests/rag/__init__.py +0 -0
  47. {evalscope-0.13.0 → evalscope-0.13.2}/tests/rag/test_ragas.py +4 -1
  48. evalscope-0.13.0/evalscope/version.py +0 -4
  49. evalscope-0.13.0/requirements/inner.txt +0 -25
  50. evalscope-0.13.0/requirements/rag.txt +0 -3
  51. evalscope-0.13.0/requirements/tests.txt +0 -5
  52. {evalscope-0.13.0 → evalscope-0.13.2}/LICENSE +0 -0
  53. {evalscope-0.13.0 → evalscope-0.13.2}/MANIFEST.in +0 -0
  54. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/__init__.py +0 -0
  55. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/__init__.py +0 -0
  56. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/base.py +0 -0
  57. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/__init__.py +0 -0
  58. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  59. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/backend_manager.py +0 -0
  60. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  61. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  62. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  63. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/__init__.py +0 -0
  64. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  65. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  66. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  67. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  68. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  69. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  70. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  71. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  72. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  73. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  74. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  75. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  76. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  77. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  78. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  79. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  80. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  81. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  82. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  83. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  84. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  85. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  86. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  87. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  88. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  89. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  90. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  91. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  92. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  93. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  94. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  95. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  96. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  97. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  98. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  99. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  100. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  101. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  102. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  103. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/__init__.py +0 -0
  104. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/aime/__init__.py +0 -0
  105. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  106. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  107. {evalscope-0.13.0/evalscope/benchmarks/chinese_simple_qa → evalscope-0.13.2/evalscope/benchmarks/alpaca_eval}/__init__.py +0 -0
  108. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/arc/__init__.py +0 -0
  109. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  110. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  111. {evalscope-0.13.0/evalscope/benchmarks/data_collection → evalscope-0.13.2/evalscope/benchmarks/arena_hard}/__init__.py +0 -0
  112. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
  113. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  114. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  115. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  116. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  117. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  118. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  119. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  120. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  121. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  122. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  123. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  124. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  125. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  126. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  127. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  128. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  129. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  130. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  131. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  132. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  133. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  134. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  135. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  136. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  137. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  138. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  139. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  140. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  141. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/benchmark.py +0 -0
  142. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
  143. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  144. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  145. {evalscope-0.13.0/evalscope/benchmarks/general_mcq → evalscope-0.13.2/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  146. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  147. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  148. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  149. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  150. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  151. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  152. {evalscope-0.13.0/evalscope/benchmarks/gpqa → evalscope-0.13.2/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  153. {evalscope-0.13.0/evalscope/benchmarks/ifeval → evalscope-0.13.2/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  154. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  155. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  156. {evalscope-0.13.0/evalscope/benchmarks/iquiz → evalscope-0.13.2/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  157. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  158. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  159. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  160. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  161. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  162. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  163. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  164. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  165. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  166. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  167. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  168. {evalscope-0.13.0/evalscope/benchmarks/live_code_bench → evalscope-0.13.2/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  169. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  170. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  171. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  172. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/ifeval/utils.py +0 -0
  173. {evalscope-0.13.0/evalscope/benchmarks/math_500 → evalscope-0.13.2/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  174. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  175. {evalscope-0.13.0/evalscope/benchmarks/mmlu_pro → evalscope-0.13.2/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  176. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  177. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/execute_utils.py +0 -0
  178. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  179. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
  180. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  181. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  182. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  183. {evalscope-0.13.0/evalscope/benchmarks/musr → evalscope-0.13.2/evalscope/benchmarks/math_500}/__init__.py +0 -0
  184. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  185. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  186. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  187. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  188. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  189. {evalscope-0.13.0/evalscope/benchmarks/process_bench → evalscope-0.13.2/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  190. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  191. {evalscope-0.13.0/evalscope/benchmarks/simple_qa → evalscope-0.13.2/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  192. {evalscope-0.13.0/evalscope/benchmarks/super_gpqa → evalscope-0.13.2/evalscope/benchmarks/musr}/__init__.py +0 -0
  193. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  194. {evalscope-0.13.0/evalscope/perf → evalscope-0.13.2/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  195. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  196. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  197. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/race/__init__.py +0 -0
  198. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/race/race.py +0 -0
  199. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/race/race_adapter.py +0 -0
  200. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/race/samples.jsonl +0 -0
  201. {evalscope-0.13.0/evalscope/perf/utils → evalscope-0.13.2/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  202. {evalscope-0.13.0/evalscope/third_party/thinkbench/tools → evalscope-0.13.2/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  203. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  204. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  205. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  206. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  207. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  208. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  209. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  210. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  211. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  212. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  213. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  214. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/benchmarks/utils.py +0 -0
  215. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/__init__.py +0 -0
  216. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/base.py +0 -0
  217. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/cli.py +0 -0
  218. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/start_app.py +0 -0
  219. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/start_eval.py +0 -0
  220. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/start_perf.py +0 -0
  221. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/cli/start_server.py +0 -0
  222. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/collections/__init__.py +0 -0
  223. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/collections/sampler.py +0 -0
  224. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/collections/schema.py +0 -0
  225. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/evaluator/__init__.py +0 -0
  226. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/evaluator/rating_eval.py +0 -0
  227. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
  228. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  229. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/__init__.py +0 -0
  230. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  231. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  232. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/code_metric.py +0 -0
  233. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/math_parser.py +0 -0
  234. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/metrics.py +0 -0
  235. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/named_metrics.py +0 -0
  236. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  237. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  238. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/metrics/rouge_metric.py +0 -0
  239. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/__init__.py +0 -0
  240. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/base_adapter.py +0 -0
  241. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/choice_adapter.py +0 -0
  242. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/custom/__init__.py +0 -0
  243. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/custom/custom_model.py +0 -0
  244. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/custom/dummy_model.py +0 -0
  245. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/local_model.py +0 -0
  246. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/model.py +0 -0
  247. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/register.py +0 -0
  248. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/models/server_adapter.py +0 -0
  249. {evalscope-0.13.0/tests/rag → evalscope-0.13.2/evalscope/perf}/__init__.py +0 -0
  250. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/http_client.py +0 -0
  251. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/__init__.py +0 -0
  252. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/api/__init__.py +0 -0
  253. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/api/base.py +0 -0
  254. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
  255. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  256. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/base.py +0 -0
  257. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/custom.py +0 -0
  258. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  259. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  260. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  261. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/plugin/registry.py +0 -0
  262. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/utils/analysis_result.py +0 -0
  263. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/utils/benchmark_util.py +0 -0
  264. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/utils/handler.py +0 -0
  265. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/perf/utils/local_server.py +0 -0
  266. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/__init__.py +0 -0
  267. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/config/cfg_arena.yaml +0 -0
  268. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  269. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  270. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/config/cfg_single.yaml +0 -0
  271. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  272. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  273. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  274. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  275. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/data/question.jsonl +0 -0
  276. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/arc.yaml +0 -0
  277. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/bbh.yaml +0 -0
  278. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  279. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/ceval.yaml +0 -0
  280. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  281. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  282. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  283. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
  284. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  285. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
  286. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  287. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/report/__init__.py +0 -0
  288. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/report/app.py +0 -0
  289. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/report/combinator.py +0 -0
  290. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/report/generator.py +0 -0
  291. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/report/utils.py +0 -0
  292. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/run_arena.py +0 -0
  293. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/summarizer.py +0 -0
  294. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/__init__.py +0 -0
  295. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/README.md +0 -0
  296. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
  297. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
  298. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  299. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/eval.py +0 -0
  300. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/infer.py +0 -0
  301. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  302. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  303. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  304. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  305. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  306. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  307. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  308. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  309. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  310. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/longbench_write/utils.py +0 -0
  311. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/__init__.py +0 -0
  312. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/eval.py +0 -0
  313. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/infer.py +0 -0
  314. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  315. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  316. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  317. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  318. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/README.md +0 -0
  319. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  320. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  321. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  322. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
  323. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
  324. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  325. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  326. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  327. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  328. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/__init__.py +0 -0
  329. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/arena_utils.py +0 -0
  330. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/chat_service.py +0 -0
  331. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/completion_parsers.py +0 -0
  332. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/filters.py +0 -0
  333. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/io_utils.py +0 -0
  334. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/logger.py +0 -0
  335. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/model_utils.py +0 -0
  336. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope/utils/utils.py +0 -0
  337. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/dependency_links.txt +0 -0
  338. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/entry_points.txt +0 -0
  339. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/not-zip-safe +0 -0
  340. {evalscope-0.13.0 → evalscope-0.13.2}/evalscope.egg-info/top_level.txt +0 -0
  341. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/app.txt +0 -0
  342. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/docs.txt +0 -0
  343. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/opencompass.txt +0 -0
  344. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/perf.txt +0 -0
  345. {evalscope-0.13.0 → evalscope-0.13.2}/requirements/vlmeval.txt +0 -0
  346. {evalscope-0.13.0 → evalscope-0.13.2}/requirements.txt +0 -0
  347. {evalscope-0.13.0 → evalscope-0.13.2}/setup.cfg +0 -0
  348. {evalscope-0.13.0 → evalscope-0.13.2}/tests/__init__.py +0 -0
  349. {evalscope-0.13.0 → evalscope-0.13.2}/tests/cli/__init__.py +0 -0
  350. {evalscope-0.13.0 → evalscope-0.13.2}/tests/cli/test_collection.py +0 -0
  351. {evalscope-0.13.0 → evalscope-0.13.2}/tests/perf/__init__.py +0 -0
  352. {evalscope-0.13.0 → evalscope-0.13.2}/tests/rag/test_clip_benchmark.py +0 -0
  353. {evalscope-0.13.0 → evalscope-0.13.2}/tests/rag/test_mteb.py +0 -0
  354. {evalscope-0.13.0 → evalscope-0.13.2}/tests/swift/__init__.py +0 -0
  355. {evalscope-0.13.0 → evalscope-0.13.2}/tests/swift/test_run_swift_eval.py +0 -0
  356. {evalscope-0.13.0 → evalscope-0.13.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  357. {evalscope-0.13.0 → evalscope-0.13.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  358. {evalscope-0.13.0 → evalscope-0.13.2}/tests/test_run_all.py +0 -0
  359. {evalscope-0.13.0 → evalscope-0.13.2}/tests/vlm/__init__.py +0 -0
  360. {evalscope-0.13.0 → evalscope-0.13.2}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.13.0
+ Version: 0.13.2
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: absl-py
  Requires-Dist: accelerate
- Requires-Dist: cachetools
  Requires-Dist: datasets<=3.2.0,>=3.0.0
- Requires-Dist: editdistance
  Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
@@ -31,33 +28,29 @@ Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
  Requires-Dist: openai
  Requires-Dist: pandas
- Requires-Dist: plotly
  Requires-Dist: pyarrow
- Requires-Dist: pympler
  Requires-Dist: pyyaml
- Requires-Dist: regex
  Requires-Dist: requests
- Requires-Dist: requests-toolbelt
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
  Requires-Dist: sacrebleu
  Requires-Dist: scikit-learn
  Requires-Dist: seaborn
- Requires-Dist: sentencepiece
- Requires-Dist: simple-ddl-parser
  Requires-Dist: sympy
  Requires-Dist: tabulate
- Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
- Requires-Dist: transformers_stream_generator
  Requires-Dist: word2number
  Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
  Provides-Extra: rag
+ Requires-Dist: langchain<0.3.0; extra == "rag"
+ Requires-Dist: langchain-community<0.3.0; extra == "rag"
+ Requires-Dist: langchain-core<0.3.0; extra == "rag"
+ Requires-Dist: langchain-openai<0.3.0; extra == "rag"
  Requires-Dist: mteb==1.19.4; extra == "rag"
  Requires-Dist: ragas==0.2.9; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
@@ -71,38 +64,9 @@ Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
- Provides-Extra: inner
- Requires-Dist: absl-py; extra == "inner"
- Requires-Dist: accelerate; extra == "inner"
- Requires-Dist: alibaba_itag_sdk; extra == "inner"
- Requires-Dist: dashscope; extra == "inner"
- Requires-Dist: editdistance; extra == "inner"
- Requires-Dist: jsonlines; extra == "inner"
- Requires-Dist: nltk; extra == "inner"
- Requires-Dist: openai; extra == "inner"
- Requires-Dist: pandas==1.5.3; extra == "inner"
- Requires-Dist: plotly; extra == "inner"
- Requires-Dist: pyarrow; extra == "inner"
- Requires-Dist: pyodps; extra == "inner"
- Requires-Dist: pyyaml; extra == "inner"
- Requires-Dist: regex; extra == "inner"
- Requires-Dist: requests==2.28.1; extra == "inner"
- Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
- Requires-Dist: rouge-score; extra == "inner"
- Requires-Dist: sacrebleu; extra == "inner"
- Requires-Dist: scikit-learn; extra == "inner"
- Requires-Dist: seaborn; extra == "inner"
- Requires-Dist: simple-ddl-parser; extra == "inner"
- Requires-Dist: streamlit; extra == "inner"
- Requires-Dist: tqdm; extra == "inner"
- Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
- Requires-Dist: transformers_stream_generator; extra == "inner"
  Provides-Extra: all
- Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
- Requires-Dist: cachetools; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
- Requires-Dist: editdistance; extra == "all"
  Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
@@ -113,30 +77,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
- Requires-Dist: plotly; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pympler; extra == "all"
  Requires-Dist: pyyaml; extra == "all"
- Requires-Dist: regex; extra == "all"
  Requires-Dist: requests; extra == "all"
- Requires-Dist: requests-toolbelt; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
  Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
- Requires-Dist: sentencepiece; extra == "all"
- Requires-Dist: simple-ddl-parser; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
- Requires-Dist: tiktoken; extra == "all"
  Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
- Requires-Dist: transformers_stream_generator; extra == "all"
  Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+ Requires-Dist: langchain<0.3.0; extra == "all"
+ Requires-Dist: langchain-community<0.3.0; extra == "all"
+ Requires-Dist: langchain-core<0.3.0; extra == "all"
+ Requires-Dist: langchain-openai<0.3.0; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
  Requires-Dist: ragas==0.2.9; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
@@ -239,7 +199,9 @@ Please scan the QR code below to join our community groups:
 
  ## 🎉 News
 
- - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+ - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +239,24 @@ Please scan the QR code below to join our community groups:
  We recommend using conda to manage your environment and installing dependencies with pip:
 
  1. Create a conda environment (optional)
- ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
- ```
+ ```shell
+ # It is recommended to use Python 3.10
+ conda create -n evalscope python=3.10
+ # Activate the conda environment
+ conda activate evalscope
+ ```
 
  2. Install dependencies using pip
- ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install evalscope[opencompass] # Install OpenCompass backend
- pip install evalscope[vlmeval] # Install VLMEvalKit backend
- pip install evalscope[rag] # Install RAGEval backend
- pip install evalscope[perf] # Install Perf dependencies
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ pip install evalscope # Install Native backend (default)
+ # Additional options
+ pip install 'evalscope[opencompass]' # Install OpenCompass backend
+ pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+ pip install 'evalscope[rag]' # Install RAGEval backend
+ pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+ pip install 'evalscope[app]' # Install dependencies for visualization
+ pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```
 
  > [!WARNING]
  > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +270,22 @@ We recommend using conda to manage your environment and installing dependencies
 
  ### Method 2: Install from Source
  1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
 
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+ pip install -e . # Install Native backend
+ # Additional options
+ pip install -e '.[opencompass]' # Install OpenCompass backend
+ pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+ pip install -e '.[rag]' # Install RAGEval backend
+ pip install -e '.[perf]' # Install Perf dependencies
+ pip install -e '.[app]' # Install visualization dependencies
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```
 
 
  ## 🚀 Quick Start
@@ -88,7 +88,9 @@ Please scan the QR code below to join our community groups:
 
  ## 🎉 News
 
- - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
+ - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+ - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -126,23 +128,24 @@ Please scan the QR code below to join our community groups:
  We recommend using conda to manage your environment and installing dependencies with pip:
 
  1. Create a conda environment (optional)
- ```shell
- # It is recommended to use Python 3.10
- conda create -n evalscope python=3.10
- # Activate the conda environment
- conda activate evalscope
- ```
+ ```shell
+ # It is recommended to use Python 3.10
+ conda create -n evalscope python=3.10
+ # Activate the conda environment
+ conda activate evalscope
+ ```
 
  2. Install dependencies using pip
- ```shell
- pip install evalscope # Install Native backend (default)
- # Additional options
- pip install evalscope[opencompass] # Install OpenCompass backend
- pip install evalscope[vlmeval] # Install VLMEvalKit backend
- pip install evalscope[rag] # Install RAGEval backend
- pip install evalscope[perf] # Install Perf dependencies
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ pip install evalscope # Install Native backend (default)
+ # Additional options
+ pip install 'evalscope[opencompass]' # Install OpenCompass backend
+ pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+ pip install 'evalscope[rag]' # Install RAGEval backend
+ pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+ pip install 'evalscope[app]' # Install dependencies for visualization
+ pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```
 
  > [!WARNING]
  > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -156,21 +159,22 @@ We recommend using conda to manage your environment and installing dependencies
 
  ### Method 2: Install from Source
  1. Download the source code
- ```shell
- git clone https://github.com/modelscope/evalscope.git
- ```
+ ```shell
+ git clone https://github.com/modelscope/evalscope.git
+ ```
 
  2. Install dependencies
- ```shell
- cd evalscope/
- pip install -e . # Install Native backend
- # Additional options
- pip install -e '.[opencompass]' # Install OpenCompass backend
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[rag]' # Install RAGEval backend
- pip install -e '.[perf]' # Install Perf dependencies
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
- ```
+ ```shell
+ cd evalscope/
+ pip install -e . # Install Native backend
+ # Additional options
+ pip install -e '.[opencompass]' # Install OpenCompass backend
+ pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+ pip install -e '.[rag]' # Install RAGEval backend
+ pip install -e '.[perf]' # Install Perf dependencies
+ pip install -e '.[app]' # Install visualization dependencies
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+ ```
 
 
  ## 🚀 Quick Start
@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # LLMJudge arguments
  parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
  parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
- parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
+ parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
  # yapf: enable
 
 
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
  from typing import Any, Dict, Iterator, List, Mapping, Optional
 
  from evalscope.constants import DEFAULT_MODEL_REVISION
- from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.models import ChatGenerationModelAdapter, LocalModel
 
 
  class LLM:
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
  super().__init__(**kw)
  self.model_name = os.path.basename(self.model_name_or_path)
  self.model = ChatGenerationModelAdapter(
- model_id=self.model_name_or_path,
- model_revision=self.model_revision,
+ model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
  generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
  )
 
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
  """Run the LLM on the given input."""
  infer_cfg = {'stop': stop}
 
- response = self.model._model_generate(prompt, infer_cfg)
- return response
+ response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+ return response[0][0]
 
  @property
  def _identifying_params(self) -> Dict[str, Any]:
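The llm.py change above reflects the revised model-adapter API in 0.13.2: `ChatGenerationModelAdapter` now wraps a `LocalModel` instead of taking `model_id`/`model_revision` directly, and `_model_generate` accepts a batch of prompts and returns a `(responses, extra)` pair. A minimal sketch of that pattern, with a placeholder model id, revision, prompt, and generation settings:

```python
from modelscope.utils.hf_util import GenerationConfig  # same import used in llm.py

from evalscope.models import ChatGenerationModelAdapter, LocalModel

# New pattern: wrap the local checkpoint in LocalModel, then hand it to the adapter.
adapter = ChatGenerationModelAdapter(
    model=LocalModel(model_id='Qwen/Qwen2.5-0.5B-Instruct', model_revision='master'),  # placeholder id/revision
    generation_config=GenerationConfig(max_new_tokens=256),  # placeholder settings
)

# _model_generate takes a list of prompts and returns (responses, extra); the first
# completion for the first prompt is at responses[0][0], as in LocalLLM._call above.
responses, _ = adapter._model_generate(['Hello, who are you?'], infer_cfg={'stop': None})
print(responses[0][0])
```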
--- /dev/null
+++ b/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
@@ -0,0 +1,109 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
+
+GRADER_TEMPLATE = """
+I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+
+## Instruction
+
+{{
+    "instruction": "{instruction}"
+}}
+
+## Model Outputs
+
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+
+{{
+    {{
+        "model_identifier": "m",
+        "output": "{output_1}"
+    }},
+    {{
+        "model_identifier": "M",
+        "output": "{output_2}"
+    }}
+}}
+
+## Task
+
+Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
+
+## Best Model Identifier
+""".strip() # noqa: E501
+
+
+@Benchmark.register(
+    name='alpaca_eval',
+    pretty_name='AlpacaEval2.0',
+    dataset_id='AI-ModelScope/alpaca_eval',
+    subset_list=['alpaca_eval_gpt4_baseline'],
+    metric_list=['winrate'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='eval')
+class AlpacaEvalAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # register metrics
+        metric_registry.register(Metric(name='winrate', object=mean))
+
+        # whether to use LLM as a judge
+        self.llm_as_a_judge = True
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['instruction']
+        return self.gen_prompt_data(question)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['output']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str):
+        # simple match
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
+        return None
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
+        raw_input = kwargs.get('raw_input', None)
+        instruction = raw_input['instruction']
+        # gold is baseline answer 'm', pred is model answer 'M'
+        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
+        # get grading response
+        grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
+        # parse grading response
+        match = re.search(r'(m|M)', grading_response)
+        res = match.group(0) if match else None
+        if res:
+            return res == 'M'
+        else:
+            logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
+            return None
+
+    def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
+        """
+        Compute the win rate over all judged samples.
+
+        Args:
+            review_res_list: list of booleans, where True means the candidate model's
+                output was preferred over the baseline; None entries (unparseable
+                judgments) are dropped.
+        """
+        # drop samples whose judgment could not be parsed
+        res_list = [res for res in review_res_list if res is not None]
+
+        return super().compute_metric(res_list, **kwargs)
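To make the adapter's judgment-to-metric flow concrete, here is a small standalone sketch (not evalscope API): the judge's single-character m/M verdict becomes a boolean win flag, and the registered 'winrate' metric is simply the mean of those flags. judge_fn is a hypothetical stand-in for the LLMJudge call, and GRADER_TEMPLATE is assumed to be the template defined in the file above.

import re
from typing import Callable, List, Optional


def judge_pair(instruction: str, baseline: str, candidate: str,
               judge_fn: Callable[[str], str]) -> Optional[bool]:
    # Build the pairwise grading prompt: baseline output is 'm', candidate output is 'M'.
    prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=baseline, output_2=candidate)
    verdict = judge_fn(prompt)  # the adapter calls LLMJudge with GRADER_SYSTEM_PROMPT here
    match = re.search(r'(m|M)', verdict)
    return match.group(0) == 'M' if match else None  # True -> candidate preferred


def winrate(flags: List[Optional[bool]]) -> float:
    kept = [f for f in flags if f is not None]  # unparseable judgments are dropped
    return sum(kept) / len(kept) if kept else 0.0


# e.g. winrate([True, False, True, None]) == 2 / 3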
--- /dev/null
+++ b/evalscope/benchmarks/arena_hard/arena_hard_adapter.py
@@ -0,0 +1,120 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+
+# flake8: noqa
+
+logger = get_logger()
+
+GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
+
+GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip() # noqa: E501
+
+
+@Benchmark.register(
+    name='arena_hard',
+    pretty_name='ArenaHard',
+    dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
+    metric_list=['winrate'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test')
+class ArenaHardAdapter(DataAdapter):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # register metrics
+        metric_registry.register(Metric(name='winrate', object=mean))
+
+        # whether to use LLM as a judge
+        self.llm_as_a_judge = True
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        return self.gen_prompt_data(question)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d['prediction']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
+        return result.strip()
+
+    def match(self, gold: str, pred: str):
+        # simple match
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
+        return None
+
+    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
+        from .utils import post_process_arenahard
+
+        raw_input = kwargs.get('raw_input', None)
+        question = raw_input['question']
+        # gold is baseline answer 'A', pred is model answer 'B'
+        prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=gold, answer_2=pred)
+        # reverse the order
+        prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=pred, answer_2=gold)
+        # get grading response
+        game1_response = judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
+        game2_response = judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
+        # parse grading response
+        res1 = post_process_arenahard(game1_response)
+        res2 = post_process_arenahard(game2_response)
+        return {
+            'model_a': 'gpt4-0314',
+            'model_b': 'test_model',
+            'games': [
+                {
+                    'user_prompt': prompt1,
+                    'judgment': game1_response,
+                    'score': res1
+                },
+                {
+                    'user_prompt': prompt2,
+                    'judgment': game2_response,
+                    'score': res2
+                },
+            ]
+        }
+
+    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+        """
+        Compute the model's win rate against the gpt4-0314 baseline.
+        """
+        import pandas as pd
+
+        from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
+
+        if isinstance(review_res_list[0], list):
+            review_res_list = [item for sublist in review_res_list for item in sublist]
+
+        battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+
+        bootstrap_online_elo = compute_mle_elo(battles)
+
+        # bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
+        stats = pd.DataFrame()
+        stats['results'] = None
+        stats['results'] = stats['results'].astype('object')
+
+        for i, model in enumerate(bootstrap_online_elo.index):
+            # assert model in bootstrap_elo_lu.columns
+            stats.at[i, 'model'] = model
+            stats.at[i, 'score'] = bootstrap_online_elo[model]
+            # stats.at[i, "lower"] = np.percentile(bootstrap_elo_lu[model], 2.5)
+            # stats.at[i, "upper"] = np.percentile(bootstrap_elo_lu[model], 97.5)
+
+        # stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
+
+        score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']
+
+        return [{'metric_name': 'winrate', 'score': score, 'num': len(review_res_list)}]
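The winrate reported here comes from a Bradley-Terry (MLE Elo) fit over the pairwise game records; the actual implementations are get_battles_from_row, compute_mle_elo and get_win_rate_column in the newly added arena_hard/utils.py. The sketch below only illustrates the underlying idea under simplifying assumptions (ties ignored, generic regularization); its function names are hypothetical and it is not the package code.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def fit_bt_ratings(battles: pd.DataFrame, scale: float = 400, base: float = 10,
                   init: float = 1000) -> pd.Series:
    # battles has columns model_a, model_b, winner ('model_a' or 'model_b');
    # assumes both outcomes occur at least once; ties are ignored for brevity.
    models = pd.unique(battles[['model_a', 'model_b']].values.ravel())
    idx = {m: i for i, m in enumerate(models)}
    X = np.zeros((len(battles), len(models)))
    rows = np.arange(len(battles))
    X[rows, battles['model_a'].map(idx).to_numpy()] = +np.log(base)
    X[rows, battles['model_b'].map(idx).to_numpy()] = -np.log(base)
    y = (battles['winner'] == 'model_a').astype(int).to_numpy()
    # No intercept; a large C approximates an unregularized Bradley-Terry fit.
    lr = LogisticRegression(fit_intercept=False, C=1e6, max_iter=1000)
    lr.fit(X, y)
    return pd.Series(scale * lr.coef_[0] + init, index=models)


def expected_winrate(ratings: pd.Series, model: str, baseline: str = 'gpt4-0314',
                     scale: float = 400, base: float = 10) -> float:
    # Elo expected score of `model` against `baseline`, i.e. the reported win rate.
    diff = (ratings[model] - ratings[baseline]) / scale
    return 1.0 / (1.0 + base ** (-diff))

With ratings in hand, the utility functions presumably apply the same logistic transform against the gpt4-0314 baseline, which is how the single winrate score for 'test_model' above is produced.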