evalscope 0.9.0__tar.gz → 0.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (312)
  1. {evalscope-0.9.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO +84 -7
  2. {evalscope-0.9.0 → evalscope-0.10.1}/README.md +78 -6
  3. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/arguments.py +1 -0
  4. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/arc_adapter.py +3 -5
  5. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  6. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/benchmark.py +1 -1
  7. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  8. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  9. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  10. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/data_adapter.py +69 -70
  11. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  12. evalscope-0.10.1/evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  13. evalscope-0.10.1/evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  14. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  15. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  16. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  17. evalscope-0.10.1/evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  18. evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions.py +1477 -0
  19. evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  20. evalscope-0.10.1/evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  21. evalscope-0.10.1/evalscope/benchmarks/ifeval/utils.py +134 -0
  22. evalscope-0.10.1/evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  23. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  24. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  25. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race_adapter.py +4 -73
  26. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  27. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  28. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/cli.py +2 -0
  29. evalscope-0.10.1/evalscope/cli/start_app.py +30 -0
  30. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/evaluator.py +82 -62
  31. evalscope-0.10.1/evalscope/collections/sampler.py +138 -0
  32. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/schema.py +14 -10
  33. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/constants.py +4 -0
  34. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/evaluator.py +22 -13
  35. evalscope-0.10.1/evalscope/metrics/__init__.py +4 -0
  36. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/metrics.py +11 -2
  37. evalscope-0.10.1/evalscope/metrics/named_metrics.py +17 -0
  38. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/chat_adapter.py +2 -0
  39. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/server_adapter.py +11 -4
  40. evalscope-0.10.1/evalscope/perf/__init__.py +1 -0
  41. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/main.py +0 -1
  42. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/custom_api.py +1 -1
  43. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/openai_api.py +1 -1
  44. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  45. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  46. evalscope-0.10.1/evalscope/perf/utils/__init__.py +0 -0
  47. evalscope-0.10.1/evalscope/report/__init__.py +5 -0
  48. evalscope-0.10.1/evalscope/report/app.py +693 -0
  49. evalscope-0.10.1/evalscope/report/combinator.py +73 -0
  50. evalscope-0.10.1/evalscope/report/generator.py +80 -0
  51. evalscope-0.10.1/evalscope/report/utils.py +133 -0
  52. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/run.py +16 -11
  53. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/summarizer.py +1 -1
  54. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/chat_service.py +1 -1
  55. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/logger.py +1 -0
  56. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/model_utils.py +5 -2
  57. evalscope-0.10.1/evalscope/version.py +4 -0
  58. {evalscope-0.9.0 → evalscope-0.10.1/evalscope.egg-info}/PKG-INFO +84 -7
  59. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/SOURCES.txt +20 -4
  60. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/requires.txt +6 -0
  61. evalscope-0.10.1/requirements/app.txt +2 -0
  62. {evalscope-0.9.0 → evalscope-0.10.1}/setup.py +2 -0
  63. {evalscope-0.9.0 → evalscope-0.10.1}/tests/cli/test_collection.py +11 -7
  64. {evalscope-0.9.0 → evalscope-0.10.1}/tests/cli/test_run.py +13 -4
  65. evalscope-0.10.1/tests/rag/__init__.py +0 -0
  66. evalscope-0.9.0/evalscope/collections/sampler.py +0 -132
  67. evalscope-0.9.0/evalscope/metrics/__init__.py +0 -7
  68. evalscope-0.9.0/evalscope/tools/combine_reports.py +0 -133
  69. evalscope-0.9.0/evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  70. evalscope-0.9.0/evalscope/version.py +0 -4
  71. evalscope-0.9.0/tests/vlm/__init__.py +0 -1
  72. {evalscope-0.9.0 → evalscope-0.10.1}/LICENSE +0 -0
  73. {evalscope-0.9.0 → evalscope-0.10.1}/MANIFEST.in +0 -0
  74. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/__init__.py +0 -0
  75. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/__init__.py +0 -0
  76. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/base.py +0 -0
  77. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/__init__.py +0 -0
  78. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  79. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/backend_manager.py +0 -0
  80. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  81. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  82. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  83. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  84. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  85. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  86. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  87. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  88. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  89. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  90. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  91. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  92. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  93. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  94. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  95. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  96. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/arguments.py +0 -0
  97. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  98. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/task_template.py +0 -0
  99. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  100. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  101. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +0 -0
  102. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  103. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  104. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  105. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  106. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  107. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  108. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  109. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  110. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  111. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  112. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  113. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  114. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  115. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  116. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  117. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  118. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/embedding.py +0 -0
  119. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  120. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  121. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  122. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  123. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -0
  124. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/__init__.py +0 -0
  125. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  126. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  127. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  128. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  129. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  130. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  131. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  132. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  133. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  134. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  135. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  136. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  137. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  138. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  139. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  140. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  141. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  142. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  143. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  144. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  145. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  146. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  147. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  148. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  149. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  150. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  151. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  152. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  153. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  154. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  155. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  156. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  157. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/samples.jsonl +0 -0
  158. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  159. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  160. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  161. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  162. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  163. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  164. {evalscope-0.9.0/evalscope/benchmarks/mmlu_pro → evalscope-0.10.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  165. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  166. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  167. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  168. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  169. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  170. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  171. {evalscope-0.9.0/evalscope/perf → evalscope-0.10.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  172. {evalscope-0.9.0/evalscope/perf/utils → evalscope-0.10.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  173. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  174. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  175. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  176. {evalscope-0.9.0/tests/rag → evalscope-0.10.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  177. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/__init__.py +0 -0
  178. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/race.py +0 -0
  179. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
  180. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  181. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  182. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  183. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  184. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  185. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/__init__.py +0 -0
  186. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/base.py +0 -0
  187. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_eval.py +0 -0
  188. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_perf.py +0 -0
  189. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/cli/start_server.py +0 -0
  190. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/collections/__init__.py +0 -0
  191. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/config.py +0 -0
  192. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/__init__.py +0 -0
  193. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/rating_eval.py +0 -0
  194. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  195. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  196. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  197. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -0
  198. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/code_metric.py +0 -0
  199. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/math_accuracy.py +0 -0
  200. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -0
  201. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -0
  202. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/metrics/rouge_metric.py +0 -0
  203. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/__init__.py +0 -0
  204. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/base_adapter.py +0 -0
  205. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/choice_adapter.py +0 -0
  206. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom/__init__.py +0 -0
  207. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom/custom_model.py +0 -0
  208. /evalscope-0.9.0/evalscope/tools/rewrite_eval_results.py → /evalscope-0.10.1/evalscope/models/custom/dummy_model.py +0 -0
  209. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/custom_adapter.py +0 -0
  210. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/local_model.py +0 -0
  211. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/models/model.py +0 -0
  212. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/arguments.py +0 -0
  213. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/benchmark.py +0 -0
  214. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/http_client.py +0 -0
  215. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/__init__.py +0 -0
  216. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/__init__.py +0 -0
  217. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/base.py +0 -0
  218. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  219. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  220. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/base.py +0 -0
  221. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/custom.py +0 -0
  222. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  223. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  224. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  225. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/plugin/registry.py +0 -0
  226. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/analysis_result.py +0 -0
  227. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/benchmark_util.py +0 -0
  228. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/db_util.py +0 -0
  229. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/handler.py +0 -0
  230. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/perf/utils/local_server.py +0 -0
  231. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/__init__.py +0 -0
  232. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
  233. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  234. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  235. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/config/cfg_single.yaml +0 -0
  236. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  237. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  238. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  239. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  240. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/data/question.jsonl +0 -0
  241. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/arc.yaml +0 -0
  242. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh.yaml +0 -0
  243. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  244. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval.yaml +0 -0
  245. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  246. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  247. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  248. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  249. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  250. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  251. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  252. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/run_arena.py +0 -0
  253. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/__init__.py +0 -0
  254. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/README.md +0 -0
  255. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  256. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
  257. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  258. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  259. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  260. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  261. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  262. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  263. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  264. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  265. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  266. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  267. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  268. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  269. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  270. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/README.md +0 -0
  271. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  272. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  273. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  274. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  275. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  276. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  277. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  278. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  279. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  280. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/__init__.py +0 -0
  281. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/arena_utils.py +0 -0
  282. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/completion_parsers.py +0 -0
  283. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/io_utils.py +0 -0
  284. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope/utils/utils.py +0 -0
  285. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/dependency_links.txt +0 -0
  286. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/entry_points.txt +0 -0
  287. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/not-zip-safe +0 -0
  288. {evalscope-0.9.0 → evalscope-0.10.1}/evalscope.egg-info/top_level.txt +0 -0
  289. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/docs.txt +0 -0
  290. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/framework.txt +0 -0
  291. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/inner.txt +0 -0
  292. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/opencompass.txt +0 -0
  293. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/perf.txt +0 -0
  294. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/rag.txt +0 -0
  295. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/tests.txt +0 -0
  296. {evalscope-0.9.0 → evalscope-0.10.1}/requirements/vlmeval.txt +0 -0
  297. {evalscope-0.9.0 → evalscope-0.10.1}/requirements.txt +0 -0
  298. {evalscope-0.9.0 → evalscope-0.10.1}/setup.cfg +0 -0
  299. {evalscope-0.9.0/evalscope/tools → evalscope-0.10.1/tests}/__init__.py +0 -0
  300. {evalscope-0.9.0/tests → evalscope-0.10.1/tests/cli}/__init__.py +0 -0
  301. {evalscope-0.9.0/tests/cli → evalscope-0.10.1/tests/perf}/__init__.py +0 -0
  302. {evalscope-0.9.0 → evalscope-0.10.1}/tests/perf/test_perf.py +0 -0
  303. {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_clip_benchmark.py +0 -0
  304. {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_mteb.py +0 -0
  305. {evalscope-0.9.0 → evalscope-0.10.1}/tests/rag/test_ragas.py +0 -0
  306. {evalscope-0.9.0/tests/perf → evalscope-0.10.1/tests/swift}/__init__.py +0 -0
  307. {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_eval.py +0 -0
  308. {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  309. {evalscope-0.9.0 → evalscope-0.10.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  310. {evalscope-0.9.0 → evalscope-0.10.1}/tests/test_run_all.py +0 -0
  311. {evalscope-0.9.0/tests/swift → evalscope-0.10.1/tests/vlm}/__init__.py +0 -0
  312. {evalscope-0.9.0 → evalscope-0.10.1}/tests/vlm/test_vlmeval.py +0 -0
{evalscope-0.9.0/evalscope.egg-info → evalscope-0.10.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.9.0
+ Version: 0.10.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -63,6 +63,9 @@ Requires-Dist: numpy; extra == "perf"
  Requires-Dist: sse_starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
+ Provides-Extra: app
+ Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
@@ -133,6 +136,8 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse_starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: plotly>=5.23.0; extra == "all"

  <p align="center">
  <br>
@@ -210,6 +215,8 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -374,15 +381,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k | AverageAccuracy | default | main | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Easy | 5 | 0.8 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Challenge | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ ```
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
  ```
- +-----------------------+-------------------+-----------------+
- | Model | ai2_arc | gsm8k |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
+ ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
  ```

- ## ⚙️ Complex Evaluation
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation
+
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
{evalscope-0.9.0 → evalscope-0.10.1}/README.md
@@ -74,6 +74,8 @@ Please scan the QR code below to join our community groups:


  ## 🎉 News
+ - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
@@ -238,15 +240,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
+ +=======================+================+=================+=================+===============+=======+=========+
+ | Qwen2.5-0.5B-Instruct | gsm8k | AverageAccuracy | default | main | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Easy | 5 | 0.8 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc | AverageAccuracy | default | ARC-Challenge | 5 | 0.4 |
+ +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
+ ```
+
+ ## 📈 Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
  ```
- +-----------------------+-------------------+-----------------+
- | Model | ai2_arc | gsm8k |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
+ ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
  ```

- ## ⚙️ Complex Evaluation
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [📖 Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+ --model qwen2.5 \
+ --api-url http://127.0.0.1:8801/v1/chat/completions \
+ --api-key EMPTY \
+ --eval-type service \
+ --datasets gsm8k \
+ --limit 10
+ ```
+
+ ## ⚙️ Custom Parameter Evaluation
+
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
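
The API-service evaluation shown in the README diff above can also be driven from Python through `run_task`, which the README already calls with a JSON config. Below is a minimal sketch under the assumption that `run_task` accepts a dict whose keys mirror the CLI flags (`api_url`, `api_key`, `eval_type`, `datasets`, `limit`); these key names are an assumption drawn from the flags, not something this diff confirms.

```python
# Hedged sketch: evaluate an OpenAI-compatible endpoint from Python instead of the CLI.
# Assumes run_task accepts a dict whose keys mirror the CLI flags shown above;
# check the evalscope documentation if the names differ.
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen2.5',                                      # --model
    'api_url': 'http://127.0.0.1:8801/v1/chat/completions',  # --api-url
    'api_key': 'EMPTY',                                      # --api-key
    'eval_type': 'service',                                  # --eval-type
    'datasets': ['gsm8k'],                                   # --datasets
    'limit': 10,                                             # --limit
}

if __name__ == '__main__':
    run_task(task_cfg=task_cfg)
```

Keeping the config as data makes it easy to version alongside the `config.json` variant the README mentions.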
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/arguments.py
@@ -33,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # yapf: disable
  # Model-related arguments
  parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+ parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
  parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

  # Template-related arguments
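
The single change here adds a `--model-id` flag so the name shown in reports can differ from the ModelScope id or local path passed via `--model`. The following self-contained sketch uses plain argparse, not evalscope's own parser, just to show how the hyphenated flag surfaces as `args.model_id`.

```python
# Illustrative only: standard argparse behaviour, not evalscope's full parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=False,
                    help='The model id on modelscope, or local model dir.')
parser.add_argument('--model-id', type=str, required=False,
                    help='The model id for model name in report.')

args = parser.parse_args(['--model', 'Qwen/Qwen2.5-0.5B-Instruct',
                          '--model-id', 'qwen2.5-0.5b'])
# argparse maps the hyphenated flag to an underscored attribute.
print(args.model)     # Qwen/Qwen2.5-0.5B-Instruct
print(args.model_id)  # qwen2.5-0.5b
```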
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/arc/arc_adapter.py
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -109,12 +109,10 @@ class ARCAdapter(DataAdapter):
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
  context: str = '\n'.join(few_shot_prompts)

- context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
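
Beyond the metric rename, the substantive change is the prompt contract: instead of prepending `prompt_template` to the few-shot context, the adapter now returns it separately under `system_prompt` and leaves the model adapter to place it in the system role. The sketch below is a toy version of that return shape; the function name and signature are illustrative, not the real `ARCAdapter.gen_prompt`.

```python
# Toy version of the 0.10.1 prompt contract for multiple-choice adapters:
# the few-shot context stays in 'data', while the benchmark's prompt_template
# is passed through untouched as 'system_prompt'. Not the real ARCAdapter.
from typing import List, Optional


def gen_prompt(question: str,
               few_shot_prompts: List[str],
               choices: List[str],
               prompt_template: Optional[str] = None) -> dict:
    context = '\n'.join(few_shot_prompts)
    full_prompt = f'{context}\nQuestion: {question}\nAnswer:'
    return {
        'data': [full_prompt],             # text the model completes
        'multi_choices': choices,          # e.g. ['A', 'B', 'C', 'D']
        'system_prompt': prompt_template,  # may be None; the model adapter decides how to use it
    }


print(gen_prompt('Which gas do plants absorb from the air?',
                 ['Question: ...\nAnswer: B'],
                 ['A', 'B', 'C', 'D'],
                 prompt_template='Answer with a single letter.'))
```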
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/bbh/bbh_adapter.py
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,7 +63,7 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
@@ -122,7 +122,7 @@ class BBHAdapter(DataAdapter):
  cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
  full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."

- return {'data': [full_prompt]}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/benchmark.py
@@ -22,7 +22,7 @@ class BenchmarkMeta:
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
- prompt_template: str = ''
+ prompt_template: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
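
Typing `prompt_template` as `Optional[str] = None` lets adapters distinguish "no template configured" from a deliberately empty string, which matters now that the template is forwarded as a `system_prompt`. The sketch below shows only the changed field and the kind of check a consumer might perform; `BenchmarkMetaSketch` is a made-up name and the usage is assumed, not taken from this diff.

```python
# Sketch of the changed field only, not the full BenchmarkMeta dataclass.
from dataclasses import dataclass
from typing import Optional


@dataclass
class BenchmarkMetaSketch:
    prompt_template: Optional[str] = None  # was: prompt_template: str = ''


meta = BenchmarkMetaSketch()
if meta.prompt_template is not None:
    print('forward as system_prompt:', meta.prompt_template)
else:
    print('no system prompt configured')  # None is now distinguishable from ''
```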
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/ceval/ceval_adapter.py
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.metrics import exact_match, weighted_mean
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
@@ -130,7 +130,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
@@ -145,9 +145,10 @@ class CEVALAdapter(DataAdapter):
  if few_shot_num > 5:
  logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
  kwargs['few_shot_num'] = 5
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -206,7 +207,7 @@ class CEVALAdapter(DataAdapter):
  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
  full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -236,84 +237,6 @@ class CEVALAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: The user-defined report name.
-
- Returns:
- {
- "name":"C-Eval",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(score=subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- category_list = sorted(category_list, key=lambda x: x['name'])
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'ceval',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _format_example(cls, input_d: dict, include_answer=True):
  example = '问题:' + input_d['question']
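
The C-Eval adapter drops its bespoke `gen_report` override; subjects are instead mapped to a domain via `self.category_map`, and category scores can be computed generically (the new `evalscope/report/generator.py` in the file list presumably takes over this role). The sketch below only re-creates the weighted-average grouping the removed code performed, expressed against such a `category_map`; `category_scores` is a hypothetical helper, not evalscope's report generator.

```python
# Re-creation of the arithmetic the removed gen_report performed, expressed
# against a category_map such as {subset: SUBJECT_MAPPING[subset][-1]}.
# Illustration only; evalscope's shared report generator is the real consumer.
from collections import defaultdict
from typing import Dict, Tuple


def category_scores(subset_score_map: Dict[str, Tuple[float, int]],
                    category_map: Dict[str, str]) -> Dict[str, float]:
    """subset_score_map maps subset -> (score, num); returns category -> weighted average."""
    totals = defaultdict(lambda: [0.0, 0])  # category -> [sum of score*num, sum of num]
    for subset, (score, num) in subset_score_map.items():
        category = category_map.get(subset, 'DEFAULT')
        totals[category][0] += score * num
        totals[category][1] += num
    return {cat: round(s / n, 4) for cat, (s, n) in totals.items() if n}


scores = {'computer_network': (0.25, 20), 'operating_system': (0.35, 20)}
cats = {'computer_network': 'STEM', 'operating_system': 'STEM'}
print(category_scores(scores, cats))  # {'STEM': 0.3}, weighted by item counts
```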
{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
@@ -106,7 +106,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
@@ -116,9 +116,10 @@ class CMMLUAdapter(DataAdapter):
  choices = ['A', 'B', 'C', 'D']

  def __init__(self, **kwargs):
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -173,7 +174,7 @@ class CMMLUAdapter(DataAdapter):

  full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -203,81 +204,6 @@ class CMMLUAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: the user-defined report name. Default: None
-
- Returns:
- {
- "name":"CMMLU",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'cmmlu',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

{evalscope-0.9.0 → evalscope-0.10.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -5,7 +5,7 @@ import json
  import os

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
@@ -20,11 +20,11 @@ logger = get_logger()
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
- prompt_template='',
+ prompt_template='Put the final answer in \\boxed{}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -77,7 +77,7 @@ class CompetitionMathAdapter(DataAdapter):
  use_fewshot = self.few_shot_num > 0
  full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)

- return {'data': [full_prompt], 'system_prompt': 'Put the final answer in \\boxed{}.'}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
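
With `prompt_template` now carrying the instruction "Put the final answer in \boxed{}.", grading still depends on pulling the boxed expression out of the completion; the diff references `last_boxed_only_string`, `remove_boxed` and `is_equiv` from `evalscope.metrics.math_accuracy` for that. The sketch below is a toy regex extractor that only illustrates the idea; `last_boxed_answer` is a hypothetical helper, and the real functions do proper LaTeX normalization and handle nesting.

```python
# Toy illustration of why the system prompt asks for \boxed{...}: the grader
# pulls the last boxed expression from the completion and compares it with the
# reference answer. Not the real evalscope helpers.
import re
from typing import Optional


def last_boxed_answer(completion: str) -> Optional[str]:
    matches = re.findall(r'\\boxed\{([^{}]*)\}', completion)  # no nested braces in this toy
    return matches[-1] if matches else None


pred = last_boxed_answer('... therefore the total is \\boxed{42}.')
print(pred == '42')  # True for this toy case; the real is_equiv compares normalized forms
```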