evalscope 0.15.1__tar.gz → 0.16.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (498)
  1. {evalscope-0.15.1/evalscope.egg-info → evalscope-0.16.1}/PKG-INFO +57 -31
  2. {evalscope-0.15.1 → evalscope-0.16.1}/README.md +42 -18
  3. evalscope-0.16.1/evalscope/app/__init__.py +28 -0
  4. {evalscope-0.15.1/evalscope/report → evalscope-0.16.1/evalscope/app}/app.py +67 -59
  5. evalscope-0.16.1/evalscope/app/constants.py +21 -0
  6. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/arguments.py +12 -1
  7. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/backend_manager.py +2 -1
  8. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  9. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  10. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/embedding.py +75 -35
  11. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/llm.py +1 -1
  12. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  13. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/benchmark.py +1 -0
  14. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  15. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_adapter.py +101 -18
  16. evalscope-0.16.1/evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope-0.16.1/evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope-0.16.1/evalscope/benchmarks/drop/drop_adapter.py +133 -0
  19. evalscope-0.16.1/evalscope/benchmarks/drop/utils.py +59 -0
  20. evalscope-0.16.1/evalscope/benchmarks/frames/frames_adapter.py +90 -0
  21. evalscope-0.16.1/evalscope/benchmarks/frames/utils.py +37 -0
  22. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  23. evalscope-0.16.1/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  24. evalscope-0.16.1/evalscope/benchmarks/needle_haystack/utils.py +79 -0
  25. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  26. {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.1/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
  27. evalscope-0.16.1/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  28. evalscope-0.16.1/evalscope/benchmarks/tool_bench/utils.py +203 -0
  29. evalscope-0.16.1/evalscope/benchmarks/utils.py +60 -0
  30. evalscope-0.16.1/evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  31. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_app.py +2 -2
  32. evalscope-0.16.1/evalscope/collections/__init__.py +35 -0
  33. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/evaluator.py +94 -32
  34. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/config.py +54 -17
  35. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/evaluator.py +80 -41
  36. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/__init__.py +3 -1
  37. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  38. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/llm_judge.py +15 -8
  39. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/math_parser.py +1 -1
  40. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/rouge_metric.py +11 -13
  41. evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  42. evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  43. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/chat_adapter.py +51 -34
  44. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/server_adapter.py +17 -25
  45. evalscope-0.16.1/evalscope/perf/__init__.py +0 -0
  46. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/arguments.py +16 -7
  47. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/benchmark.py +0 -15
  48. evalscope-0.16.1/evalscope/perf/main.py +103 -0
  49. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/custom.py +15 -0
  50. evalscope-0.16.1/evalscope/perf/utils/__init__.py +0 -0
  51. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/benchmark_util.py +34 -16
  52. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/db_util.py +25 -15
  53. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/local_server.py +1 -0
  54. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope-0.16.1/evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope-0.16.1/evalscope/report/__init__.py +38 -0
  57. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/combinator.py +8 -0
  58. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/generator.py +33 -9
  59. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/report/utils.py +61 -4
  60. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/run.py +12 -0
  61. evalscope-0.16.1/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  62. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  63. evalscope-0.16.1/evalscope/utils/deprecation_utils.py +42 -0
  64. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/logger.py +1 -1
  65. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/utils.py +12 -0
  66. evalscope-0.16.1/evalscope/version.py +4 -0
  67. {evalscope-0.15.1 → evalscope-0.16.1/evalscope.egg-info}/PKG-INFO +57 -31
  68. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/SOURCES.txt +23 -2
  69. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/requires.txt +14 -12
  70. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/framework.txt +2 -2
  71. evalscope-0.16.1/requirements/opencompass.txt +1 -0
  72. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/perf.txt +2 -1
  73. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/rag.txt +1 -1
  74. evalscope-0.16.1/requirements/vlmeval.txt +1 -0
  75. {evalscope-0.15.1 → evalscope-0.16.1}/tests/aigc/test_t2i.py +40 -3
  76. {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_all.py +39 -32
  77. {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_collection.py +8 -6
  78. {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/test_run.py +43 -17
  79. {evalscope-0.15.1 → evalscope-0.16.1}/tests/perf/test_perf.py +23 -0
  80. evalscope-0.16.1/tests/rag/__init__.py +0 -0
  81. {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_mteb.py +5 -5
  82. evalscope-0.15.1/evalscope/benchmarks/utils.py +0 -34
  83. evalscope-0.15.1/evalscope/collections/__init__.py +0 -3
  84. evalscope-0.15.1/evalscope/perf/main.py +0 -46
  85. evalscope-0.15.1/evalscope/report/__init__.py +0 -6
  86. evalscope-0.15.1/evalscope/version.py +0 -4
  87. evalscope-0.15.1/requirements/opencompass.txt +0 -1
  88. evalscope-0.15.1/requirements/vlmeval.txt +0 -1
  89. {evalscope-0.15.1 → evalscope-0.16.1}/LICENSE +0 -0
  90. {evalscope-0.15.1 → evalscope-0.16.1}/MANIFEST.in +0 -0
  91. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/__init__.py +0 -0
  92. evalscope-0.15.1/evalscope/report/app_arguments.py → evalscope-0.16.1/evalscope/app/arguments.py +0 -0
  93. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/__init__.py +0 -0
  94. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/base.py +0 -0
  95. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/__init__.py +0 -0
  96. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  97. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  98. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  99. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  100. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/__init__.py +0 -0
  101. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  102. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  103. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  104. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +0 -0
  105. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  106. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  107. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  108. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  109. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  110. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  111. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  112. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  113. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  114. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  115. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  116. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  117. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  118. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  119. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  120. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  121. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  122. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  123. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  124. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  125. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  126. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  127. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  128. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  129. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  130. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  131. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  132. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  133. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  134. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  135. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  136. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/__init__.py +0 -0
  137. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/__init__.py +0 -0
  138. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  139. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
  140. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -0
  141. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
  142. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
  143. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
  144. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
  145. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/__init__.py +0 -0
  146. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime24_adapter.py +0 -0
  147. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/aime/aime25_adapter.py +0 -0
  148. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  149. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/__init__.py +0 -0
  150. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  151. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arc/arc_adapter.py +0 -0
  152. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
  153. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +0 -0
  154. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/arena_hard/utils.py +0 -0
  155. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/__init__.py +0 -0
  156. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/bbh_adapter.py +0 -0
  157. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  158. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  159. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  160. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  161. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  162. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  163. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  164. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  165. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  166. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  167. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  168. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  169. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  170. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  171. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  172. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  173. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  174. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  175. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  176. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  177. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  178. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  179. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  180. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  181. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  182. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  183. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  184. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/__init__.py +0 -0
  185. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_adapter.py +0 -0
  186. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  187. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  188. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  189. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  190. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +0 -0
  191. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  192. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  193. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  194. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -0
  195. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/__init__.py +0 -0
  196. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -0
  197. {evalscope-0.15.1/evalscope/benchmarks/general_mcq → evalscope-0.16.1/evalscope/benchmarks/docmath}/__init__.py +0 -0
  198. {evalscope-0.15.1/evalscope/benchmarks/gpqa → evalscope-0.16.1/evalscope/benchmarks/drop}/__init__.py +0 -0
  199. {evalscope-0.15.1/evalscope/benchmarks/ifeval → evalscope-0.16.1/evalscope/benchmarks/frames}/__init__.py +0 -0
  200. {evalscope-0.15.1/evalscope/benchmarks/iquiz → evalscope-0.16.1/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  201. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +0 -0
  202. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  203. {evalscope-0.15.1/evalscope/benchmarks/live_code_bench → evalscope-0.16.1/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  204. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  205. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gpqa/gpqa_adapter.py +0 -0
  206. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  207. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  208. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +0 -0
  209. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  210. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  211. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +0 -0
  212. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  213. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  214. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -0
  215. {evalscope-0.15.1/evalscope/benchmarks/maritime_bench → evalscope-0.16.1/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  216. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/ifeval_adapter.py +0 -0
  217. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  218. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  219. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  220. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/ifeval/utils.py +0 -0
  221. {evalscope-0.15.1/evalscope/benchmarks/math_500 → evalscope-0.16.1/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  222. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/iquiz/iquiz_adapter.py +0 -0
  223. {evalscope-0.15.1/evalscope/benchmarks/mmlu_pro → evalscope-0.16.1/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  224. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  225. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  226. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -0
  227. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  228. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  229. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  230. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  231. {evalscope-0.15.1/evalscope/benchmarks/mmlu_redux → evalscope-0.16.1/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  232. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +0 -0
  233. {evalscope-0.15.1/evalscope/benchmarks/musr → evalscope-0.16.1/evalscope/benchmarks/math_500}/__init__.py +0 -0
  234. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/math_500/math_500_adapter.py +0 -0
  235. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  236. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  237. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/mmlu_adapter.py +0 -0
  238. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  239. {evalscope-0.15.1/evalscope/benchmarks/process_bench → evalscope-0.16.1/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  240. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +0 -0
  241. {evalscope-0.15.1/evalscope/benchmarks/simple_qa → evalscope-0.16.1/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  242. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +0 -0
  243. {evalscope-0.15.1/evalscope/benchmarks/super_gpqa → evalscope-0.16.1/evalscope/benchmarks/musr}/__init__.py +0 -0
  244. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/musr/musr_adapter.py +0 -0
  245. {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models → evalscope-0.16.1/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  246. {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.1/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  247. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  248. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/process_bench/process_bench_adapter.py +0 -0
  249. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/__init__.py +0 -0
  250. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/race.py +0 -0
  251. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/race_adapter.py +0 -0
  252. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/race/samples.jsonl +0 -0
  253. {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.1/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  254. {evalscope-0.15.1/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.1/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  255. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +0 -0
  256. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +0 -0
  257. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  258. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -0
  259. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  260. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  261. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  262. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +0 -0
  263. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  264. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  265. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +0 -0
  266. {evalscope-0.15.1/evalscope/perf → evalscope-0.16.1/evalscope/benchmarks/winogrande}/__init__.py +0 -0
  267. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/__init__.py +0 -0
  268. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/base.py +0 -0
  269. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/cli.py +0 -0
  270. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_eval.py +0 -0
  271. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_perf.py +0 -0
  272. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/cli/start_server.py +0 -0
  273. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/sampler.py +0 -0
  274. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/collections/schema.py +0 -0
  275. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/constants.py +0 -0
  276. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/__init__.py +0 -0
  277. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/rating_eval.py +0 -0
  278. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/reviewer/__init__.py +0 -0
  279. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  280. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  281. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/metrics.py +0 -0
  282. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/named_metrics.py +0 -0
  283. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/__init__.py +0 -0
  284. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  285. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  286. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  287. {evalscope-0.15.1/evalscope/perf/utils → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  288. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  289. {evalscope-0.15.1/evalscope/third_party/thinkbench/tools → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
  290. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  291. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
  292. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
  293. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
  294. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
  295. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
  296. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
  297. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  298. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
  299. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
  300. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
  301. {evalscope-0.15.1/tests/rag → evalscope-0.16.1/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
  302. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
  303. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
  304. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  305. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  306. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  307. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  308. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
  309. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  310. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
  311. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  312. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
  313. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
  314. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  315. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  316. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  317. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  318. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  319. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  320. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  321. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  322. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  323. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  324. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  325. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  326. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  327. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  328. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  329. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  330. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  331. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  332. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  333. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  334. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  335. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  336. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  337. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  338. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  339. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  340. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  341. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  342. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  343. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  344. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  345. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  346. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  347. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  348. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
  349. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  350. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
  351. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  352. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  353. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
  354. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
  355. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
  356. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
  357. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
  358. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
  359. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
  360. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
  361. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  362. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  363. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  364. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
  365. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
  366. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
  367. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  368. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
  369. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
  370. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
  371. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
  372. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
  373. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
  374. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
  375. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  376. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  377. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
  378. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
  379. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
  380. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/score.py +0 -0
  381. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  382. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/__init__.py +0 -0
  383. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/__init__.py +0 -0
  384. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/base_adapter.py +0 -0
  385. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/choice_adapter.py +0 -0
  386. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/custom_adapter.py +0 -0
  387. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/adapters/t2i_adapter.py +0 -0
  388. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/__init__.py +0 -0
  389. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/custom_model.py +0 -0
  390. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/custom/dummy_model.py +0 -0
  391. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/local_model.py +0 -0
  392. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/model.py +0 -0
  393. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/models/register.py +0 -0
  394. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/http_client.py +0 -0
  395. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/__init__.py +0 -0
  396. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/__init__.py +0 -0
  397. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/base.py +0 -0
  398. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/custom_api.py +0 -0
  399. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  400. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/api/openai_api.py +0 -0
  401. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  402. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/base.py +0 -0
  403. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  404. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  405. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  406. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  407. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
  408. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  409. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/plugin/registry.py +0 -0
  410. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/analysis_result.py +0 -0
  411. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/perf/utils/handler.py +0 -0
  412. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/__init__.py +0 -0
  413. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena.yaml +0 -0
  414. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  415. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  416. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/config/cfg_single.yaml +0 -0
  417. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  418. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  419. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  420. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  421. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/data/question.jsonl +0 -0
  422. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/arc.yaml +0 -0
  423. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/bbh.yaml +0 -0
  424. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  425. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/ceval.yaml +0 -0
  426. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  427. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  428. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  429. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/general_qa.yaml +0 -0
  430. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  431. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu.yaml +0 -0
  432. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  433. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/run_arena.py +0 -0
  434. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/summarizer.py +0 -0
  435. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/__init__.py +0 -0
  436. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/README.md +0 -0
  437. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/__init__.py +0 -0
  438. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.json +0 -0
  439. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  440. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/eval.py +0 -0
  441. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/infer.py +0 -0
  442. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  443. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  444. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  445. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  446. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  447. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  448. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  449. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  450. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  451. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/longbench_write/utils.py +0 -0
  452. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/__init__.py +0 -0
  453. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/eval.py +0 -0
  454. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/infer.py +0 -0
  455. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  456. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  457. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  458. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  459. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/README.md +0 -0
  460. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  461. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  462. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  463. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/eval.py +0 -0
  464. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/infer.py +0 -0
  465. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  466. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  467. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  468. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/__init__.py +0 -0
  469. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/arena_utils.py +0 -0
  470. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/chat_service.py +0 -0
  471. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/completion_parsers.py +0 -0
  472. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/filters.py +0 -0
  473. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/import_utils.py +0 -0
  474. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/io_utils.py +0 -0
  475. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope/utils/model_utils.py +0 -0
  476. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/dependency_links.txt +0 -0
  477. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/entry_points.txt +0 -0
  478. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/not-zip-safe +0 -0
  479. {evalscope-0.15.1 → evalscope-0.16.1}/evalscope.egg-info/top_level.txt +0 -0
  480. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/aigc.txt +0 -0
  481. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/app.txt +0 -0
  482. {evalscope-0.15.1 → evalscope-0.16.1}/requirements/docs.txt +0 -0
  483. {evalscope-0.15.1 → evalscope-0.16.1}/requirements.txt +0 -0
  484. {evalscope-0.15.1 → evalscope-0.16.1}/setup.cfg +0 -0
  485. {evalscope-0.15.1 → evalscope-0.16.1}/setup.py +0 -0
  486. {evalscope-0.15.1 → evalscope-0.16.1}/tests/__init__.py +0 -0
  487. {evalscope-0.15.1 → evalscope-0.16.1}/tests/aigc/__init__.py +0 -0
  488. {evalscope-0.15.1 → evalscope-0.16.1}/tests/cli/__init__.py +0 -0
  489. {evalscope-0.15.1 → evalscope-0.16.1}/tests/perf/__init__.py +0 -0
  490. {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_clip_benchmark.py +0 -0
  491. {evalscope-0.15.1 → evalscope-0.16.1}/tests/rag/test_ragas.py +0 -0
  492. {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/__init__.py +0 -0
  493. {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_eval.py +0 -0
  494. {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  495. {evalscope-0.15.1 → evalscope-0.16.1}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  496. {evalscope-0.15.1 → evalscope-0.16.1}/tests/test_run_all.py +0 -0
  497. {evalscope-0.15.1 → evalscope-0.16.1}/tests/vlm/__init__.py +0 -0
  498. {evalscope-0.15.1 → evalscope-0.16.1}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.15.1
3
+ Version: 0.16.1
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: accelerate
20
- Requires-Dist: datasets<=3.2.0,>=3.0.0
20
+ Requires-Dist: datasets>=3.0
21
21
  Requires-Dist: immutabledict
22
22
  Requires-Dist: jieba
23
23
  Requires-Dist: jsonlines
24
24
  Requires-Dist: langdetect
25
- Requires-Dist: latex2sympy2
25
+ Requires-Dist: latex2sympy2_extended
26
26
  Requires-Dist: matplotlib
27
27
  Requires-Dist: modelscope[framework]
28
28
  Requires-Dist: nltk>=3.9
@@ -45,24 +45,25 @@ Requires-Dist: tqdm
45
45
  Requires-Dist: transformers>=4.33
46
46
  Requires-Dist: word2number
47
47
  Provides-Extra: opencompass
48
- Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
48
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
49
49
  Provides-Extra: vlmeval
50
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
50
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
51
51
  Provides-Extra: rag
52
52
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
53
53
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
54
54
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
55
55
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
56
- Requires-Dist: mteb==1.19.4; extra == "rag"
56
+ Requires-Dist: mteb==1.38.20; extra == "rag"
57
57
  Requires-Dist: ragas==0.2.14; extra == "rag"
58
58
  Requires-Dist: webdataset>0.2.0; extra == "rag"
59
59
  Provides-Extra: perf
60
60
  Requires-Dist: aiohttp; extra == "perf"
61
61
  Requires-Dist: fastapi; extra == "perf"
62
62
  Requires-Dist: numpy; extra == "perf"
63
+ Requires-Dist: rich; extra == "perf"
63
64
  Requires-Dist: sse_starlette; extra == "perf"
64
65
  Requires-Dist: transformers; extra == "perf"
65
- Requires-Dist: unicorn; extra == "perf"
66
+ Requires-Dist: uvicorn; extra == "perf"
66
67
  Provides-Extra: app
67
68
  Requires-Dist: gradio==5.4.0; extra == "app"
68
69
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -74,12 +75,12 @@ Requires-Dist: open_clip_torch; extra == "aigc"
74
75
  Requires-Dist: opencv-python; extra == "aigc"
75
76
  Provides-Extra: all
76
77
  Requires-Dist: accelerate; extra == "all"
77
- Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
78
+ Requires-Dist: datasets>=3.0; extra == "all"
78
79
  Requires-Dist: immutabledict; extra == "all"
79
80
  Requires-Dist: jieba; extra == "all"
80
81
  Requires-Dist: jsonlines; extra == "all"
81
82
  Requires-Dist: langdetect; extra == "all"
82
- Requires-Dist: latex2sympy2; extra == "all"
83
+ Requires-Dist: latex2sympy2_extended; extra == "all"
83
84
  Requires-Dist: matplotlib; extra == "all"
84
85
  Requires-Dist: modelscope[framework]; extra == "all"
85
86
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -101,21 +102,22 @@ Requires-Dist: torchvision; extra == "all"
101
102
  Requires-Dist: tqdm; extra == "all"
102
103
  Requires-Dist: transformers>=4.33; extra == "all"
103
104
  Requires-Dist: word2number; extra == "all"
104
- Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
105
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
105
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
106
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
106
107
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
107
108
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
108
109
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
109
110
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
110
- Requires-Dist: mteb==1.19.4; extra == "all"
111
+ Requires-Dist: mteb==1.38.20; extra == "all"
111
112
  Requires-Dist: ragas==0.2.14; extra == "all"
112
113
  Requires-Dist: webdataset>0.2.0; extra == "all"
113
114
  Requires-Dist: aiohttp; extra == "all"
114
115
  Requires-Dist: fastapi; extra == "all"
115
116
  Requires-Dist: numpy; extra == "all"
117
+ Requires-Dist: rich; extra == "all"
116
118
  Requires-Dist: sse_starlette; extra == "all"
117
119
  Requires-Dist: transformers; extra == "all"
118
- Requires-Dist: unicorn; extra == "all"
120
+ Requires-Dist: uvicorn; extra == "all"
119
121
  Requires-Dist: gradio==5.4.0; extra == "all"
120
122
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
121
123
  Requires-Dist: diffusers; extra == "all"
@@ -177,9 +179,23 @@ Requires-Dist: opencv-python; extra == "all"
177
179
 
178
180
  ## 📝 Introduction
179
181
 
180
- EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
182
+ EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
181
183
 
182
- The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀
184
+ - 🧠 Large Language Models
185
+ - 🎨 Multimodal Models
186
+ - 🔍 Embedding Models
187
+ - 🏆 Reranker Models
188
+ - 🖼️ CLIP Models
189
+ - 🎭 AIGC Models (Image-to-Text/Video)
190
+ - ...and more!
191
+
192
+ EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
193
+
194
+ - 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
195
+ - 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
196
+ - 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
197
+
198
+ Below is the overall architecture diagram of EvalScope:
183
199
 
184
200
  <p align="center">
185
201
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -214,6 +230,10 @@ Please scan the QR code below to join our community groups:
214
230
 
215
231
  ## 🎉 News
216
232
 
233
+ - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
234
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
235
+ - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
236
+ - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate models' tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
217
237
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
218
238
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
219
239
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -479,26 +499,27 @@ For more customized evaluations, such as customizing model parameters or dataset
479
499
 
480
500
  ```shell
481
501
  evalscope eval \
482
- --model Qwen/Qwen2.5-0.5B-Instruct \
483
- --model-args revision=master,precision=torch.float16,device_map=auto \
484
- --generation-config do_sample=true,temperature=0.5 \
502
+ --model Qwen/Qwen3-0.6B \
503
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
504
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
485
505
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
486
506
  --datasets gsm8k \
487
507
  --limit 10
488
508
  ```
489
509
 
490
- ### Parameter
491
- - `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
492
- - `revision`: Model version, default is `master`
493
- - `precision`: Model precision, default is `auto`
494
- - `device_map`: Model device allocation, default is `auto`
495
- - `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
496
- - `do_sample`: Whether to use sampling, default is `false`
497
- - `max_length`: Maximum length, default is 2048
498
- - `max_new_tokens`: Maximum length of generation, default is 512
499
- - `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
510
+ ### Parameter Description
511
+ - `--model-args`: Model loading parameters, passed as a JSON string:
512
+ - `revision`: Model version
513
+ - `precision`: Model precision
514
+ - `device_map`: Device allocation for the model
515
+ - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
516
+ - `do_sample`: Whether to use sampling
517
+ - `temperature`: Generation temperature
518
+ - `max_new_tokens`: Maximum length of generated tokens
519
+ - `chat_template_kwargs`: Model inference template parameters
520
+ - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
500
521
  - `few_shot_num`: Number of few-shot examples
501
- - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
522
+ - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
502
523
 
503
524
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
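
  For programmatic use, the JSON strings shown above can be generated from plain Python dicts instead of being hand-escaped in the shell. A minimal sketch (standard library only; the values simply mirror the CLI example above and should be adjusted to your model):

  ```python
  import json
  import shlex

  # Illustrative values, matching the evalscope eval example above.
  model_args = {"revision": "master", "precision": "torch.float16", "device_map": "auto"}
  generation_config = {
      "do_sample": True,
      "temperature": 0.6,
      "max_new_tokens": 512,
      "chat_template_kwargs": {"enable_thinking": False},
  }
  dataset_args = {"gsm8k": {"few_shot_num": 0, "few_shot_random": False}}

  # Print shell-safe CLI fragments to paste into the `evalscope eval` command.
  print("--model-args", shlex.quote(json.dumps(model_args)))
  print("--generation-config", shlex.quote(json.dumps(generation_config)))
  print("--dataset-args", shlex.quote(json.dumps(dataset_args)))
  ```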
504
525
 
@@ -517,6 +538,11 @@ A stress testing tool focused on large language models, which can be customized
517
538
 
518
539
  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
519
540
 
541
+ **Output example**
542
+
543
+ ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
544
+
545
+
520
546
  **Supports wandb for recording results**
521
547
 
522
548
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
@@ -565,7 +591,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
565
591
  </a>
566
592
 
567
593
  ## 🔜 Roadmap
568
- - [ ] Support for better evaluation report visualization
594
+ - [x] Support for better evaluation report visualization
569
595
  - [x] Support for mixed evaluations across multiple datasets
570
596
  - [x] RAG evaluation
571
597
  - [x] VLM evaluation
@@ -575,7 +601,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
575
601
  - [x] Multi-modal evaluation
576
602
  - [ ] Benchmarks
577
603
  - [ ] GAIA
578
- - [ ] GPQA
604
+ - [x] GPQA
579
605
  - [x] MBPP
580
606
 
581
607
 
@@ -51,9 +51,23 @@
51
51
 
52
52
  ## 📝 Introduction
53
53
 
54
- EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
54
+ EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
55
55
 
56
- The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀
56
+ - 🧠 Large Language Models
57
+ - 🎨 Multimodal Models
58
+ - 🔍 Embedding Models
59
+ - 🏆 Reranker Models
60
+ - 🖼️ CLIP Models
61
+ - 🎭 AIGC Models (Image-to-Text/Video)
62
+ - ...and more!
63
+
64
+ EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
65
+
66
+ - 🏅 Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
67
+ - 📊 Model inference performance stress testing: Ensuring your model excels in real-world applications.
68
+ - 🚀 Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
69
+
70
+ Below is the overall architecture diagram of EvalScope:
57
71
 
58
72
  <p align="center">
59
73
  <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -88,6 +102,10 @@ Please scan the QR code below to join our community groups:
88
102
 
89
103
  ## 🎉 News
90
104
 
105
+ - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
106
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
107
+ - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
108
+ - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate models' tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
91
109
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
92
110
  - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
93
111
  - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -353,26 +371,27 @@ For more customized evaluations, such as customizing model parameters or dataset
353
371
 
354
372
  ```shell
355
373
  evalscope eval \
356
- --model Qwen/Qwen2.5-0.5B-Instruct \
357
- --model-args revision=master,precision=torch.float16,device_map=auto \
358
- --generation-config do_sample=true,temperature=0.5 \
374
+ --model Qwen/Qwen3-0.6B \
375
+ --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
376
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_new_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
359
377
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
360
378
  --datasets gsm8k \
361
379
  --limit 10
362
380
  ```
363
381
 
364
- ### Parameter
365
- - `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
366
- - `revision`: Model version, default is `master`
367
- - `precision`: Model precision, default is `auto`
368
- - `device_map`: Model device allocation, default is `auto`
369
- - `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
370
- - `do_sample`: Whether to use sampling, default is `false`
371
- - `max_length`: Maximum length, default is 2048
372
- - `max_new_tokens`: Maximum length of generation, default is 512
373
- - `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
382
+ ### Parameter Description
383
+ - `--model-args`: Model loading parameters, passed as a JSON string:
384
+ - `revision`: Model version
385
+ - `precision`: Model precision
386
+ - `device_map`: Device allocation for the model
387
+ - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
388
+ - `do_sample`: Whether to use sampling
389
+ - `temperature`: Generation temperature
390
+ - `max_new_tokens`: Maximum length of generated tokens
391
+ - `chat_template_kwargs`: Model inference template parameters
392
+ - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
374
393
  - `few_shot_num`: Number of few-shot examples
375
- - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
394
+ - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
376
395
 
377
396
  Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
378
397
 
@@ -391,6 +410,11 @@ A stress testing tool focused on large language models, which can be customized
391
410
 
392
411
  Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
393
412
 
413
+ **Output example**
414
+
415
+ ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
416
+
417
+
394
418
  **Supports wandb for recording results**
395
419
 
396
420
  ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
@@ -439,7 +463,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
439
463
  </a>
440
464
 
441
465
  ## 🔜 Roadmap
442
- - [ ] Support for better evaluation report visualization
466
+ - [x] Support for better evaluation report visualization
443
467
  - [x] Support for mixed evaluations across multiple datasets
444
468
  - [x] RAG evaluation
445
469
  - [x] VLM evaluation
@@ -449,7 +473,7 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
449
473
  - [x] Multi-modal evaluation
450
474
  - [ ] Benchmarks
451
475
  - [ ] GAIA
452
- - [ ] GPQA
476
+ - [x] GPQA
453
477
  - [x] MBPP
454
478
 
455
479
 
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from typing import TYPE_CHECKING
3
+
4
+ from evalscope.utils.import_utils import _LazyModule
5
+
6
+ if TYPE_CHECKING:
7
+ from .app import create_app
8
+ from .arguments import add_argument
9
+
10
+ else:
11
+ _import_structure = {
12
+ 'app': [
13
+ 'create_app',
14
+ ],
15
+ 'arguments': [
16
+ 'add_argument',
17
+ ],
18
+ }
19
+
20
+ import sys
21
+
22
+ sys.modules[__name__] = _LazyModule(
23
+ __name__,
24
+ globals()['__file__'],
25
+ _import_structure,
26
+ module_spec=__spec__,
27
+ extra_objects={},
28
+ )
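
With this lazy-module registration, importing the subpackage stays lightweight and the exported names are only resolved on first access. An illustrative check (assuming evalscope is installed with the `app` extra; whether heavy UI dependencies are already loaded depends on the rest of the package's top-level imports):

```python
import sys

import evalscope.app                    # registers the lazy module only

print('gradio' in sys.modules)          # likely still False at this point
from evalscope.app import create_app    # attribute access triggers the real import
print(callable(create_app))             # True
```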
@@ -11,35 +11,15 @@ from dataclasses import dataclass
11
11
  from typing import Any, List, Union
12
12
 
13
13
  from evalscope.constants import DataCollection
14
- from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
14
+ from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
15
15
  from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
16
16
  from evalscope.utils.logger import configure_logging, get_logger
17
17
  from evalscope.version import __version__
18
+ from .arguments import add_argument
19
+ from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN
18
20
 
19
21
  logger = get_logger()
20
22
 
21
- PLOTLY_THEME = 'plotly_dark'
22
- REPORT_TOKEN = '@@'
23
- MODEL_TOKEN = '::'
24
- DATASET_TOKEN = ', '
25
- LATEX_DELIMITERS = [{
26
- 'left': '$$',
27
- 'right': '$$',
28
- 'display': True
29
- }, {
30
- 'left': '$',
31
- 'right': '$',
32
- 'display': False
33
- }, {
34
- 'left': '\\(',
35
- 'right': '\\)',
36
- 'display': False
37
- }, {
38
- 'left': '\\[',
39
- 'right': '\\]',
40
- 'display': True
41
- }]
42
-
43
23
 
44
24
  def scan_for_report_folders(root_path):
45
25
  """Scan for folders containing reports subdirectories"""
@@ -185,6 +165,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
185
165
  return df, styler
186
166
 
187
167
 
168
+ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
169
+ for report in report_list:
170
+ if report.dataset_name == dataset_name:
171
+ return report.analysis
172
+ return 'N/A'
173
+
174
+
188
175
  def plot_single_dataset_scores(df: pd.DataFrame):
189
176
  # TODO: add metric radio and replace category name
190
177
  plot = px.bar(
@@ -223,6 +210,33 @@ def plot_multi_report_radar(df: pd.DataFrame):
223
210
  return fig
224
211
 
225
212
 
213
+ def convert_markdown_image(text):
214
+ if not os.path.isfile(text):
215
+ return text
216
+ # Convert the image path to a markdown image tag
217
+ if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
218
+ text = os.path.abspath(text)
219
+ image_tag = f'![image](gradio_api/file={text})'
220
+ logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
221
+ return image_tag
222
+ return text
223
+
224
+
225
+ def convert_html_tags(text):
226
+ # match opening tags
227
+ text = re.sub(r'<(\w+)>', r'[\1]', text)
228
+ # match closing tags
229
+ text = re.sub(r'</(\w+)>', r'[/\1]', text)
230
+ return text
231
+
232
+
233
+ def process_string(string: str, max_length: int = 2048) -> str:
234
+ string = convert_html_tags(string)  # for display labels, e.g. `<think>`
235
+ if max_length and len(string) > max_length:
236
+ return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
237
+ return string
238
+
239
+
226
240
  def dict_to_markdown(data) -> str:
227
241
  markdown_lines = []
228
242
 
@@ -230,55 +244,41 @@ def dict_to_markdown(data) -> str:
230
244
  bold_key = f'**{key}**'
231
245
 
232
246
  if isinstance(value, list):
233
- value_str = '\n' + '\n'.join([f' - {item}' for item in value])
247
+ value_str = '\n' + '\n'.join([f'- {process_model_prediction(item, max_length=None)}' for item in value])
234
248
  elif isinstance(value, dict):
235
249
  value_str = dict_to_markdown(value)
236
250
  else:
237
251
  value_str = str(value)
238
252
 
239
- value_str = process_string(value_str)
240
- markdown_line = f'{bold_key}: {value_str}'
253
+ value_str = process_string(value_str, max_length=None) # Convert HTML tags but don't truncate
254
+ markdown_line = f'{bold_key}:\n{value_str}'
241
255
  markdown_lines.append(markdown_line)
242
256
 
243
257
  return '\n\n'.join(markdown_lines)
244
258
 
245
259
 
246
- def convert_html_tags(text):
247
- # match begin label
248
- text = re.sub(r'<(\w+)>', r'[\1]', text)
249
- # match end label
250
- text = re.sub(r'</(\w+)>', r'[/\1]', text)
251
- return text
260
+ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
261
+ """
262
+ Process model prediction output into a formatted string.
252
263
 
264
+ Args:
265
+ item: The item to process. Can be a string, list, or dictionary.
266
+ max_length: The maximum length of the output string.
253
267
 
254
- def convert_markdown_image(text):
255
- if not os.path.isfile(text):
256
- return text
257
- # Convert the image path to a markdown image tag
258
- if text.endswith('.png') or text.endswith('.jpg') or text.endswith('.jpeg'):
259
- text = os.path.abspath(text)
260
- image_tag = f'![image](gradio_api/file={text})'
261
- logger.debug(f'Converting image path to markdown: {text} -> {image_tag}')
262
- return image_tag
263
- return text
264
-
265
-
266
- def process_string(string: str, max_length: int = 2048) -> str:
267
- string = convert_html_tags(string) # for display labels e.g. `<think>`
268
- if len(string) > max_length:
269
- return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
270
- return string
271
-
272
-
273
- def process_model_prediction(item: Any):
268
+ Returns:
269
+ A formatted string representation of the input.
270
+ """
274
271
  if isinstance(item, dict):
275
- res = dict_to_markdown(item)
276
- return process_string(res)
272
+ result = dict_to_markdown(item)
277
273
  elif isinstance(item, list):
278
- res = '\n'.join([process_model_prediction(item) for item in item])
279
- return process_string(res)
274
+ result = '\n'.join([f'- {process_model_prediction(i, max_length=None)}' for i in item])
280
275
  else:
281
- return process_string(str(item))
276
+ result = str(item)
277
+
278
+ # Apply HTML tag conversion and truncation only at the final output
279
+ if max_length is not None:
280
+ return process_string(result, max_length)
281
+ return result
282
282
 
283
283
 
284
284
  def normalize_score(score):
@@ -443,6 +443,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
443
443
  'zh': '数据集分数',
444
444
  'en': 'Dataset Scores'
445
445
  },
446
+ 'report_analysis': {
447
+ 'zh': '报告智能分析',
448
+ 'en': 'Report Intelligent Analysis'
449
+ },
446
450
  'dataset_scores_table': {
447
451
  'zh': '数据集分数表',
448
452
  'en': 'Dataset Scores Table'
@@ -498,6 +502,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
498
502
  with gr.Tab(locale_dict['dataset_details'][lang]):
499
503
  dataset_radio = gr.Radio(
500
504
  label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
505
+ # show dataset details
506
+ with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
507
+ report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
501
508
  gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
502
509
  dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
503
510
  gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -573,15 +580,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
573
580
  @gr.on(
574
581
  triggers=[dataset_radio.change, report_list.change],
575
582
  inputs=[dataset_radio, report_list],
576
- outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
583
+ outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
577
584
  def update_single_report_dataset(dataset_name, report_list):
578
585
  logger.debug(f'Updating single report dataset: {dataset_name}')
579
586
  report_df = get_data_frame(report_list)
587
+ analysis = get_report_analysis(report_list, dataset_name)
580
588
  data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
581
589
  data_score_plot = plot_single_dataset_scores(data_score_df)
582
590
  subsets = data_score_df[ReportKey.subset_name].unique().tolist()
583
591
  logger.debug(f'subsets: {subsets}')
584
- return data_score_plot, styler, gr.update(choices=subsets, value=None), None
592
+ return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
585
593
 
586
594
  @gr.on(
587
595
  triggers=[subset_select.change],
@@ -0,0 +1,21 @@
1
+ PLOTLY_THEME = 'plotly_dark'
2
+ REPORT_TOKEN = '@@'
3
+ MODEL_TOKEN = '::'
4
+ DATASET_TOKEN = ', '
5
+ LATEX_DELIMITERS = [{
6
+ 'left': '$$',
7
+ 'right': '$$',
8
+ 'display': True
9
+ }, {
10
+ 'left': '$',
11
+ 'right': '$',
12
+ 'display': False
13
+ }, {
14
+ 'left': '\\(',
15
+ 'right': '\\)',
16
+ 'display': False
17
+ }, {
18
+ 'left': '\\[',
19
+ 'right': '\\]',
20
+ 'display': True
21
+ }]
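
These delimiter definitions use the format Gradio's Markdown component accepts, so both `$...$`/`$$...$$` and `\(...\)`/`\[...\]` math render in the report UI. A hypothetical wiring sketch (not taken from the app code; assumes the module path `evalscope.app.constants` introduced in this release):

```python
import gradio as gr

from evalscope.app.constants import LATEX_DELIMITERS

# Render model output containing both inline and display math.
with gr.Blocks() as demo:
    gr.Markdown(
        r"Inline \(e^{i\pi} + 1 = 0\) and display $$\int_0^1 x\,dx = \tfrac{1}{2}$$",
        latex_delimiters=LATEX_DELIMITERS,
    )

demo.launch()
```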
@@ -9,6 +9,15 @@ class ParseStrArgsAction(argparse.Action):
9
9
  def __call__(self, parser, namespace, values, option_string=None):
10
10
  assert isinstance(values, str), 'args should be a string.'
11
11
 
12
+ # try json load first
13
+ try:
14
+ arg_dict = json.loads(values)
15
+ setattr(namespace, self.dest, arg_dict)
16
+ return
17
+ except (json.JSONDecodeError, ValueError):
18
+ pass
19
+
20
+ # If JSON load fails, fall back to parsing as key=value pairs
12
21
  arg_dict = {}
13
22
  for arg in values.strip().split(','):
14
23
  key, value = map(str.strip, arg.split('=', 1)) # Use maxsplit=1 to handle multiple '='
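
Taken together, the action now accepts either a JSON string or the legacy comma-separated `key=value` form, trying JSON first. A minimal standalone sketch of that parsing order (an illustrative helper, not the real `ParseStrArgsAction`; the type coercion applied to `key=value` values further down is omitted here):

```python
import json


def parse_str_args(values: str) -> dict:
    """Illustrative re-implementation of the JSON-first, key=value-fallback logic."""
    try:
        return json.loads(values)  # e.g. '{"do_sample": true, "temperature": 0.6}'
    except (json.JSONDecodeError, ValueError):
        pass
    arg_dict = {}
    for arg in values.strip().split(','):  # e.g. 'do_sample=true,temperature=0.6'
        key, value = map(str.strip, arg.split('=', 1))
        arg_dict[key] = value  # values stay strings in this sketch
    return arg_dict


print(parse_str_args('{"do_sample": true, "temperature": 0.6}'))  # JSON path
print(parse_str_args('do_sample=true,temperature=0.6'))           # key=value fallback
```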
@@ -58,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
58
67
  parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
59
68
  parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
60
69
  choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
61
- parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
70
+ parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
62
71
  parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')
63
72
 
64
73
  # Cache and working directory arguments
@@ -67,6 +76,7 @@ def add_argument(parser: argparse.ArgumentParser):
67
76
  parser.add_argument('--work-dir', type=str, help='The root cache dir.')
68
77
 
69
78
  # Debug and runtime mode arguments
79
+ parser.add_argument('--ignore-errors', action='store_true', default=False, help='Ignore errors during evaluation.')
70
80
  parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
71
81
  parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
72
82
  parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
@@ -79,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
79
89
  parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
80
90
  parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
81
91
  parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
92
+ parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
82
93
  # yapf: enable
83
94
 
84
95
 
@@ -1,4 +1,5 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ import os
2
3
  import subprocess
3
4
  import tempfile
4
5
  from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
204
205
  model_d['meta_template'] = get_template(model_d['meta_template'])
205
206
 
206
207
  # set the 'abbr' as the 'path' if 'abbr' is not specified
207
- model_d['abbr'] = model_d['path']
208
+ model_d['abbr'] = os.path.basename(model_d['path'])
208
209
 
209
210
  model_config = ApiModelConfig(**model_d)
210
211
  models.append(asdict(model_config))
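
The practical effect of deriving `abbr` with `os.path.basename` is that a namespaced model id collapses to its final component, e.g. (illustrative value reused from the README example):

```python
import os

# 'abbr' now uses only the last path component instead of the full id.
print(os.path.basename('Qwen/Qwen3-0.6B'))  # -> 'Qwen3-0.6B'
```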
@@ -11,7 +11,9 @@ class ModelArguments:
11
11
  pooling_mode: Optional[str] = None
12
12
  max_seq_length: int = 512 # max sequence length
13
13
  # prompt for llm based model
14
- prompt: str = ''
14
+ prompt: Optional[str] = None
15
+ # prompts dictionary for different tasks, if prompt is not set
16
+ prompts: Optional[Dict[str, str]] = None
15
17
  # model kwargs
16
18
  model_kwargs: dict = field(default_factory=dict)
17
19
  # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
33
35
  'pooling_mode': self.pooling_mode,
34
36
  'max_seq_length': self.max_seq_length,
35
37
  'prompt': self.prompt,
38
+ 'prompts': self.prompts,
36
39
  'model_kwargs': self.model_kwargs,
37
40
  'config_kwargs': self.config_kwargs,
38
41
  'encode_kwargs': self.encode_kwargs,