evalscope 0.16.0.tar.gz → 0.16.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (501)
  1. {evalscope-0.16.0/evalscope.egg-info → evalscope-0.16.2}/PKG-INFO +16 -13
  2. {evalscope-0.16.0 → evalscope-0.16.2}/README.md +3 -0
  3. evalscope-0.16.2/evalscope/app/__init__.py +28 -0
  4. {evalscope-0.16.0/evalscope/report → evalscope-0.16.2/evalscope/app}/app.py +40 -30
  5. evalscope-0.16.2/evalscope/app/constants.py +21 -0
  6. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/arguments.py +2 -1
  7. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/backend_manager.py +2 -1
  8. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  9. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  10. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  11. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/embedding.py +77 -39
  12. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  13. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  14. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  15. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  16. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/arc_adapter.py +3 -0
  17. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  18. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  19. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/benchmark.py +2 -0
  20. evalscope-0.16.2/evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/data_adapter.py +99 -16
  26. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope-0.16.2/evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  28. evalscope-0.16.2/evalscope/benchmarks/docmath/utils.py +220 -0
  29. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/drop/drop_adapter.py +3 -0
  30. evalscope-0.16.2/evalscope/benchmarks/frames/frames_adapter.py +91 -0
  31. evalscope-0.16.2/evalscope/benchmarks/frames/utils.py +37 -0
  32. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  33. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  34. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  35. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  36. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  37. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  38. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  39. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  40. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  41. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  42. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  43. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  44. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  45. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  46. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/musr/musr_adapter.py +3 -0
  47. evalscope-0.16.2/evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  48. evalscope-0.16.2/evalscope/benchmarks/needle_haystack/utils.py +79 -0
  49. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  50. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/race_adapter.py +3 -0
  51. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  52. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  53. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  54. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  55. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  56. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/tool_bench/utils.py +5 -4
  57. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  58. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  59. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/utils.py +25 -0
  60. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  61. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_app.py +2 -2
  62. evalscope-0.16.2/evalscope/collections/__init__.py +35 -0
  63. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/evaluator.py +68 -34
  64. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/config.py +8 -2
  65. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/constants.py +1 -1
  66. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/evaluator.py +40 -28
  67. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/__init__.py +3 -1
  68. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  69. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/llm_judge.py +12 -5
  70. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/math_parser.py +1 -1
  71. evalscope-0.16.2/evalscope/metrics/t2v_metrics/__init__.py +52 -0
  72. {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models}/__init__.py +0 -0
  73. {evalscope-0.16.0/tests/rag → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models}/__init__.py +0 -0
  74. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/__init__.py +2 -0
  75. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/base_adapter.py +31 -27
  76. evalscope-0.16.2/evalscope/models/adapters/bfcl_adapter.py +244 -0
  77. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/server_adapter.py +80 -23
  78. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/custom/custom_model.py +0 -3
  79. evalscope-0.16.2/evalscope/models/custom/dummy_model.py +99 -0
  80. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/local_model.py +1 -1
  81. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/register.py +2 -1
  82. evalscope-0.16.2/evalscope/perf/__init__.py +0 -0
  83. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/arguments.py +4 -2
  84. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/benchmark.py +16 -12
  85. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/main.py +7 -0
  86. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/openai_api.py +2 -0
  87. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope-0.16.2/evalscope/perf/utils/__init__.py +0 -0
  89. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/benchmark_util.py +1 -1
  90. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/local_server.py +1 -0
  91. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/log_utils.py +12 -5
  92. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/rich_display.py +1 -1
  93. evalscope-0.16.2/evalscope/report/__init__.py +38 -0
  94. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/combinator.py +40 -6
  95. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/generator.py +33 -9
  96. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/report/utils.py +84 -4
  97. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/run.py +12 -0
  98. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/summarizer.py +1 -1
  99. evalscope-0.16.2/evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  100. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/io_utils.py +59 -2
  101. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/logger.py +1 -1
  102. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/utils.py +12 -0
  103. evalscope-0.16.2/evalscope/version.py +4 -0
  104. {evalscope-0.16.0 → evalscope-0.16.2/evalscope.egg-info}/PKG-INFO +16 -13
  105. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/SOURCES.txt +16 -2
  106. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/requires.txt +12 -12
  107. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/aigc.txt +1 -0
  108. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/framework.txt +2 -3
  109. evalscope-0.16.2/requirements/opencompass.txt +1 -0
  110. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/rag.txt +1 -1
  111. evalscope-0.16.2/requirements/vlmeval.txt +1 -0
  112. {evalscope-0.16.0 → evalscope-0.16.2}/tests/aigc/test_t2i.py +48 -11
  113. {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_all.py +14 -3
  114. {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_collection.py +6 -4
  115. {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/test_run.py +50 -25
  116. evalscope-0.16.2/tests/rag/__init__.py +0 -0
  117. {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_clip_benchmark.py +5 -1
  118. {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_mteb.py +51 -7
  119. evalscope-0.16.0/evalscope/collections/__init__.py +0 -3
  120. evalscope-0.16.0/evalscope/metrics/t2v_metrics/__init__.py +0 -66
  121. evalscope-0.16.0/evalscope/models/custom/dummy_model.py +0 -61
  122. evalscope-0.16.0/evalscope/report/__init__.py +0 -6
  123. evalscope-0.16.0/evalscope/version.py +0 -4
  124. evalscope-0.16.0/requirements/opencompass.txt +0 -1
  125. evalscope-0.16.0/requirements/vlmeval.txt +0 -1
  126. {evalscope-0.16.0 → evalscope-0.16.2}/LICENSE +0 -0
  127. {evalscope-0.16.0 → evalscope-0.16.2}/MANIFEST.in +0 -0
  128. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/__init__.py +0 -0
  129. evalscope-0.16.0/evalscope/report/app_arguments.py → evalscope-0.16.2/evalscope/app/arguments.py +0 -0
  130. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/__init__.py +0 -0
  131. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/base.py +0 -0
  132. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/__init__.py +0 -0
  133. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/api_meta_template.py +0 -0
  134. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/__init__.py +0 -0
  135. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/eval_api.py +0 -0
  136. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/opencompass/tasks/eval_datasets.py +0 -0
  137. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/__init__.py +0 -0
  138. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/backend_manager.py +0 -0
  139. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/__init__.py +0 -0
  140. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/arguments.py +0 -0
  141. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/task_template.py +0 -0
  142. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py +0 -0
  143. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +0 -0
  144. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -0
  145. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +0 -0
  146. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +0 -0
  147. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +0 -0
  148. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/__init__.py +0 -0
  149. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/base.py +0 -0
  150. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -0
  151. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -0
  152. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  153. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -0
  154. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -0
  155. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -0
  156. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -0
  157. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -0
  158. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/__init__.py +0 -0
  159. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/arguments.py +0 -0
  160. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -0
  161. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/task_template.py +0 -0
  162. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/__init__.py +0 -0
  163. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +0 -0
  164. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/build_transform.py +0 -0
  165. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +0 -0
  166. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -0
  167. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/__init__.py +0 -0
  168. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/clip.py +0 -0
  169. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/llm.py +0 -0
  170. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/rag_eval/utils/tools.py +0 -0
  171. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/vlm_eval_kit/__init__.py +0 -0
  172. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/backend/vlm_eval_kit/backend_manager.py +0 -0
  173. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/__init__.py +0 -0
  174. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/__init__.py +0 -0
  175. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  176. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/base.py +0 -0
  177. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -0
  178. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -0
  179. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -0
  180. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -0
  181. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/aime/__init__.py +0 -0
  182. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  183. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/__init__.py +0 -0
  184. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arc/ai2_arc.py +0 -0
  185. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/__init__.py +0 -0
  186. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/arena_hard/utils.py +0 -0
  187. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/__init__.py +0 -0
  188. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +0 -0
  189. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +0 -0
  190. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +0 -0
  191. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +0 -0
  192. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +0 -0
  193. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +0 -0
  194. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +0 -0
  195. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +0 -0
  196. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +0 -0
  197. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +0 -0
  198. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +0 -0
  199. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +0 -0
  200. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +0 -0
  201. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/navigate.txt +0 -0
  202. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +0 -0
  203. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +0 -0
  204. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +0 -0
  205. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +0 -0
  206. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +0 -0
  207. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/snarks.txt +0 -0
  208. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +0 -0
  209. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +0 -0
  210. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +0 -0
  211. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +0 -0
  212. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +0 -0
  213. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +0 -0
  214. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +0 -0
  215. {evalscope-0.16.0/evalscope/benchmarks/chinese_simple_qa → evalscope-0.16.2/evalscope/benchmarks/bfcl}/__init__.py +0 -0
  216. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/__init__.py +0 -0
  217. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ceval/ceval_exam.py +0 -0
  218. {evalscope-0.16.0/evalscope/benchmarks/data_collection → evalscope-0.16.2/evalscope/benchmarks/chinese_simple_qa}/__init__.py +0 -0
  219. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/__init__.py +0 -0
  220. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/cmmlu.py +0 -0
  221. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/cmmlu/samples.jsonl +0 -0
  222. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/__init__.py +0 -0
  223. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/competition_math/competition_math.py +0 -0
  224. {evalscope-0.16.0/evalscope/benchmarks/drop → evalscope-0.16.2/evalscope/benchmarks/data_collection}/__init__.py +0 -0
  225. {evalscope-0.16.0/evalscope/benchmarks/general_mcq → evalscope-0.16.2/evalscope/benchmarks/docmath}/__init__.py +0 -0
  226. {evalscope-0.16.0/evalscope/benchmarks/gpqa → evalscope-0.16.2/evalscope/benchmarks/drop}/__init__.py +0 -0
  227. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/drop/utils.py +0 -0
  228. {evalscope-0.16.0/evalscope/benchmarks/ifeval → evalscope-0.16.2/evalscope/benchmarks/frames}/__init__.py +0 -0
  229. {evalscope-0.16.0/evalscope/benchmarks/iquiz → evalscope-0.16.2/evalscope/benchmarks/general_mcq}/__init__.py +0 -0
  230. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/general_qa/__init__.py +0 -0
  231. {evalscope-0.16.0/evalscope/benchmarks/live_code_bench → evalscope-0.16.2/evalscope/benchmarks/gpqa}/__init__.py +0 -0
  232. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gpqa/chain_of_thought.txt +0 -0
  233. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/__init__.py +0 -0
  234. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/gsm8k/gsm8k.py +0 -0
  235. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/__init__.py +0 -0
  236. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/hellaswag/hellaswag.py +0 -0
  237. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/__init__.py +0 -0
  238. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/humaneval/humaneval.py +0 -0
  239. {evalscope-0.16.0/evalscope/benchmarks/maritime_bench → evalscope-0.16.2/evalscope/benchmarks/ifeval}/__init__.py +0 -0
  240. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions.py +0 -0
  241. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions_registry.py +0 -0
  242. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/instructions_util.py +0 -0
  243. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/ifeval/utils.py +0 -0
  244. {evalscope-0.16.0/evalscope/benchmarks/math_500 → evalscope-0.16.2/evalscope/benchmarks/iquiz}/__init__.py +0 -0
  245. {evalscope-0.16.0/evalscope/benchmarks/mmlu_pro → evalscope-0.16.2/evalscope/benchmarks/live_code_bench}/__init__.py +0 -0
  246. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/evaluate_utils.py +0 -0
  247. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/extract_utils.py +0 -0
  248. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/load_utils.py +0 -0
  249. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/pass_k_utils.py +0 -0
  250. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/prompts.py +0 -0
  251. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/live_code_bench/testing_util.py +0 -0
  252. {evalscope-0.16.0/evalscope/benchmarks/mmlu_redux → evalscope-0.16.2/evalscope/benchmarks/maritime_bench}/__init__.py +0 -0
  253. {evalscope-0.16.0/evalscope/benchmarks/musr → evalscope-0.16.2/evalscope/benchmarks/math_500}/__init__.py +0 -0
  254. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/__init__.py +0 -0
  255. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/mmlu.py +0 -0
  256. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/mmlu/samples.jsonl +0 -0
  257. {evalscope-0.16.0/evalscope/benchmarks/process_bench → evalscope-0.16.2/evalscope/benchmarks/mmlu_pro}/__init__.py +0 -0
  258. {evalscope-0.16.0/evalscope/benchmarks/simple_qa → evalscope-0.16.2/evalscope/benchmarks/mmlu_redux}/__init__.py +0 -0
  259. {evalscope-0.16.0/evalscope/benchmarks/super_gpqa → evalscope-0.16.2/evalscope/benchmarks/musr}/__init__.py +0 -0
  260. {evalscope-0.16.0/evalscope/benchmarks/tool_bench → evalscope-0.16.2/evalscope/benchmarks/needle_haystack}/__init__.py +0 -0
  261. {evalscope-0.16.0/evalscope/benchmarks/winogrande → evalscope-0.16.2/evalscope/benchmarks/process_bench}/__init__.py +0 -0
  262. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/process_bench/critique_template.txt +0 -0
  263. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/__init__.py +0 -0
  264. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/race.py +0 -0
  265. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/race/samples.jsonl +0 -0
  266. {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models → evalscope-0.16.2/evalscope/benchmarks/simple_qa}/__init__.py +0 -0
  267. {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model → evalscope-0.16.2/evalscope/benchmarks/super_gpqa}/__init__.py +0 -0
  268. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/super_gpqa/utils.py +0 -0
  269. {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward → evalscope-0.16.2/evalscope/benchmarks/tool_bench}/__init__.py +0 -0
  270. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/__init__.py +0 -0
  271. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/samples.jsonl +0 -0
  272. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -0
  273. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/__init__.py +0 -0
  274. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -0
  275. {evalscope-0.16.0/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5 → evalscope-0.16.2/evalscope/benchmarks/winogrande}/__init__.py +0 -0
  276. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/__init__.py +0 -0
  277. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/base.py +0 -0
  278. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/cli.py +0 -0
  279. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_eval.py +0 -0
  280. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_perf.py +0 -0
  281. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/cli/start_server.py +0 -0
  282. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/sampler.py +0 -0
  283. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/collections/schema.py +0 -0
  284. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/__init__.py +0 -0
  285. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/rating_eval.py +0 -0
  286. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/reviewer/__init__.py +0 -0
  287. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/evaluator/reviewer/auto_reviewer.py +0 -0
  288. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/bundled_rouge_score/__init__.py +0 -0
  289. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/metrics.py +0 -0
  290. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/named_metrics.py +0 -0
  291. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/rouge_metric.py +0 -0
  292. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/clipscore.py +0 -0
  293. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/constants.py +0 -0
  294. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/itmscore.py +0 -0
  295. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +0 -0
  296. {evalscope-0.16.0/evalscope/perf → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model}/__init__.py +0 -0
  297. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +0 -0
  298. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +0 -0
  299. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +0 -0
  300. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +0 -0
  301. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +0 -0
  302. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +0 -0
  303. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +0 -0
  304. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +0 -0
  305. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +0 -0
  306. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +0 -0
  307. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +0 -0
  308. {evalscope-0.16.0/evalscope/perf/utils → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward}/__init__.py +0 -0
  309. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +0 -0
  310. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +0 -0
  311. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/model.py +0 -0
  312. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/utils.py +0 -0
  313. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +0 -0
  314. {evalscope-0.16.0/evalscope/third_party/thinkbench/tools → evalscope-0.16.2/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5}/__init__.py +0 -0
  315. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +0 -0
  316. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +0 -0
  317. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +0 -0
  318. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +0 -0
  319. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +0 -0
  320. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +0 -0
  321. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +0 -0
  322. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +0 -0
  323. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  324. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  325. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  326. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  327. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  328. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  329. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  330. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  331. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  332. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  333. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +0 -0
  334. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +0 -0
  335. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +0 -0
  336. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +0 -0
  337. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +0 -0
  338. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +0 -0
  339. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +0 -0
  340. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +0 -0
  341. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +0 -0
  342. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +0 -0
  343. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +0 -0
  344. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +0 -0
  345. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +0 -0
  346. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +0 -0
  347. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +0 -0
  348. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +0 -0
  349. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +0 -0
  350. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +0 -0
  351. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +0 -0
  352. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +0 -0
  353. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +0 -0
  354. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +0 -0
  355. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +0 -0
  356. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +0 -0
  357. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +0 -0
  358. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +0 -0
  359. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +0 -0
  360. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +0 -0
  361. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +0 -0
  362. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +0 -0
  363. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +0 -0
  364. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +0 -0
  365. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +0 -0
  366. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +0 -0
  367. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +0 -0
  368. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +0 -0
  369. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +0 -0
  370. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +0 -0
  371. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +0 -0
  372. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +0 -0
  373. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +0 -0
  374. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +0 -0
  375. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +0 -0
  376. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +0 -0
  377. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +0 -0
  378. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +0 -0
  379. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +0 -0
  380. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +0 -0
  381. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +0 -0
  382. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +0 -0
  383. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +0 -0
  384. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +0 -0
  385. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +0 -0
  386. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +0 -0
  387. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +0 -0
  388. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/score.py +0 -0
  389. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/metrics/t2v_metrics/vqascore.py +0 -0
  390. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/__init__.py +0 -0
  391. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/chat_adapter.py +0 -0
  392. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/choice_adapter.py +0 -0
  393. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/custom_adapter.py +0 -0
  394. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/adapters/t2i_adapter.py +0 -0
  395. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/custom/__init__.py +0 -0
  396. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/models/model.py +0 -0
  397. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/http_client.py +0 -0
  398. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/__init__.py +0 -0
  399. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/__init__.py +0 -0
  400. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/base.py +0 -0
  401. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/custom_api.py +0 -0
  402. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/api/dashscope_api.py +0 -0
  403. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/__init__.py +0 -0
  404. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/base.py +0 -0
  405. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/flickr8k.py +0 -0
  406. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/line_by_line.py +0 -0
  407. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/longalpaca.py +0 -0
  408. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/openqa.py +0 -0
  409. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/random_dataset.py +0 -0
  410. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/datasets/speed_benchmark.py +0 -0
  411. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/plugin/registry.py +0 -0
  412. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/analysis_result.py +0 -0
  413. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/db_util.py +0 -0
  414. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/perf/utils/handler.py +0 -0
  415. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/__init__.py +0 -0
  416. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_arena.yaml +0 -0
  417. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_arena_zhihu.yaml +0 -0
  418. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -0
  419. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/config/cfg_single.yaml +0 -0
  420. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -0
  421. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -0
  422. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/qa_browser/battle.jsonl +0 -0
  423. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/qa_browser/category_mapping.yaml +0 -0
  424. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/data/question.jsonl +0 -0
  425. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/arc.yaml +0 -0
  426. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/bbh.yaml +0 -0
  427. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/bbh_mini.yaml +0 -0
  428. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/ceval.yaml +0 -0
  429. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/ceval_mini.yaml +0 -0
  430. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/cmmlu.yaml +0 -0
  431. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -0
  432. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/general_qa.yaml +0 -0
  433. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/gsm8k.yaml +0 -0
  434. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/mmlu.yaml +0 -0
  435. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/registry/tasks/mmlu_mini.yaml +0 -0
  436. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/run_arena.py +0 -0
  437. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/__init__.py +0 -0
  438. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/README.md +0 -0
  439. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/__init__.py +0 -0
  440. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/default_task.json +0 -0
  441. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/default_task.yaml +0 -0
  442. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/eval.py +0 -0
  443. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/infer.py +0 -0
  444. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/longbench_write.py +0 -0
  445. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/__init__.py +0 -0
  446. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/judge.txt +0 -0
  447. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longbench_write.jsonl +0 -0
  448. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +0 -0
  449. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +0 -0
  450. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/__init__.py +0 -0
  451. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/data_etl.py +0 -0
  452. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/tools/openai_api.py +0 -0
  453. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/longbench_write/utils.py +0 -0
  454. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/__init__.py +0 -0
  455. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/eval.py +0 -0
  456. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/infer.py +0 -0
  457. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/resources/critique_template.txt +0 -0
  458. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/resources/reformat_template.txt +0 -0
  459. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/tools/llm.py +0 -0
  460. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/thinkbench/tools/utils.py +0 -0
  461. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/README.md +0 -0
  462. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/__init__.py +0 -0
  463. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/config_default.json +0 -0
  464. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/config_default.yaml +0 -0
  465. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/eval.py +0 -0
  466. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/infer.py +0 -0
  467. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/llm/__init__.py +0 -0
  468. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -0
  469. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/requirements.txt +0 -0
  470. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/third_party/toolbench_static/toolbench_static.py +0 -0
  471. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/__init__.py +0 -0
  472. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/arena_utils.py +0 -0
  473. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/chat_service.py +0 -0
  474. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/completion_parsers.py +0 -0
  475. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/deprecation_utils.py +0 -0
  476. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/filters.py +0 -0
  477. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/import_utils.py +0 -0
  478. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope/utils/model_utils.py +0 -0
  479. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/dependency_links.txt +0 -0
  480. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/entry_points.txt +0 -0
  481. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/not-zip-safe +0 -0
  482. {evalscope-0.16.0 → evalscope-0.16.2}/evalscope.egg-info/top_level.txt +0 -0
  483. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/app.txt +0 -0
  484. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/docs.txt +0 -0
  485. {evalscope-0.16.0 → evalscope-0.16.2}/requirements/perf.txt +0 -0
  486. {evalscope-0.16.0 → evalscope-0.16.2}/requirements.txt +0 -0
  487. {evalscope-0.16.0 → evalscope-0.16.2}/setup.cfg +0 -0
  488. {evalscope-0.16.0 → evalscope-0.16.2}/setup.py +0 -0
  489. {evalscope-0.16.0 → evalscope-0.16.2}/tests/__init__.py +0 -0
  490. {evalscope-0.16.0 → evalscope-0.16.2}/tests/aigc/__init__.py +0 -0
  491. {evalscope-0.16.0 → evalscope-0.16.2}/tests/cli/__init__.py +0 -0
  492. {evalscope-0.16.0 → evalscope-0.16.2}/tests/perf/__init__.py +0 -0
  493. {evalscope-0.16.0 → evalscope-0.16.2}/tests/perf/test_perf.py +0 -0
  494. {evalscope-0.16.0 → evalscope-0.16.2}/tests/rag/test_ragas.py +0 -0
  495. {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/__init__.py +0 -0
  496. {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_eval.py +0 -0
  497. {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_vlm_eval.py +0 -0
  498. {evalscope-0.16.0 → evalscope-0.16.2}/tests/swift/test_run_swift_vlm_jugde_eval.py +0 -0
  499. {evalscope-0.16.0 → evalscope-0.16.2}/tests/test_run_all.py +0 -0
  500. {evalscope-0.16.0 → evalscope-0.16.2}/tests/vlm/__init__.py +0 -0
  501. {evalscope-0.16.0 → evalscope-0.16.2}/tests/vlm/test_vlmeval.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.16.0
3
+ Version: 0.16.2
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -17,12 +17,12 @@ Requires-Python: >=3.8
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: accelerate
20
- Requires-Dist: datasets<=3.2.0,>=3.0.0
20
+ Requires-Dist: datasets>=3.0
21
21
  Requires-Dist: immutabledict
22
22
  Requires-Dist: jieba
23
23
  Requires-Dist: jsonlines
24
24
  Requires-Dist: langdetect
25
- Requires-Dist: latex2sympy2
25
+ Requires-Dist: latex2sympy2_extended
26
26
  Requires-Dist: matplotlib
27
27
  Requires-Dist: modelscope[framework]
28
28
  Requires-Dist: nltk>=3.9
@@ -40,20 +40,19 @@ Requires-Dist: seaborn
40
40
  Requires-Dist: sympy
41
41
  Requires-Dist: tabulate
42
42
  Requires-Dist: torch
43
- Requires-Dist: torchvision
44
43
  Requires-Dist: tqdm
45
44
  Requires-Dist: transformers>=4.33
46
45
  Requires-Dist: word2number
47
46
  Provides-Extra: opencompass
48
- Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
47
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
49
48
  Provides-Extra: vlmeval
50
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
49
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
51
50
  Provides-Extra: rag
52
51
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
53
52
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
54
53
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
55
54
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
56
- Requires-Dist: mteb==1.19.4; extra == "rag"
55
+ Requires-Dist: mteb==1.38.20; extra == "rag"
57
56
  Requires-Dist: ragas==0.2.14; extra == "rag"
58
57
  Requires-Dist: webdataset>0.2.0; extra == "rag"
59
58
  Provides-Extra: perf
@@ -73,14 +72,15 @@ Requires-Dist: iopath; extra == "aigc"
73
72
  Requires-Dist: omegaconf; extra == "aigc"
74
73
  Requires-Dist: open_clip_torch; extra == "aigc"
75
74
  Requires-Dist: opencv-python; extra == "aigc"
75
+ Requires-Dist: torchvision; extra == "aigc"
76
76
  Provides-Extra: all
77
77
  Requires-Dist: accelerate; extra == "all"
78
- Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
78
+ Requires-Dist: datasets>=3.0; extra == "all"
79
79
  Requires-Dist: immutabledict; extra == "all"
80
80
  Requires-Dist: jieba; extra == "all"
81
81
  Requires-Dist: jsonlines; extra == "all"
82
82
  Requires-Dist: langdetect; extra == "all"
83
- Requires-Dist: latex2sympy2; extra == "all"
83
+ Requires-Dist: latex2sympy2_extended; extra == "all"
84
84
  Requires-Dist: matplotlib; extra == "all"
85
85
  Requires-Dist: modelscope[framework]; extra == "all"
86
86
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -98,17 +98,16 @@ Requires-Dist: seaborn; extra == "all"
98
98
  Requires-Dist: sympy; extra == "all"
99
99
  Requires-Dist: tabulate; extra == "all"
100
100
  Requires-Dist: torch; extra == "all"
101
- Requires-Dist: torchvision; extra == "all"
102
101
  Requires-Dist: tqdm; extra == "all"
103
102
  Requires-Dist: transformers>=4.33; extra == "all"
104
103
  Requires-Dist: word2number; extra == "all"
105
- Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
106
- Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
104
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
105
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
107
106
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
108
107
  Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
109
108
  Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
110
109
  Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
111
- Requires-Dist: mteb==1.19.4; extra == "all"
110
+ Requires-Dist: mteb==1.38.20; extra == "all"
112
111
  Requires-Dist: ragas==0.2.14; extra == "all"
113
112
  Requires-Dist: webdataset>0.2.0; extra == "all"
114
113
  Requires-Dist: aiohttp; extra == "all"
@@ -125,6 +124,7 @@ Requires-Dist: iopath; extra == "all"
125
124
  Requires-Dist: omegaconf; extra == "all"
126
125
  Requires-Dist: open_clip_torch; extra == "all"
127
126
  Requires-Dist: opencv-python; extra == "all"
127
+ Requires-Dist: torchvision; extra == "all"
128
128
 
129
129
  <p align="center">
130
130
  <br>
@@ -230,6 +230,9 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+ - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
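For orientation, the newly announced benchmarks are selected the same way as existing ones. A minimal sketch, assuming the usual `TaskConfig`/`run_task` API and assuming the registry names of the new datasets follow their directory names (`needle_haystack` is the name quoted above; `docmath` and `frames` are assumptions, as is the model identifier):

```python
from evalscope import TaskConfig, run_task

# Hedged sketch: the model ID and all dataset names except 'needle_haystack'
# are illustrative assumptions, not taken from this release diff.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['needle_haystack', 'docmath', 'frames'],
    limit=5,  # small sample for a quick smoke test
)
run_task(task_cfg=task_cfg)
```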
@@ -102,6 +102,9 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
+ - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
+ - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
  - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
  - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
@@ -0,0 +1,28 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING
+
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+     from .app import create_app
+     from .arguments import add_argument
+
+ else:
+     _import_structure = {
+         'app': [
+             'create_app',
+         ],
+         'arguments': [
+             'add_argument',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
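The new `evalscope/app/__init__.py` above uses the `_LazyModule` pattern, so importing the package does not pull in the Gradio app until an attribute is first accessed. A minimal sketch of the observable behaviour (assuming the package is installed with its app dependencies):

```python
import evalscope.app as app

# The submodule evalscope.app.app (and its gradio/plotly imports) is loaded
# lazily, only when 'create_app' is first accessed, not at import time.
create_app = app.create_app
```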
@@ -1,6 +1,7 @@
  import argparse
  import glob
  import gradio as gr
+ import json
  import numpy as np
  import os
  import pandas as pd
@@ -11,35 +12,15 @@ from dataclasses import dataclass
  from typing import Any, List, Union

  from evalscope.constants import DataCollection
- from evalscope.report import Report, ReportKey, add_argument, get_data_frame, get_report_list
+ from evalscope.report import Report, ReportKey, get_data_frame, get_report_list
  from evalscope.utils.io_utils import OutputsStructure, yaml_to_dict
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.version import __version__
+ from .arguments import add_argument
+ from .constants import DATASET_TOKEN, LATEX_DELIMITERS, MODEL_TOKEN, PLOTLY_THEME, REPORT_TOKEN

  logger = get_logger()

- PLOTLY_THEME = 'plotly_dark'
- REPORT_TOKEN = '@@'
- MODEL_TOKEN = '::'
- DATASET_TOKEN = ', '
- LATEX_DELIMITERS = [{
-     'left': '$$',
-     'right': '$$',
-     'display': True
- }, {
-     'left': '$',
-     'right': '$',
-     'display': False
- }, {
-     'left': '\\(',
-     'right': '\\)',
-     'display': False
- }, {
-     'left': '\\[',
-     'right': '\\]',
-     'display': True
- }]
-

  def scan_for_report_folders(root_path):
      """Scan for folders containing reports subdirectories"""
@@ -155,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):

  def plot_single_report_sunburst(report_list: List[Report]):
      if report_list[0].name == DataCollection.NAME:
-         df = get_data_frame(report_list)
+         df = get_data_frame(report_list=report_list)
          categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
          path = categories + [ReportKey.subset_name]
      else:
-         df = get_data_frame(report_list, flatten_metrics=False)
+         df = get_data_frame(report_list=report_list, flatten_metrics=False)
          categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
          path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
      logger.debug(f'df: {df}')
@@ -185,6 +166,13 @@ def get_single_dataset_df(df: pd.DataFrame, dataset_name: str):
      return df, styler


+ def get_report_analysis(report_list: List[Report], dataset_name: str) -> str:
+     for report in report_list:
+         if report.dataset_name == dataset_name:
+             return report.analysis
+     return 'N/A'
+
+
  def plot_single_dataset_scores(df: pd.DataFrame):
      # TODO: add metric radio and relace category name
      plot = px.bar(
@@ -246,7 +234,7 @@ def convert_html_tags(text):
  def process_string(string: str, max_length: int = 2048) -> str:
      string = convert_html_tags(string) # for display labels e.g.
      if max_length and len(string) > max_length:
-         return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+         return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
      return string


@@ -270,7 +258,7 @@ def dict_to_markdown(data) -> str:
      return '\n\n'.join(markdown_lines)


- def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
      """
      Process model prediction output into a formatted string.

@@ -294,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
      return result


+ def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+     if isinstance(item, (dict, list)):
+         result = json.dumps(item, ensure_ascii=False, indent=2)
+         result = f'```json\n{result}\n```'
+     else:
+         result = str(item)
+
+     # Apply HTML tag conversion and truncation only at the final output
+     if max_length is not None:
+         return process_string(result, max_length)
+
+     return result
+
+
  def normalize_score(score):
      try:
          if isinstance(score, bool):
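A usage sketch of the rewritten `process_model_prediction` above, using only the behaviour defined in the hunk: dict and list predictions are pretty-printed as fenced JSON, everything else is stringified, and truncation is applied once on the final string.

```python
pred = {'answer': 'B', 'score': 0.87}
text = process_model_prediction(pred)
# 'text' holds the dict serialized via json.dumps(..., ensure_ascii=False, indent=2)
# wrapped in a fenced json code block; if it exceeded max_length (default 4096),
# process_string would cut the middle and insert the '...[truncate]...' marker.
```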
@@ -456,6 +458,10 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
            'zh': '数据集分数',
            'en': 'Dataset Scores'
        },
+       'report_analysis': {
+           'zh': '报告智能分析',
+           'en': 'Report Intelligent Analysis'
+       },
        'dataset_scores_table': {
            'zh': '数据集分数表',
            'en': 'Dataset Scores Table'
@@ -511,6 +517,9 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
        with gr.Tab(locale_dict['dataset_details'][lang]):
            dataset_radio = gr.Radio(
                label=locale_dict['select_dataset'][lang], choices=[], show_label=True, interactive=True)
+           # show dataset details
+           with gr.Accordion(locale_dict['report_analysis'][lang], open=True):
+               report_analysis = gr.Markdown(value='N/A', show_copy_button=True)
            gr.Markdown(f'### {locale_dict["dataset_scores"][lang]}')
            dataset_plot = gr.Plot(value=None, scale=1, label=locale_dict['dataset_scores'][lang])
            gr.Markdown(f'### {locale_dict["dataset_scores_table"][lang]}')
@@ -586,15 +595,16 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
        @gr.on(
            triggers=[dataset_radio.change, report_list.change],
            inputs=[dataset_radio, report_list],
-           outputs=[dataset_plot, dataset_table, subset_select, data_review_df])
+           outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
        def update_single_report_dataset(dataset_name, report_list):
            logger.debug(f'Updating single report dataset: {dataset_name}')
-           report_df = get_data_frame(report_list)
+           report_df = get_data_frame(report_list=report_list)
+           analysis = get_report_analysis(report_list, dataset_name)
            data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
            data_score_plot = plot_single_dataset_scores(data_score_df)
            subsets = data_score_df[ReportKey.subset_name].unique().tolist()
            logger.debug(f'subsets: {subsets}')
-           return data_score_plot, styler, gr.update(choices=subsets, value=None), None
+           return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis

        @gr.on(
            triggers=[subset_select.change],
@@ -0,0 +1,21 @@
+ PLOTLY_THEME = 'plotly_dark'
+ REPORT_TOKEN = '@@'
+ MODEL_TOKEN = '::'
+ DATASET_TOKEN = ', '
+ LATEX_DELIMITERS = [{
+     'left': '$$',
+     'right': '$$',
+     'display': True
+ }, {
+     'left': '$',
+     'right': '$',
+     'display': False
+ }, {
+     'left': '\\(',
+     'right': '\\)',
+     'display': False
+ }, {
+     'left': '\\[',
+     'right': '\\]',
+     'display': True
+ }]
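These constants have the shape Gradio expects for its `latex_delimiters` argument (a list of `{'left', 'right', 'display'}` dicts); presumably that is how `app.py` consumes them when rendering predictions. A hedged sketch under that assumption:

```python
import gradio as gr

from evalscope.app.constants import LATEX_DELIMITERS

# Assumption: the delimiters are passed to a Markdown component so both inline
# ($...$, \( ... \)) and display ($$...$$, \[ ... \]) math render correctly.
md = gr.Markdown(value='Euler: $e^{i\\pi} + 1 = 0$', latex_delimiters=LATEX_DELIMITERS)
```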
@@ -67,7 +67,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
      parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
                          choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.REVIEW])
-     parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')
+     parser.add_argument('--limit', type=float, default=None, help='Max evaluation samples num for each subset.')
      parser.add_argument('--eval-batch-size', type=int, default=1, help='The batch size for evaluation.')

      # Cache and working directory arguments
@@ -89,6 +89,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
      parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
      parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
+     parser.add_argument('--analysis-report', action='store_true', default=False, help='Generate analysis report for the evaluation results using judge model.') # noqa: E501
      # yapf: enable
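The practical effect of the two argument changes above: `--limit` now parses as a float, so fractional values are accepted alongside integers, and `--analysis-report` is a new boolean switch. A standalone sketch (not evalscope's actual parser) that shows only the parsing behaviour:

```python
import argparse

# Minimal, self-contained parser mirroring just the two flags touched here.
parser = argparse.ArgumentParser()
parser.add_argument('--limit', type=float, default=None)
parser.add_argument('--analysis-report', action='store_true', default=False)

args = parser.parse_args(['--limit', '0.5', '--analysis-report'])
print(args.limit, args.analysis_report)  # -> 0.5 True
```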
@@ -1,4 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
  import subprocess
  import tempfile
  from dataclasses import asdict
@@ -204,7 +205,7 @@ class OpenCompassBackendManager(BackendManager):
              model_d['meta_template'] = get_template(model_d['meta_template'])

          # set the 'abbr' as the 'path' if 'abbr' is not specified
-         model_d['abbr'] = model_d['path']
+         model_d['abbr'] = os.path.basename(model_d['path'])

          model_config = ApiModelConfig(**model_d)
          models.append(asdict(model_config))
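The `abbr` change above shortens the OpenCompass model label from the full model path or ID to its last path component, which relies only on standard-library behaviour:

```python
import os

# Illustrative inputs; any hub ID or local path behaves the same way.
print(os.path.basename('Qwen/Qwen2.5-7B-Instruct'))  # -> Qwen2.5-7B-Instruct
print(os.path.basename('/models/my-finetune'))       # -> my-finetune
```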
@@ -1,4 +1,5 @@
  import os
+ import posixpath # For URL path handling
  import torch
  from torch.utils.data import DataLoader
  from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca

      Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
      """
+     import requests
      import webdataset as wds

      def read_txt(fname):
-         if '://' in fname:
-             stream = os.popen("curl -L -s --fail '%s'" % fname, 'r')
-             value = stream.read()
-             if stream.close():
-                 raise FileNotFoundError('Failed to retreive data')
+         if fname.startswith(('http://', 'https://')):
+             try:
+                 response = requests.get(fname)
+                 response.raise_for_status() # Ensure the HTTP request was successful
+                 return response.text
+             except requests.exceptions.RequestException as e:
+                 raise FileNotFoundError(f'Failed to read {fname}: {e}')
          else:
              with open(fname, 'r') as file:
-                 value = file.read()
-         return value
+                 return file.read()
+
+     def url_path_join(*parts):
+         """Join URL path parts with forward slashes regardless of platform"""
+         return posixpath.join(*parts)

      if not data_dir:
          data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'

      # Git LFS files have a different file path to access the raw data than other files
-     if data_dir.startswith('https://modelscope.cn/datasets'):
+     is_url = data_dir.startswith(('http://', 'https://'))
+     if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
          *split_url_head, _, url_path = data_dir.split('/', 7)
          url_head = '/'.join(split_url_head)
          metadata_dir = '/'.join([url_head, 'resolve', url_path])
          tardata_dir = '/'.join([url_head, 'resolve', url_path])
      else:
          metadata_dir = tardata_dir = data_dir
+
+     # Use appropriate path joining function based on whether we're dealing with a URL
+     path_join = url_path_join if is_url else os.path.join
+
      # Get number of shards
-     nshards_fname = os.path.join(metadata_dir, split, 'nshards.txt')
+     nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
      nshards = int(read_txt(nshards_fname)) # Do not catch FileNotFound, nshards.txt should be mandatory

      # Get dataset type (classification or retrieval)
-     type_fname = os.path.join(metadata_dir, 'dataset_type.txt')
+     type_fname = path_join(metadata_dir, 'dataset_type.txt')
      try:
          dataset_type = read_txt(type_fname).strip().lower()
      except FileNotFoundError:
          dataset_type = 'classification'

-     filepattern = os.path.join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
+     filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
      # Load webdataset (support WEBP, PNG, and JPG for now)
      if not cache_dir or not isinstance(cache_dir, str):
          cache_dir = None
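The switch to `posixpath.join` above matters because `os.path.join` would insert a backslash on Windows and corrupt remote shard URLs, whereas `posixpath.join` always joins with forward slashes. A small illustration (the dataset name is hypothetical):

```python
import posixpath

base = 'https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master'
print(posixpath.join(base, 'test', 'nshards.txt'))
# -> https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master/test/nshards.txt
```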
@@ -11,7 +11,9 @@ class ModelArguments:
      pooling_mode: Optional[str] = None
      max_seq_length: int = 512 # max sequence length
      # prompt for llm based model
-     prompt: str = ''
+     prompt: Optional[str] = None
+     # prompts dictionary for different tasks, if prompt is not set
+     prompts: Optional[Dict[str, str]] = None
      # model kwargs
      model_kwargs: dict = field(default_factory=dict)
      # config kwargs
@@ -33,6 +35,7 @@ class ModelArguments:
            'pooling_mode': self.pooling_mode,
            'max_seq_length': self.max_seq_length,
            'prompt': self.prompt,
+           'prompts': self.prompts,
            'model_kwargs': self.model_kwargs,
            'config_kwargs': self.config_kwargs,
            'encode_kwargs': self.encode_kwargs,
@@ -1,6 +1,6 @@
  import mteb
  import os
- from mteb.task_selection import results_to_dataframe
+ from tabulate import tabulate

  from evalscope.backend.rag_eval import EmbeddingModel, cmteb
  from evalscope.utils.logger import get_logger
@@ -12,14 +12,27 @@ def show_results(output_folder, model, results):
      model_name = model.mteb_model_meta.model_name_as_path()
      revision = model.mteb_model_meta.revision

-     results_df = results_to_dataframe({model_name: {revision: results}})
+     data = []
+     for model_res in results:
+         main_res = model_res.only_main_score()
+         for split, score in main_res.scores.items():
+             for sub_score in score:
+                 data.append({
+                     'Model': model_name.replace('eval__', ''),
+                     'Revision': revision,
+                     'Task Type': main_res.task_type,
+                     'Task': main_res.task_name,
+                     'Split': split,
+                     'Subset': sub_score['hf_subset'],
+                     'Main Score': sub_score['main_score'],
+                 })

      save_path = os.path.join(
          output_folder,
          model_name,
          revision,
      )
-     logger.info(f'Evaluation results:\n{results_df.to_markdown()}')
+     logger.info(f'Evaluation results:\n{tabulate(data, headers="keys", tablefmt="grid")}')
      logger.info(f'Evaluation results saved in {os.path.abspath(save_path)}')
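The replacement for `results_to_dataframe` above builds a plain list of dicts and prints it with `tabulate`; passing `headers='keys'` turns each dict key into a column. A small self-contained example with illustrative rows:

```python
from tabulate import tabulate

rows = [
    {'Task': 'TNews', 'Split': 'validation', 'Main Score': 0.51},
    {'Task': 'TNews', 'Split': 'test', 'Main Score': 0.49},
]
print(tabulate(rows, headers='keys', tablefmt='grid'))
```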
@@ -34,6 +47,7 @@ def one_stage_eval(
      tasks = cmteb.TaskBase.get_tasks(task_names=eval_args['tasks'], dataset_path=custom_dataset_path)
      evaluation = mteb.MTEB(tasks=tasks)

+     eval_args['encode_kwargs'] = model_args.get('encode_kwargs', {})
      # run evaluation
      results = evaluation.run(model, **eval_args)

@@ -66,6 +80,7 @@ def two_stage_eval(
          overwrite_results=True,
          hub=eval_args['hub'],
          limits=eval_args['limits'],
+         encode_kwargs=model1_args.get('encode_kwargs', {}),
      )
      # stage 2: run cross encoder
      results = evaluation.run(
@@ -77,6 +92,7 @@
          overwrite_results=True,
          hub=eval_args['hub'],
          limits=eval_args['limits'],
+         encode_kwargs=model2_args.get('encode_kwargs', {}),
      )

      # save and log results