evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (82) hide show
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,13 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
2
  evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
3
3
  evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
4
- evalscope/constants.py,sha256=PHnsGndB4N5-jvmawPxMK5b9geE2Es5cUe8ZKYSuKgM,4016
5
- evalscope/run.py,sha256=saHZGlwbBLYtFk4BmKkjQEOOHQQ-pDKzN21taao6Os0,6957
4
+ evalscope/constants.py,sha256=1CYghe0fGccyiVgzMIHd2HIb6lOo9fmB-8pH_l99iI4,4014
5
+ evalscope/run.py,sha256=ss7ECL4dq18ur9qFOWqCNIsckXQWWl1EsVaJxDPBVq8,7000
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
- evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
8
- evalscope/version.py,sha256=vMuGTezikPNdTLYlejHdHznB5WhuHCnAhaOdw3iqU5E,119
7
+ evalscope/summarizer.py,sha256=nZOaXfaSaXht8GAVik_Pvz2YL0Gv24UG45mMklyBkvA,5938
8
+ evalscope/version.py,sha256=VHNGbQIK9g2FDZyk0Yk7RSDY_XsEEtvEBuN8kjAA8PM,119
9
9
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
10
- evalscope/app/app.py,sha256=sTYoc3Uag7DqYbb_qXo8QJX4oer8dueQK1wdgaLlTiY,29371
10
+ evalscope/app/app.py,sha256=QyO0RFfkLeOVzx-Mr8br3bYPwii2O_eVGmNgwCGHkac,29863
11
11
  evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
12
12
  evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
13
13
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLF
22
22
  evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
23
23
  evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
24
24
  evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
25
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
25
+ evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=2OdPj4gSUWdAGCfS9PHpPGbd6q5RqEyli2G6UGb1ffw,8888
26
26
  evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
27
27
  evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
@@ -53,36 +53,36 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
53
53
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
54
54
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
55
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
56
- evalscope/backend/rag_eval/utils/embedding.py,sha256=3CkLX6SXGAc6ltUQe4V_IcTr71cZSane5-VjaRYn13M,9466
56
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=64DQrGzB2sw_Y0twwlSmOYobpOfgmRBFLfVMOc39UTk,9370
57
57
  evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
58
58
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
59
59
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
60
60
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
61
61
  evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
62
- evalscope/benchmarks/benchmark.py,sha256=X-vBzz5PDVI5rBbqWpiUZq0bmGhp9cRZiA27XCgxPdE,2573
63
- evalscope/benchmarks/data_adapter.py,sha256=Z2s4mfJssxNAeFPVNgZLkBbc3DBbJRZNGbRBigLe4I4,22893
62
+ evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
63
+ evalscope/benchmarks/data_adapter.py,sha256=NgaKHfm288hVGeG1l_xGbLvB-Gno4M7Xd5Pa2ozY17Q,22975
64
64
  evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
65
65
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
66
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
67
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
68
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=WV9w3z8TxWNzVzn9A_g0xqeHh76ydnHL5xLwyg63VmU,2992
68
+ evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=cmkny4nIWofHJdQCvu_7wR-2NZVTaJo2l98zZlgGSAM,3081
69
69
  evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
70
70
  evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
71
71
  evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
72
72
  evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
73
73
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- evalscope/benchmarks/aime/aime24_adapter.py,sha256=GrIxCHpUwgUy8tXGTB7iQOt8k7wG8MJB0CWbwBmIy-8,1703
75
- evalscope/benchmarks/aime/aime25_adapter.py,sha256=yxo5roCb8ryX9ROUU2FdZ-WBTUPZ14MrBzEL0zPOh-U,1718
74
+ evalscope/benchmarks/aime/aime24_adapter.py,sha256=hVoQMXpp_DSoZuJzCQLbAAUR8p4h9_1WcFUxelGUJBA,2036
75
+ evalscope/benchmarks/aime/aime25_adapter.py,sha256=TJ2pivciL8LhffGP6lZPMBqaaTzuaCN_00Bz51E7QFI,2037
76
76
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=oUHpWrt5Gx0jF80RBd7zTh_1AWI66YvDd6U1vOMoqj0,3828
77
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=AwrtuC_6o2Wa1zGnZ080OCuWv8S-hwvGHJqZ7KPQwoI,4328
78
78
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
79
79
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
80
- evalscope/benchmarks/arc/arc_adapter.py,sha256=0h-eT4BBmUJQrakKMPUNE1nSRwK6LHB-cflWpWzY978,6364
80
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=BG_VeTyN88oXu7qquhva2ou1I3-RePzXLxQCsY_ne2M,6682
81
81
  evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=S3FQ_UD3GC8M7FU-PPeuJm5YVrG5qhnVE5T1jRpPuxo,6131
82
+ evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=FBwkxfnbyXgTiFmwKA5mjIOb_eOuUnXrijM4rrBHZE4,6672
83
83
  evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
84
84
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
85
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
85
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=IFu9XctrLNJcIFXK4jV3LmyqQCVb66z8YhL07Osc1TA,8623
86
86
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
87
87
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
88
88
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -110,104 +110,106 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
110
110
  evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
111
111
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
112
112
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
113
+ evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=MQPlfMvTQYHA4EP5g7eNzXDs4A4QvgYOiGC458Z39q4,10080
113
115
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
114
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=1ITBXI0f01Dt1p7sb2RGswIeg9685Bkk2S2xmA1vat8,11295
116
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=jZNOtaTwiyXAA6wQ8udXKyOo-f2mKOPjE6q7mrKCPXQ,11639
115
117
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
116
118
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
117
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=Q6ncuLrCUrrhhljIfMsgWnyhHfcWWwh8iA6NZvz3W28,8079
119
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=kaZ8fZK2a9oVwpGRUA3wz3FkxtcTY_FkRDYrdLjDNro,8433
118
120
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
119
121
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
120
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=r9zael_Y2Jso0ashevYpF8e5SHOBh8iMcPIJU5WT3pQ,10367
122
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=QdeXKS8TdEua8lWWjoNOLvSB2fN3AKa7pKV0xjwmwME,10596
121
123
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
122
124
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
123
125
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
124
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=wgejW-_QswtT8_3JKAQ_H6svH8IotDJDBEH7X4nP4bY,6760
126
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=yubOKBm8IqskyuEYkbUDxdkUCmVJE1-yB5SxxMWyHjA,7004
125
127
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
126
- evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=QgLgIrjD3q53T-lu1UWTV6T4h1cKGoCQDh0O4QxFezw,2569
128
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=z_wbrA4yJoMwfg4TJkvEZB2aV5cPFcxCZ3JIj49F4Do,2604
127
129
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
- evalscope/benchmarks/docmath/docmath_adapter.py,sha256=GAoHuFASKyWCVbB0nmImsEB-YCREwB75WjdqYB0CcyU,2912
130
+ evalscope/benchmarks/docmath/docmath_adapter.py,sha256=LQ_beSN5RrvNqIQa5BYgwasLRrpUvM08R6BNOhIh6zA,2967
129
131
  evalscope/benchmarks/docmath/utils.py,sha256=ptd-Sot4QtUmUG4dMlqXtUWHKZplo5jSTolsypqX9Ho,7716
130
132
  evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
- evalscope/benchmarks/drop/drop_adapter.py,sha256=V-Vx6g2_1kcDUDWOKVX1vPSLt5iHn8NQkpWbsIwPaa4,8325
133
+ evalscope/benchmarks/drop/drop_adapter.py,sha256=ltt-9w6n_92crepfyb9yLBr5QzzHCWj0y1i5fYw1oF4,8645
132
134
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
133
135
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
134
- evalscope/benchmarks/frames/frames_adapter.py,sha256=wbug6yDlq6N5SfCQaOn43K8klJjrZc9iigFEPQs5nKA,3096
136
+ evalscope/benchmarks/frames/frames_adapter.py,sha256=xYvxGzqj_YPDSZYogP9TxUhOxvZFbud1S2SOvz1nlDU,3136
135
137
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
136
138
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=fqbt61owPP7t2H4B2zbYVZTs0VBGuXNvWGvkukwhRYc,5039
139
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=r2qLKe8esRe45t2CoYzDiZXlq0zO6jVR-iiqLvdmn7Y,5160
138
140
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
139
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=40mZovspVf-OXcuEu3ei6G_HZlYA8whAHSESHPPONxA,4750
141
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=NFeV3rPSfv7_imlEnCI3oSi7aSJGGX2JDqzgvyLVOFw,4861
140
142
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
143
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
142
- evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
144
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
143
145
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
144
146
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
145
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
147
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYfckv8x15_OXIwTI,10752
146
148
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
147
149
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
148
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=SRM_-AKlWtKXi4zrlBAH9YceFnrktZDNsjvQOiPizUM,5893
150
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=l4bHGYaU66ga9J09_QTrrqM9zrzA7mpwQ9Ul7Uy47ig,6176
149
151
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
150
152
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
151
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
153
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=O6muXpiBrQ9RGSglnl3gS0yO6BSkQtXASMR9yXUfhEE,5515
152
154
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
155
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=C7Zww11RGbPzlB7dy-mef-2uHOVXFTdLc5W48_PM5xM,2172
154
156
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
155
157
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
156
158
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
157
159
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
158
160
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
159
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
161
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=ZwUWpVe5gkEC3l5wTo-XdePHiDjQbHDhX2W0WTS5mC4,2715
160
162
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
161
163
  evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=iqmVUMZmyRhzOOXXQ-NN9P1nGvvbzTjOSEp6djbN_rw,6503
162
164
  evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
163
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=AkvlQ-3oS8Tr3xZgx3omMt5w8jia6yH07D5Bq27Q5wc,3490
165
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=IHqEjfK_2O5Tk1kvWJCOcnEGIVW8Ujes6aLVm5YnkEg,3789
164
166
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
165
167
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
166
168
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
167
169
  evalscope/benchmarks/live_code_bench/testing_util.py,sha256=abjlwp6HDayf88mMI_daOKm06nEOeNBaMkmGWqk2DJo,17286
168
170
  evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=RVbsiglxmEW37-tDYgr4Drywh26I94DRGhwv7uP2aYk,2829
171
+ evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=WXpieeLsr_BRd48fSHswdKvO2uUGYNDNfB4FyReDW9o,3134
170
172
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=opT73il3CbM1zZhuqRHZu_4O4WEZCZPvZe06I4U8YGM,1911
173
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=qrfqXrSSBJ0JzkhMg_6_gZtK6eWyMtgr_WiFqtssQ9c,2290
172
174
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
173
175
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
174
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=__BrO2f7_AZ87a00HCRGPm5ZK8B4JTZKzRBRQY3yf3Q,11635
176
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=NlodlICpGVz9_MjRn-FfCMGIfmEPBBXgMtczcxuvRlc,12090
175
177
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
176
178
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
177
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=hPqxDqDhqin3TxfimfhIxfEc_8UfzTDGAfX7iDrWy28,4248
179
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=zAW3lvWXkGqYsPbVfMj5tc5EuDXLCGLFNPT8sLcKuO0,4539
178
180
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
181
+ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=8FRC1lQX-Pv5Tji1Lsp5Mr456JvtGT1lU9c3hVO25l4,9871
180
182
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
181
- evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
183
+ evalscope/benchmarks/musr/musr_adapter.py,sha256=lh0UrE3yqWzmOw_ALkxJJ9AbBn11HlQMYHO39P1HAnE,2676
182
184
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=rNi7ULskhhHh1eVN1eV15gyLVFE05uertlZlCzMzgOE,15355
184
- evalscope/benchmarks/needle_haystack/utils.py,sha256=bDwtpMS7Eqr63urCttS9i3BvT_aPuNvrQU-vEc6tcx0,2911
185
+ evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=AybH_Ka2B2WCh-EvwAsMPlCGzJ78dHBhe5sJ6nDgNK4,15691
186
+ evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
185
187
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
188
  evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
187
- evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
189
+ evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ULuXG68ifTEc_ucH_cj0p5AGdbL-ahA7kcJ-AzYVmSM,3767
188
190
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
189
191
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
190
- evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
192
+ evalscope/benchmarks/race/race_adapter.py,sha256=JjIGGthWbktrsBL68rE-hvVY9ZOwKrrZzJoIdBdNoWg,6614
191
193
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
192
194
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
193
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=TD7hkMLGZ4GK7wD7cwqJ3jCcTAaixOakUy3o5DaPYHI,8997
195
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=gQzrmslukHOJf-VBSnVKYddIg34EEOvQuGYTurQgBy0,9289
194
196
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
195
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
196
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
197
+ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10VpLSYWHqle6Jg7JQO1Sze1E,4728
198
+ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
197
199
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
198
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
200
+ evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
199
201
  evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
200
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=c8_Cok_wctlBtWd7kDQY9McaFbkWsW9LTC5JzPpef-Q,2399
202
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=_QNncuCCMhhjsWzB934sYF-k010fKUdhhAOWrJ9LKDA,2813
201
203
  evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
202
204
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
203
205
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
204
206
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
205
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=7tMc8vVZdBnks5jWrBSrb5BSyjO2eD4On6gX8xqlkV8,4961
207
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=xvgt3SQQ0g5qT_RkZ1YOoYPxDS_CZrBJbDIKQjF-xEo,5328
206
208
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
207
209
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
208
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=ueUU860kg5_xf_MtUCa6ck-fGHX3ttw8Xh3mWSJyOZA,12617
210
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=xY4Kr-GzyyE_TWGlaKL5mo9qTaza0frWLy7EgIwlZn4,12958
209
211
  evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
210
- evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=UdANz3YmCtV2YfGuEihTe3vpUTlIxeXBhIqGkKbTFdU,1956
212
+ evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=WSJv4TDLISUy66e_PZEfjrIwsQOhgPXqeyA30nBwetM,2194
211
213
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
212
214
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
213
215
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -216,11 +218,11 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
216
218
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
217
219
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
218
220
  evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCeR2Io,853
219
- evalscope/collections/evaluator.py,sha256=NnLel9lOyR0wzOwxDGSCFWJN4zFx9ZA2hc0PI-FSvl0,16200
221
+ evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
220
222
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
221
223
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
222
224
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
223
- evalscope/evaluator/evaluator.py,sha256=d8cFq08oJ6kbKcwr4mVh517OxndgyqUrmuEP-bwmR6g,22071
225
+ evalscope/evaluator/evaluator.py,sha256=pQ85iNgnA9ME2b7UNH33uybcStjSQffJTh55ZFqwCNk,22115
224
226
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
225
227
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
226
228
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
@@ -232,7 +234,7 @@ evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uC
232
234
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
233
235
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
234
236
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
235
- evalscope/metrics/t2v_metrics/__init__.py,sha256=GBxgKTPVy_qhW_F3M4Oi6QMWhdAi4PqGX5w3t6Tueho,1783
237
+ evalscope/metrics/t2v_metrics/__init__.py,sha256=IwI3umI5wBwMJ7zlvU-l3aw8KmiQ72DgaoJXnwlWHiE,1202
236
238
  evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
237
239
  evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
238
240
  evalscope/metrics/t2v_metrics/itmscore.py,sha256=cIaz_urio_Of1FiA2DZW7pWRIvo487zr33-x8C3Wx0o,443
@@ -334,22 +336,23 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_proce
334
336
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
335
337
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
336
338
  evalscope/models/__init__.py,sha256=yB4NuKvSd3Jd4GRQvJeGPxwigd8RJErdop5PzSQhsMY,1565
337
- evalscope/models/local_model.py,sha256=1yjwt7NHE7pI8xoGv38NTql9KcCd80x1mjlELqkNHBQ,4110
339
+ evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
338
340
  evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,6308
339
- evalscope/models/register.py,sha256=pNC69YUvw-lodYpOXmByHm26h4m0Lofgd_om-JhOBq4,1882
340
- evalscope/models/adapters/__init__.py,sha256=mduiDZ6LgmkefNf4CtObZk6heOB93HxxgqTuYvrqWoo,590
341
- evalscope/models/adapters/base_adapter.py,sha256=f2FY8DLERudkfb4_anxNVFE_D19xCJj9BObiHWspewI,3268
341
+ evalscope/models/register.py,sha256=WiylzfL-vb6Bl3H3_RdIaBabVOAc9tiuhsQzYJDVzTg,1948
342
+ evalscope/models/adapters/__init__.py,sha256=zmldx8yC_KTI8NDRcxNLyPzv19wc57UvOVvzwyuYnG4,647
343
+ evalscope/models/adapters/base_adapter.py,sha256=z98FiFCZwNSmQElkB7ONwswvUQZxqrCikngZDg0Nn5w,3311
344
+ evalscope/models/adapters/bfcl_adapter.py,sha256=KtreuJ21X1lcUGGhVgW3U62p3P65_oydMdBPtE5um-I,10332
342
345
  evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
343
346
  evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
344
347
  evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
345
- evalscope/models/adapters/server_adapter.py,sha256=qdonCJLoM0qmFQtHziczUqVzA31p4AxIn2j9oNIosLw,6493
348
+ evalscope/models/adapters/server_adapter.py,sha256=tS-SurglnYYuAyXikR-550pE48KUVGpNoeZ8G_y47yA,9602
346
349
  evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
347
350
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
348
- evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
349
- evalscope/models/custom/dummy_model.py,sha256=WRT_aCBZLXnC4yRCgggkuySkhM71C47O2Txx_YNc3UM,1933
351
+ evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
352
+ evalscope/models/custom/dummy_model.py,sha256=aZg_OZ6yFNg2macxS5iCymIdFHODdQGH4OOwMXQe4SM,3113
350
353
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
351
- evalscope/perf/arguments.py,sha256=HUKzcU-FBt34DgGJ0nc5rNgJAMpZwYQXMz8VU8jokco,10668
352
- evalscope/perf/benchmark.py,sha256=qEgIX_Z4x3FNtAKTMlP2mRJTerRV5seCbVtB4XklnQI,7566
354
+ evalscope/perf/arguments.py,sha256=uBKqT_s5aG3a295MxE2VIzs9_8XXxhenN2TdZbsYXEA,10865
355
+ evalscope/perf/benchmark.py,sha256=cjUpJ3SRnZVBs_H24yqLh4WG_hcCADrniLG1VsmByb8,7901
353
356
  evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
354
357
  evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
355
358
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
@@ -358,7 +361,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
358
361
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
359
362
  evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
360
363
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
361
- evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
364
+ evalscope/perf/plugin/api/openai_api.py,sha256=PmjBfIzzSuzcKiVOUeA2aPxihV0dZEzFlgmbrD2isME,7773
362
365
  evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
363
366
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
364
367
  evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
@@ -398,9 +401,9 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
398
401
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
399
402
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
400
403
  evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
401
- evalscope/report/combinator.py,sha256=xGX0B6tGZxaEB20tziPQm3HUkvgftghKg5AEQ8JpsBE,2842
404
+ evalscope/report/combinator.py,sha256=4ahUtTFPTNiSjamldX3IcLf33yKTJKs6ZsC4fsCafe8,4192
402
405
  evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
403
- evalscope/report/utils.py,sha256=KAc4Cq8NMxTUjCJHI5MK3ZqzBNjfDMXrwLBpUkaywjk,6520
406
+ evalscope/report/utils.py,sha256=A8_bo-97UKA7Ys5slZ4TydCno9p7-Y3rxLpOd8gmAjM,7685
404
407
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
405
408
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
406
409
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -443,23 +446,23 @@ evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9fig
443
446
  evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
444
447
  evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
445
448
  evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
446
- evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
449
+ evalscope/utils/io_utils.py,sha256=atRCynX9dFcZGxCDip8HRpdzVkkTXCK6y4HzfiOEFU8,5615
447
450
  evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
448
451
  evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
449
452
  evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
450
453
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
451
454
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
452
455
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
453
- tests/aigc/test_t2i.py,sha256=YjEAwlM8cBfGCGOguz86UebJjJ5bsc3jhs4SQqyxwZs,3844
456
+ tests/aigc/test_t2i.py,sha256=Dqug3rV7EIkj6uwBjgj5UMj8ZrpGSznSHfn2g8J_P3M,3860
454
457
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
455
- tests/cli/test_all.py,sha256=noGE54iWnmoPGTsN2PGh7_jM5ceehN6bMnp6xxq4s3A,4240
456
- tests/cli/test_collection.py,sha256=H7enYWGTmp2VRio-WTEfPRdkf3y-T4fs43Kqf81mbrQ,4181
457
- tests/cli/test_run.py,sha256=OER_I6FeJAMUA2IN0zKUdUIeRDr8mJFaOiEpwQjYbnE,18166
458
+ tests/cli/test_all.py,sha256=yo1ysDM90dI_kWxKKPOf-BsYneeRYRJa5uh6_7SDZ3Y,4332
459
+ tests/cli/test_collection.py,sha256=jIGQNQO4msJE9w4Ms5qxtuhkHVukeLcHvBF2dzHCKCI,4207
460
+ tests/cli/test_run.py,sha256=RoS9Qtlwsm0sGJdeCWZbBrVDfkZV3iKOB9UtkeM1KWs,18651
458
461
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
459
462
  tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
460
463
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
461
- tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
462
- tests/rag/test_mteb.py,sha256=PaWS5GrZdMO680M129QP2EG000rVq7f2iP3n0YDAv-w,5611
464
+ tests/rag/test_clip_benchmark.py,sha256=uykLrRCfNR8aOiLJI0GdSL4mOys3q0LFHsA_Ur7xudc,2658
465
+ tests/rag/test_mteb.py,sha256=38cDYpqf0ozvrWf36I7z_O_DmAUCbF9LX06us65xNXk,7209
463
466
  tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
464
467
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
465
468
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
@@ -467,9 +470,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
467
470
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
468
471
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
469
472
  tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
470
- evalscope-0.16.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
471
- evalscope-0.16.1.dist-info/METADATA,sha256=H8eaMzt6o5k2wFIKnwBdTCPXnAexGvM-0PQqc16iKI4,36244
472
- evalscope-0.16.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
473
- evalscope-0.16.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
474
- evalscope-0.16.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
475
- evalscope-0.16.1.dist-info/RECORD,,
473
+ evalscope-0.16.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
474
+ evalscope-0.16.2.dist-info/METADATA,sha256=e60FJsG6ufvawkoGbh8146wtVCE6AA0mb9cnhIDdaSE,36533
475
+ evalscope-0.16.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
476
+ evalscope-0.16.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
477
+ evalscope-0.16.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
478
+ evalscope-0.16.2.dist-info/RECORD,,
tests/aigc/test_t2i.py CHANGED
@@ -28,15 +28,15 @@ class TestRun(unittest.TestCase):
28
28
  dataset_args={
29
29
  'general_t2i': {
30
30
  'metric_list': [
31
- 'PickScore',
31
+ # 'PickScore',
32
32
  'CLIPScore',
33
- 'HPSv2Score',
34
- 'HPSv2.1Score',
35
- 'BLIPv2Score',
36
- 'ImageRewardScore',
37
- 'VQAScore',
38
- 'FGA_BLIP2Score',
39
- 'MPS'
33
+ # 'HPSv2Score',
34
+ # 'HPSv2.1Score',
35
+ # 'BLIPv2Score',
36
+ # 'ImageRewardScore',
37
+ # 'VQAScore',
38
+ # 'FGA_BLIP2Score',
39
+ # 'MPS'
40
40
  ],
41
41
  'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
42
42
  }
tests/cli/test_all.py CHANGED
@@ -17,41 +17,42 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
17
17
  logger = get_logger()
18
18
 
19
19
  datasets=[
20
- # 'iquiz',
21
- # 'ifeval',
22
- # 'mmlu',
23
- # 'mmlu_pro',
24
- # 'musr',
25
- # 'process_bench',
26
- # 'race',
27
- # 'trivia_qa',
28
- # 'cmmlu',
29
- # 'humaneval',
30
- # 'gsm8k',
31
- # 'bbh',
32
- # 'competition_math',
33
- # 'math_500',
34
- # 'aime24',
35
- # 'gpqa',
36
- # 'arc',
37
- # 'ceval',
38
- # 'hellaswag',
39
- # 'general_mcq',
40
- # 'general_qa',
41
- # 'super_gpqa',
42
- # 'live_code_bench',
43
- # 'mmlu_redux',
44
- # 'simple_qa',
45
- # 'chinese_simpleqa',
46
- # 'alpaca_eval',
47
- # 'arena_hard',
48
- # 'maritime_bench',
49
- # 'drop',
50
- # 'winogrande',
51
- # 'tool_bench',
20
+ 'iquiz',
21
+ 'ifeval',
22
+ 'mmlu',
23
+ 'mmlu_pro',
24
+ 'musr',
25
+ 'process_bench',
26
+ 'race',
27
+ 'trivia_qa',
28
+ 'cmmlu',
29
+ 'humaneval',
30
+ 'gsm8k',
31
+ 'bbh',
32
+ 'competition_math',
33
+ 'math_500',
34
+ 'aime24',
35
+ 'gpqa',
36
+ 'arc',
37
+ 'ceval',
38
+ 'hellaswag',
39
+ 'general_mcq',
40
+ 'general_qa',
41
+ 'super_gpqa',
42
+ 'live_code_bench',
43
+ 'mmlu_redux',
44
+ 'simple_qa',
45
+ 'chinese_simpleqa',
46
+ 'alpaca_eval',
47
+ 'arena_hard',
48
+ 'maritime_bench',
49
+ 'drop',
50
+ 'winogrande',
51
+ 'tool_bench',
52
52
  'frames',
53
53
  'docmath',
54
- 'needle_haystack'
54
+ 'needle_haystack',
55
+ 'bfcl_v3',
55
56
  ]
56
57
 
57
58
  dataset_args={
@@ -126,6 +127,12 @@ dataset_args={
126
127
  'mmlu_redux':{
127
128
  'subset_list': ['abstract_algebra']
128
129
  },
130
+ 'frames':{
131
+ 'subset_list': ['simpshort_testmini']
132
+ },
133
+ 'bfcl_v3':{
134
+ 'subset_list': ['simple', 'multiple']
135
+ }
129
136
  }
130
137
 
131
138
  class TestRun(unittest.TestCase):
@@ -75,11 +75,12 @@ class TestCollection(unittest.TestCase):
75
75
  limit=5,
76
76
  judge_strategy=JudgeStrategy.AUTO,
77
77
  judge_model_args={
78
- # 'model_id': 'qwen2.5-72b-instruct',
79
- # 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
- # 'api_key': os.getenv('DASHSCOPE_API_KEY'),
78
+ 'model_id': 'qwen2.5-72b-instruct',
79
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
+ 'api_key': os.getenv('DASHSCOPE_API_KEY'),
81
81
  },
82
82
  analysis_report=True,
83
+ ignore_errors=True,
83
84
  # use_cache='outputs/20250522_204520'
84
85
  )
85
86
  res = run_task(task_cfg=task_cfg)