evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
2
|
evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
|
|
3
3
|
evalscope/config.py,sha256=HGvIlhjVjA9QtAiNEUrx_hev3wa-RaNEXelEiLJn9OM,11015
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
4
|
+
evalscope/constants.py,sha256=1CYghe0fGccyiVgzMIHd2HIb6lOo9fmB-8pH_l99iI4,4014
|
|
5
|
+
evalscope/run.py,sha256=ss7ECL4dq18ur9qFOWqCNIsckXQWWl1EsVaJxDPBVq8,7000
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
|
-
evalscope/summarizer.py,sha256=
|
|
8
|
-
evalscope/version.py,sha256=
|
|
7
|
+
evalscope/summarizer.py,sha256=nZOaXfaSaXht8GAVik_Pvz2YL0Gv24UG45mMklyBkvA,5938
|
|
8
|
+
evalscope/version.py,sha256=VHNGbQIK9g2FDZyk0Yk7RSDY_XsEEtvEBuN8kjAA8PM,119
|
|
9
9
|
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
10
|
-
evalscope/app/app.py,sha256=
|
|
10
|
+
evalscope/app/app.py,sha256=QyO0RFfkLeOVzx-Mr8br3bYPwii2O_eVGmNgwCGHkac,29863
|
|
11
11
|
evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
12
12
|
evalscope/app/constants.py,sha256=KpItEl9lF0VldOm0grjS7RVbbseemtsXZJKtgGmAQB8,361
|
|
13
13
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -22,7 +22,7 @@ evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLF
|
|
|
22
22
|
evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
|
|
23
23
|
evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
|
|
24
24
|
evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
|
|
25
|
-
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=
|
|
25
|
+
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=2OdPj4gSUWdAGCfS9PHpPGbd6q5RqEyli2G6UGb1ffw,8888
|
|
26
26
|
evalscope/backend/rag_eval/clip_benchmark/task_template.py,sha256=2NQRvlYY2SOzvOOj9WRLyxvRlyj8CAcgbQqgsv-Xjgw,3929
|
|
27
27
|
evalscope/backend/rag_eval/clip_benchmark/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
28
|
evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py,sha256=CQnWZZTQ0FOzDtmGv7OF0W4Cv4g6u4_LQ93koDu1pes,2556
|
|
@@ -53,36 +53,36 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8m
|
|
|
53
53
|
evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
|
|
54
54
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
55
55
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
56
|
-
evalscope/backend/rag_eval/utils/embedding.py,sha256=
|
|
56
|
+
evalscope/backend/rag_eval/utils/embedding.py,sha256=64DQrGzB2sw_Y0twwlSmOYobpOfgmRBFLfVMOc39UTk,9370
|
|
57
57
|
evalscope/backend/rag_eval/utils/llm.py,sha256=NHjm0SeQVsSIG8uISXZcQypku4QRc3KtteeO9ldv0FI,2611
|
|
58
58
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
59
59
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
60
60
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
|
|
61
61
|
evalscope/benchmarks/__init__.py,sha256=5AXNhhmbaBFEe3u7y5TtIrviYzFI-hC8oKqxFILs1pE,937
|
|
62
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
63
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
62
|
+
evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
|
|
63
|
+
evalscope/benchmarks/data_adapter.py,sha256=NgaKHfm288hVGeG1l_xGbLvB-Gno4M7Xd5Pa2ozY17Q,22975
|
|
64
64
|
evalscope/benchmarks/utils.py,sha256=81MwUJYWjJgoiRClY-IFB-EZN0th-oQDTvU2ekaEmpc,1869
|
|
65
65
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
66
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
67
67
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
68
|
-
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=
|
|
68
|
+
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=cmkny4nIWofHJdQCvu_7wR-2NZVTaJo2l98zZlgGSAM,3081
|
|
69
69
|
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=baDGFRpVcSKpc1CdzNAMBtjeCZDUpyEc5l1KyrPNoEU,1892
|
|
70
70
|
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=t9h5qlo4KrHOgXIhHo3z6fEAi0HfUqDZvaItQdS7dZ4,2097
|
|
71
71
|
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=U0RKN3apyD3YyZfIvqgO8TNuDO-zctlftHsSfBRyQxU,1825
|
|
72
72
|
evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=vOOiOe26H2dk9VN2WbB_Oi3lzavMIaYDBq6sqeSIiAU,1093
|
|
73
73
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
74
|
-
evalscope/benchmarks/aime/aime24_adapter.py,sha256=
|
|
75
|
-
evalscope/benchmarks/aime/aime25_adapter.py,sha256=
|
|
74
|
+
evalscope/benchmarks/aime/aime24_adapter.py,sha256=hVoQMXpp_DSoZuJzCQLbAAUR8p4h9_1WcFUxelGUJBA,2036
|
|
75
|
+
evalscope/benchmarks/aime/aime25_adapter.py,sha256=TJ2pivciL8LhffGP6lZPMBqaaTzuaCN_00Bz51E7QFI,2037
|
|
76
76
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
|
-
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=
|
|
77
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=AwrtuC_6o2Wa1zGnZ080OCuWv8S-hwvGHJqZ7KPQwoI,4328
|
|
78
78
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
79
79
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
80
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
80
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=BG_VeTyN88oXu7qquhva2ou1I3-RePzXLxQCsY_ne2M,6682
|
|
81
81
|
evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
|
-
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=
|
|
82
|
+
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=FBwkxfnbyXgTiFmwKA5mjIOb_eOuUnXrijM4rrBHZE4,6672
|
|
83
83
|
evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
|
|
84
84
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
85
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
85
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=IFu9XctrLNJcIFXK4jV3LmyqQCVb66z8YhL07Osc1TA,8623
|
|
86
86
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
87
87
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
88
88
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -110,104 +110,106 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt
|
|
|
110
110
|
evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt,sha256=Su_-fICm9LxGpAkQlRbUZKvet_wPqTK-5jQo_VqJxQI,2604
|
|
111
111
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
112
112
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
113
|
+
evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
|
+
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=MQPlfMvTQYHA4EP5g7eNzXDs4A4QvgYOiGC458Z39q4,10080
|
|
113
115
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
114
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
116
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=jZNOtaTwiyXAA6wQ8udXKyOo-f2mKOPjE6q7mrKCPXQ,11639
|
|
115
117
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
116
118
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
117
|
-
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
|
|
119
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=kaZ8fZK2a9oVwpGRUA3wz3FkxtcTY_FkRDYrdLjDNro,8433
|
|
118
120
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
119
121
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
120
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
122
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=QdeXKS8TdEua8lWWjoNOLvSB2fN3AKa7pKV0xjwmwME,10596
|
|
121
123
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
122
124
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
123
125
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
124
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=yubOKBm8IqskyuEYkbUDxdkUCmVJE1-yB5SxxMWyHjA,7004
|
|
125
127
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
126
|
-
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
|
|
128
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=z_wbrA4yJoMwfg4TJkvEZB2aV5cPFcxCZ3JIj49F4Do,2604
|
|
127
129
|
evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
-
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=
|
|
130
|
+
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=LQ_beSN5RrvNqIQa5BYgwasLRrpUvM08R6BNOhIh6zA,2967
|
|
129
131
|
evalscope/benchmarks/docmath/utils.py,sha256=ptd-Sot4QtUmUG4dMlqXtUWHKZplo5jSTolsypqX9Ho,7716
|
|
130
132
|
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
-
evalscope/benchmarks/drop/drop_adapter.py,sha256=
|
|
133
|
+
evalscope/benchmarks/drop/drop_adapter.py,sha256=ltt-9w6n_92crepfyb9yLBr5QzzHCWj0y1i5fYw1oF4,8645
|
|
132
134
|
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
133
135
|
evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
|
-
evalscope/benchmarks/frames/frames_adapter.py,sha256=
|
|
136
|
+
evalscope/benchmarks/frames/frames_adapter.py,sha256=xYvxGzqj_YPDSZYogP9TxUhOxvZFbud1S2SOvz1nlDU,3136
|
|
135
137
|
evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
|
|
136
138
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
137
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
139
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=r2qLKe8esRe45t2CoYzDiZXlq0zO6jVR-iiqLvdmn7Y,5160
|
|
138
140
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
139
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
141
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=NFeV3rPSfv7_imlEnCI3oSi7aSJGGX2JDqzgvyLVOFw,4861
|
|
140
142
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
141
143
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
142
|
-
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
|
|
144
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
|
|
143
145
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
144
146
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
145
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
147
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYfckv8x15_OXIwTI,10752
|
|
146
148
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
147
149
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
148
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
150
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=l4bHGYaU66ga9J09_QTrrqM9zrzA7mpwQ9Ul7Uy47ig,6176
|
|
149
151
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
150
152
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
151
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
153
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=O6muXpiBrQ9RGSglnl3gS0yO6BSkQtXASMR9yXUfhEE,5515
|
|
152
154
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
155
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=C7Zww11RGbPzlB7dy-mef-2uHOVXFTdLc5W48_PM5xM,2172
|
|
154
156
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
155
157
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
156
158
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
157
159
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
158
160
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
159
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
161
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=ZwUWpVe5gkEC3l5wTo-XdePHiDjQbHDhX2W0WTS5mC4,2715
|
|
160
162
|
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
161
163
|
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=iqmVUMZmyRhzOOXXQ-NN9P1nGvvbzTjOSEp6djbN_rw,6503
|
|
162
164
|
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
163
|
-
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=
|
|
165
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=IHqEjfK_2O5Tk1kvWJCOcnEGIVW8Ujes6aLVm5YnkEg,3789
|
|
164
166
|
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
165
167
|
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
166
168
|
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
167
169
|
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=abjlwp6HDayf88mMI_daOKm06nEOeNBaMkmGWqk2DJo,17286
|
|
168
170
|
evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
169
|
-
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=
|
|
171
|
+
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=WXpieeLsr_BRd48fSHswdKvO2uUGYNDNfB4FyReDW9o,3134
|
|
170
172
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
171
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
173
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=qrfqXrSSBJ0JzkhMg_6_gZtK6eWyMtgr_WiFqtssQ9c,2290
|
|
172
174
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
173
175
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
174
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
176
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=NlodlICpGVz9_MjRn-FfCMGIfmEPBBXgMtczcxuvRlc,12090
|
|
175
177
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
176
178
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
177
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
179
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=zAW3lvWXkGqYsPbVfMj5tc5EuDXLCGLFNPT8sLcKuO0,4539
|
|
178
180
|
evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
179
|
-
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=
|
|
181
|
+
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=8FRC1lQX-Pv5Tji1Lsp5Mr456JvtGT1lU9c3hVO25l4,9871
|
|
180
182
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
181
|
-
evalscope/benchmarks/musr/musr_adapter.py,sha256=
|
|
183
|
+
evalscope/benchmarks/musr/musr_adapter.py,sha256=lh0UrE3yqWzmOw_ALkxJJ9AbBn11HlQMYHO39P1HAnE,2676
|
|
182
184
|
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
183
|
-
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=
|
|
184
|
-
evalscope/benchmarks/needle_haystack/utils.py,sha256=
|
|
185
|
+
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=AybH_Ka2B2WCh-EvwAsMPlCGzJ78dHBhe5sJ6nDgNK4,15691
|
|
186
|
+
evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
|
|
185
187
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
186
188
|
evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
|
|
187
|
-
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=
|
|
189
|
+
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ULuXG68ifTEc_ucH_cj0p5AGdbL-ahA7kcJ-AzYVmSM,3767
|
|
188
190
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
189
191
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
190
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
192
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=JjIGGthWbktrsBL68rE-hvVY9ZOwKrrZzJoIdBdNoWg,6614
|
|
191
193
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
192
194
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
193
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
195
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=gQzrmslukHOJf-VBSnVKYddIg34EEOvQuGYTurQgBy0,9289
|
|
194
196
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
195
|
-
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=
|
|
196
|
-
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=
|
|
197
|
+
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10VpLSYWHqle6Jg7JQO1Sze1E,4728
|
|
198
|
+
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
|
|
197
199
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
198
|
-
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=
|
|
200
|
+
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
|
|
199
201
|
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
200
|
-
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=
|
|
202
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=_QNncuCCMhhjsWzB934sYF-k010fKUdhhAOWrJ9LKDA,2813
|
|
201
203
|
evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
|
|
202
204
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
203
205
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
204
206
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
205
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
207
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=xvgt3SQQ0g5qT_RkZ1YOoYPxDS_CZrBJbDIKQjF-xEo,5328
|
|
206
208
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
207
209
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
208
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
210
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=xY4Kr-GzyyE_TWGlaKL5mo9qTaza0frWLy7EgIwlZn4,12958
|
|
209
211
|
evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
210
|
-
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=
|
|
212
|
+
evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=WSJv4TDLISUy66e_PZEfjrIwsQOhgPXqeyA30nBwetM,2194
|
|
211
213
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
212
214
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
213
215
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -216,11 +218,11 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
216
218
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
217
219
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
218
220
|
evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCeR2Io,853
|
|
219
|
-
evalscope/collections/evaluator.py,sha256=
|
|
221
|
+
evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
|
|
220
222
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
221
223
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
222
224
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
223
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
225
|
+
evalscope/evaluator/evaluator.py,sha256=pQ85iNgnA9ME2b7UNH33uybcStjSQffJTh55ZFqwCNk,22115
|
|
224
226
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
225
227
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
226
228
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=5WRYuXFTDgVmolrOdiTysk-mXrpw6Qg87-iuY-VD1W4,16618
|
|
@@ -232,7 +234,7 @@ evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uC
|
|
|
232
234
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
233
235
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
234
236
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=T91PgJfi1As7BR7I-Hq6rLlvHAtMB9JpBw9gMTH8VlE,12114
|
|
235
|
-
evalscope/metrics/t2v_metrics/__init__.py,sha256=
|
|
237
|
+
evalscope/metrics/t2v_metrics/__init__.py,sha256=IwI3umI5wBwMJ7zlvU-l3aw8KmiQ72DgaoJXnwlWHiE,1202
|
|
236
238
|
evalscope/metrics/t2v_metrics/clipscore.py,sha256=IsrYKIlFb04-FfBq4MbSv4diS6706J15Y3G4qEFIwfU,455
|
|
237
239
|
evalscope/metrics/t2v_metrics/constants.py,sha256=oY5l5fOFl8qylah9eeebZm0pgY1PYmHDa7JlUC8Qls0,451
|
|
238
240
|
evalscope/metrics/t2v_metrics/itmscore.py,sha256=cIaz_urio_Of1FiA2DZW7pWRIvo487zr33-x8C3Wx0o,443
|
|
@@ -334,22 +336,23 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_proce
|
|
|
334
336
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
|
|
335
337
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
|
|
336
338
|
evalscope/models/__init__.py,sha256=yB4NuKvSd3Jd4GRQvJeGPxwigd8RJErdop5PzSQhsMY,1565
|
|
337
|
-
evalscope/models/local_model.py,sha256=
|
|
339
|
+
evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
|
|
338
340
|
evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,6308
|
|
339
|
-
evalscope/models/register.py,sha256=
|
|
340
|
-
evalscope/models/adapters/__init__.py,sha256=
|
|
341
|
-
evalscope/models/adapters/base_adapter.py,sha256=
|
|
341
|
+
evalscope/models/register.py,sha256=WiylzfL-vb6Bl3H3_RdIaBabVOAc9tiuhsQzYJDVzTg,1948
|
|
342
|
+
evalscope/models/adapters/__init__.py,sha256=zmldx8yC_KTI8NDRcxNLyPzv19wc57UvOVvzwyuYnG4,647
|
|
343
|
+
evalscope/models/adapters/base_adapter.py,sha256=z98FiFCZwNSmQElkB7ONwswvUQZxqrCikngZDg0Nn5w,3311
|
|
344
|
+
evalscope/models/adapters/bfcl_adapter.py,sha256=KtreuJ21X1lcUGGhVgW3U62p3P65_oydMdBPtE5um-I,10332
|
|
342
345
|
evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
|
|
343
346
|
evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
|
|
344
347
|
evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
|
|
345
|
-
evalscope/models/adapters/server_adapter.py,sha256=
|
|
348
|
+
evalscope/models/adapters/server_adapter.py,sha256=tS-SurglnYYuAyXikR-550pE48KUVGpNoeZ8G_y47yA,9602
|
|
346
349
|
evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
|
|
347
350
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
348
|
-
evalscope/models/custom/custom_model.py,sha256=
|
|
349
|
-
evalscope/models/custom/dummy_model.py,sha256=
|
|
351
|
+
evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
|
|
352
|
+
evalscope/models/custom/dummy_model.py,sha256=aZg_OZ6yFNg2macxS5iCymIdFHODdQGH4OOwMXQe4SM,3113
|
|
350
353
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
351
|
-
evalscope/perf/arguments.py,sha256=
|
|
352
|
-
evalscope/perf/benchmark.py,sha256=
|
|
354
|
+
evalscope/perf/arguments.py,sha256=uBKqT_s5aG3a295MxE2VIzs9_8XXxhenN2TdZbsYXEA,10865
|
|
355
|
+
evalscope/perf/benchmark.py,sha256=cjUpJ3SRnZVBs_H24yqLh4WG_hcCADrniLG1VsmByb8,7901
|
|
353
356
|
evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
|
|
354
357
|
evalscope/perf/main.py,sha256=yfJWGd2l4uU_qKW9bD6DzV0DK9XXuCJGLYjF_JWR22E,3394
|
|
355
358
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
@@ -358,7 +361,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
|
|
|
358
361
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
359
362
|
evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
|
|
360
363
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
361
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
364
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=PmjBfIzzSuzcKiVOUeA2aPxihV0dZEzFlgmbrD2isME,7773
|
|
362
365
|
evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
|
|
363
366
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
364
367
|
evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
|
|
@@ -398,9 +401,9 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
398
401
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
399
402
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
400
403
|
evalscope/report/__init__.py,sha256=mLCgT7G-WPagQHOGz97AOdLQJjyikrswDiXA8d9Wr_Q,923
|
|
401
|
-
evalscope/report/combinator.py,sha256=
|
|
404
|
+
evalscope/report/combinator.py,sha256=4ahUtTFPTNiSjamldX3IcLf33yKTJKs6ZsC4fsCafe8,4192
|
|
402
405
|
evalscope/report/generator.py,sha256=oykmQROG-Bt8ttCH4RtvmGJ39HmDJMTU6gG26lg5LHE,4321
|
|
403
|
-
evalscope/report/utils.py,sha256=
|
|
406
|
+
evalscope/report/utils.py,sha256=A8_bo-97UKA7Ys5slZ4TydCno9p7-Y3rxLpOd8gmAjM,7685
|
|
404
407
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
405
408
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
406
409
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -443,23 +446,23 @@ evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9fig
|
|
|
443
446
|
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
444
447
|
evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
445
448
|
evalscope/utils/import_utils.py,sha256=Oo8saX_mMw4U1RrA7_pn8FmV6P9laru4fEgecqqwpqk,2585
|
|
446
|
-
evalscope/utils/io_utils.py,sha256=
|
|
449
|
+
evalscope/utils/io_utils.py,sha256=atRCynX9dFcZGxCDip8HRpdzVkkTXCK6y4HzfiOEFU8,5615
|
|
447
450
|
evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
|
|
448
451
|
evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
|
|
449
452
|
evalscope/utils/utils.py,sha256=P5gmpINv5UQrwEMrFZKZjdJspsOdGjaBARfRSDVNOd0,11414
|
|
450
453
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
451
454
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
452
455
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
453
|
-
tests/aigc/test_t2i.py,sha256=
|
|
456
|
+
tests/aigc/test_t2i.py,sha256=Dqug3rV7EIkj6uwBjgj5UMj8ZrpGSznSHfn2g8J_P3M,3860
|
|
454
457
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
455
|
-
tests/cli/test_all.py,sha256=
|
|
456
|
-
tests/cli/test_collection.py,sha256=
|
|
457
|
-
tests/cli/test_run.py,sha256=
|
|
458
|
+
tests/cli/test_all.py,sha256=yo1ysDM90dI_kWxKKPOf-BsYneeRYRJa5uh6_7SDZ3Y,4332
|
|
459
|
+
tests/cli/test_collection.py,sha256=jIGQNQO4msJE9w4Ms5qxtuhkHVukeLcHvBF2dzHCKCI,4207
|
|
460
|
+
tests/cli/test_run.py,sha256=RoS9Qtlwsm0sGJdeCWZbBrVDfkZV3iKOB9UtkeM1KWs,18651
|
|
458
461
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
459
462
|
tests/perf/test_perf.py,sha256=VbXsqiqgQY3R3bVKizYQmP04UPluUS26MO6YhTzMs48,4848
|
|
460
463
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
461
|
-
tests/rag/test_clip_benchmark.py,sha256=
|
|
462
|
-
tests/rag/test_mteb.py,sha256=
|
|
464
|
+
tests/rag/test_clip_benchmark.py,sha256=uykLrRCfNR8aOiLJI0GdSL4mOys3q0LFHsA_Ur7xudc,2658
|
|
465
|
+
tests/rag/test_mteb.py,sha256=38cDYpqf0ozvrWf36I7z_O_DmAUCbF9LX06us65xNXk,7209
|
|
463
466
|
tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
|
|
464
467
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
465
468
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
@@ -467,9 +470,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
467
470
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
468
471
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
469
472
|
tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
|
|
470
|
-
evalscope-0.16.
|
|
471
|
-
evalscope-0.16.
|
|
472
|
-
evalscope-0.16.
|
|
473
|
-
evalscope-0.16.
|
|
474
|
-
evalscope-0.16.
|
|
475
|
-
evalscope-0.16.
|
|
473
|
+
evalscope-0.16.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
474
|
+
evalscope-0.16.2.dist-info/METADATA,sha256=e60FJsG6ufvawkoGbh8146wtVCE6AA0mb9cnhIDdaSE,36533
|
|
475
|
+
evalscope-0.16.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
476
|
+
evalscope-0.16.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
477
|
+
evalscope-0.16.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
478
|
+
evalscope-0.16.2.dist-info/RECORD,,
|
tests/aigc/test_t2i.py
CHANGED
|
@@ -28,15 +28,15 @@ class TestRun(unittest.TestCase):
|
|
|
28
28
|
dataset_args={
|
|
29
29
|
'general_t2i': {
|
|
30
30
|
'metric_list': [
|
|
31
|
-
'PickScore',
|
|
31
|
+
# 'PickScore',
|
|
32
32
|
'CLIPScore',
|
|
33
|
-
'HPSv2Score',
|
|
34
|
-
'HPSv2.1Score',
|
|
35
|
-
'BLIPv2Score',
|
|
36
|
-
'ImageRewardScore',
|
|
37
|
-
'VQAScore',
|
|
38
|
-
'FGA_BLIP2Score',
|
|
39
|
-
'MPS'
|
|
33
|
+
# 'HPSv2Score',
|
|
34
|
+
# 'HPSv2.1Score',
|
|
35
|
+
# 'BLIPv2Score',
|
|
36
|
+
# 'ImageRewardScore',
|
|
37
|
+
# 'VQAScore',
|
|
38
|
+
# 'FGA_BLIP2Score',
|
|
39
|
+
# 'MPS'
|
|
40
40
|
],
|
|
41
41
|
'dataset_id': 'custom_eval/multimodal/t2i/example.jsonl',
|
|
42
42
|
}
|
tests/cli/test_all.py
CHANGED
|
@@ -17,41 +17,42 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
|
|
|
17
17
|
logger = get_logger()
|
|
18
18
|
|
|
19
19
|
datasets=[
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
20
|
+
'iquiz',
|
|
21
|
+
'ifeval',
|
|
22
|
+
'mmlu',
|
|
23
|
+
'mmlu_pro',
|
|
24
|
+
'musr',
|
|
25
|
+
'process_bench',
|
|
26
|
+
'race',
|
|
27
|
+
'trivia_qa',
|
|
28
|
+
'cmmlu',
|
|
29
|
+
'humaneval',
|
|
30
|
+
'gsm8k',
|
|
31
|
+
'bbh',
|
|
32
|
+
'competition_math',
|
|
33
|
+
'math_500',
|
|
34
|
+
'aime24',
|
|
35
|
+
'gpqa',
|
|
36
|
+
'arc',
|
|
37
|
+
'ceval',
|
|
38
|
+
'hellaswag',
|
|
39
|
+
'general_mcq',
|
|
40
|
+
'general_qa',
|
|
41
|
+
'super_gpqa',
|
|
42
|
+
'live_code_bench',
|
|
43
|
+
'mmlu_redux',
|
|
44
|
+
'simple_qa',
|
|
45
|
+
'chinese_simpleqa',
|
|
46
|
+
'alpaca_eval',
|
|
47
|
+
'arena_hard',
|
|
48
|
+
'maritime_bench',
|
|
49
|
+
'drop',
|
|
50
|
+
'winogrande',
|
|
51
|
+
'tool_bench',
|
|
52
52
|
'frames',
|
|
53
53
|
'docmath',
|
|
54
|
-
'needle_haystack'
|
|
54
|
+
'needle_haystack',
|
|
55
|
+
'bfcl_v3',
|
|
55
56
|
]
|
|
56
57
|
|
|
57
58
|
dataset_args={
|
|
@@ -126,6 +127,12 @@ dataset_args={
|
|
|
126
127
|
'mmlu_redux':{
|
|
127
128
|
'subset_list': ['abstract_algebra']
|
|
128
129
|
},
|
|
130
|
+
'frames':{
|
|
131
|
+
'subset_list': ['simpshort_testmini']
|
|
132
|
+
},
|
|
133
|
+
'bfcl_v3':{
|
|
134
|
+
'subset_list': ['simple', 'multiple']
|
|
135
|
+
}
|
|
129
136
|
}
|
|
130
137
|
|
|
131
138
|
class TestRun(unittest.TestCase):
|
tests/cli/test_collection.py
CHANGED
|
@@ -75,11 +75,12 @@ class TestCollection(unittest.TestCase):
|
|
|
75
75
|
limit=5,
|
|
76
76
|
judge_strategy=JudgeStrategy.AUTO,
|
|
77
77
|
judge_model_args={
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
78
|
+
'model_id': 'qwen2.5-72b-instruct',
|
|
79
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
|
+
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
81
|
},
|
|
82
82
|
analysis_report=True,
|
|
83
|
+
ignore_errors=True,
|
|
83
84
|
# use_cache='outputs/20250522_204520'
|
|
84
85
|
)
|
|
85
86
|
res = run_task(task_cfg=task_cfg)
|